From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 17 Apr 2013 14:14:35 +0000 (+0400)
Subject: gpuimgproc module for image processing
X-Git-Tag: submit/tizen/20180620.034203~3^2~3902^2~35
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e41aea0acf0d6d40b03a2f38499f135504101752;p=platform%2Fupstream%2Fopencv.git

gpuimgproc module for image processing
---

diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 55fc1007e7..ee66608a2d 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -4,7 +4,7 @@ endif()
 
 set(the_description "GPU-accelerated Computer Vision")
 
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy opencv_gpuarithm opencv_gpufilters OPTIONAL opencv_gpunvidia)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy opencv_gpuarithm opencv_gpufilters opencv_gpuimgproc OPTIONAL opencv_gpunvidia)
 
 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
 
diff --git a/modules/gpu/doc/gpu.rst b/modules/gpu/doc/gpu.rst
index de52ceaba1..6c082ccd17 100644
--- a/modules/gpu/doc/gpu.rst
+++ b/modules/gpu/doc/gpu.rst
@@ -8,7 +8,6 @@ gpu. GPU-accelerated Computer Vision
     introduction
     initalization_and_information
     data_structures
-    image_processing
     object_detection
     feature_detection_and_description
     camera_calibration_and_3d_reconstruction
diff --git a/modules/gpu/doc/image_processing.rst b/modules/gpu/doc/image_processing.rst
deleted file mode 100644
index 69e5003743..0000000000
--- a/modules/gpu/doc/image_processing.rst
+++ /dev/null
@@ -1,1065 +0,0 @@
-Image Processing
-================
-
-.. highlight:: cpp
-
-
-
-gpu::meanShiftFiltering
----------------------------
-Performs mean-shift filtering for each point of the source image.
-
-.. ocv:function:: void gpu::meanShiftFiltering( const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria=TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dst: Destination image containing the color of mapped points. It has the same size and type as  ``src`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-It maps each point of the source image into another point. As a result, you have a new color and new position of each point.
-
-
-
-gpu::meanShiftProc
-----------------------
-Performs a mean-shift procedure and stores information about processed points (their colors and positions) in two images.
-
-.. ocv:function:: void gpu::meanShiftProc( const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria=TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dstr: Destination image containing the color of mapped points. The size and type is the same as  ``src`` .
-
-    :param dstsp: Destination image containing the position of mapped points. The size is the same as  ``src`` size. The type is  ``CV_16SC2`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-.. seealso:: :ocv:func:`gpu::meanShiftFiltering`
-
-
-
-gpu::meanShiftSegmentation
-------------------------------
-Performs a mean-shift segmentation of the source image and eliminates small segments.
-
-.. ocv:function:: void gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dst: Segmented image with the same size and type as  ``src`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param minsize: Minimum segment size. Smaller segments are merged.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-
-
-gpu::integral
------------------
-Computes an integral image.
-
-.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`integral`
-
-
-
-gpu::sqrIntegral
---------------------
-Computes a squared integral image.
-
-.. ocv:function:: void gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::columnSum
-------------------
-Computes a vertical (column) sum.
-
-.. ocv:function:: void gpu::columnSum(const GpuMat& src, GpuMat& sum)
-
-    :param src: Source image. Only  ``CV_32FC1`` images are supported for now.
-
-    :param sum: Destination image of the  ``CV_32FC1`` type.
-
-
-
-gpu::cornerHarris
----------------------
-Computes the Harris cornerness criteria at each image pixel.
-
-.. ocv:function:: void gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101)
-
-    :param src: Source image. Only  ``CV_8UC1`` and  ``CV_32FC1`` images are supported for now.
-
-    :param dst: Destination image containing cornerness values. It has the same size as ``src`` and ``CV_32FC1`` type.
-
-    :param blockSize: Neighborhood size.
-
-    :param ksize: Aperture parameter for the Sobel operator.
-
-    :param k: Harris detector free parameter.
-
-    :param borderType: Pixel extrapolation method. Only  ``BORDER_REFLECT101`` and  ``BORDER_REPLICATE`` are supported for now.
-
-.. seealso:: :ocv:func:`cornerHarris`
-
-
-
-gpu::cornerMinEigenVal
---------------------------
-Computes the minimum eigen value of a 2x2 derivative covariation matrix at each pixel (the cornerness criteria).
-
-.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
-
-.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
-
-.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` and  ``CV_32FC1`` images are supported for now.
-
-    :param dst: Destination image containing cornerness values. The size is the same. The type is  ``CV_32FC1`` .
-
-    :param blockSize: Neighborhood size.
-
-    :param ksize: Aperture parameter for the Sobel operator.
-
-    :param borderType: Pixel extrapolation method. Only ``BORDER_REFLECT101`` and ``BORDER_REPLICATE`` are supported for now.
-
-.. seealso:: :ocv:func:`cornerMinEigenVal`
-
-
-
-gpu::mulSpectrums
----------------------
-Performs a per-element multiplication of two Fourier spectrums.
-
-.. ocv:function:: void gpu::mulSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream=Stream::Null() )
-
-    :param a: First spectrum.
-
-    :param b: Second spectrum with the same size and type as  ``a`` .
-
-    :param c: Destination spectrum.
-
-    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
-
-    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
-
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
-
-.. seealso:: :ocv:func:`mulSpectrums`
-
-
-
-gpu::mulAndScaleSpectrums
------------------------------
-Performs a per-element multiplication of two Fourier spectrums and scales the result.
-
-.. ocv:function:: void gpu::mulAndScaleSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream=Stream::Null() )
-
-    :param a: First spectrum.
-
-    :param b: Second spectrum with the same size and type as  ``a`` .
-
-    :param c: Destination spectrum.
-
-    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
-
-    :param scale: Scale constant.
-
-    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
-
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
-
-.. seealso:: :ocv:func:`mulSpectrums`
-
-
-
-gpu::dft
-------------
-Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
-
-.. ocv:function:: void gpu::dft( const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix (real or complex).
-
-    :param dst: Destination matrix (real or complex).
-
-    :param dft_size: Size of a discrete Fourier transform.
-
-    :param flags: Optional flags:
-
-        * **DFT_ROWS** transforms each individual row of the source matrix.
-
-        * **DFT_SCALE** scales the result: divide it by the number of elements in the transform (obtained from  ``dft_size`` ).
-
-        * **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real cases are always forward and inverse, respectively).
-
-        * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
-
-Use this function to handle real matrices ( ``CV_32FC1`` ) and complex matrices in the interleaved format ( ``CV_32FC2`` ).
-
-The source matrix should be continuous, otherwise reallocation and data copying is performed. The function chooses an operation mode depending on the flags, size, and channel count of the source matrix:
-
-    * If the source matrix is complex and the output is not specified as real, the destination matrix is complex and has the ``dft_size``    size and ``CV_32FC2``    type. The destination matrix contains a full result of the DFT (forward or inverse).
-
-    * If the source matrix is complex and the output is specified as real, the function assumes that its input is the result of the forward transform (see the next item). The destination matrix has the ``dft_size`` size and ``CV_32FC1`` type. It contains the result of the inverse DFT.
-
-    * If the source matrix is real (its type is ``CV_32FC1`` ), forward DFT is performed. The result of the DFT is packed into complex ( ``CV_32FC2`` ) matrix. So, the width of the destination matrix is ``dft_size.width / 2 + 1`` . But if the source is a single column, the height is reduced instead of the width.
-
-.. seealso:: :ocv:func:`dft`
-
-
-gpu::ConvolveBuf
-----------------
-.. ocv:struct:: gpu::ConvolveBuf
-
-Class providing a memory buffer for the :ocv:func:`gpu::convolve` function. It also allows adjusting some specific parameters. ::
-
-    struct CV_EXPORTS ConvolveBuf
-    {
-        Size result_size;
-        Size block_size;
-        Size user_block_size;
-        Size dft_size;
-        int spect_len;
-
-        GpuMat image_spect, templ_spect, result_spect;
-        GpuMat image_block, templ_block, result_data;
-
-        void create(Size image_size, Size templ_size);
-        static Size estimateBlockSize(Size result_size, Size templ_size);
-    };
-
-You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-
-gpu::ConvolveBuf::create
-------------------------
-.. ocv:function:: gpu::ConvolveBuf::create(Size image_size, Size templ_size)
-
-Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments.
-
-
-gpu::convolve
------------------
-Computes a convolution (or cross-correlation) of two images.
-
-.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false)
-
-.. ocv:function:: void gpu::convolve( const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream=Stream::Null() )
-
-    :param image: Source image. Only  ``CV_32FC1`` images are supported for now.
-
-    :param templ: Template image. The size is not greater than the  ``image`` size. The type is the same as  ``image`` .
-
-    :param result: Result image. If  ``image`` is  *W x H*  and ``templ`` is  *w x h*, then  ``result`` must be *W-w+1 x H-h+1*.
-
-    :param ccorr: Flag that selects cross-correlation instead of convolution.
-
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::ConvolveBuf`.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::filter2D`
-
-gpu::MatchTemplateBuf
----------------------
-.. ocv:struct:: gpu::MatchTemplateBuf
-
-Class providing memory buffers for the :ocv:func:`gpu::matchTemplate` function. It also allows adjusting some specific parameters. ::
-
-    struct CV_EXPORTS MatchTemplateBuf
-    {
-        Size user_block_size;
-        GpuMat imagef, templf;
-        std::vector<GpuMat> images;
-        std::vector<GpuMat> image_sums;
-        std::vector<GpuMat> image_sqsums;
-    };
-
-You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::matchTemplate` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-
-gpu::matchTemplate
-----------------------
-Computes a proximity map for a raster template and an image where the template is searched for.
-
-.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null())
-
-.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null())
-
-    :param image: Source image.  ``CV_32F`` and  ``CV_8U`` depth images (1..4 channels) are supported for now.
-
-    :param templ: Template image with the size and type the same as  ``image`` .
-
-    :param result: Map containing comparison results ( ``CV_32FC1`` ). If  ``image`` is  *W x H*  and ``templ`` is  *w x h*, then  ``result`` must be *W-w+1 x H-h+1*.
-
-    :param method: Specifies the way to compare the template with the image.
-
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::MatchTemplateBuf`.
-
-    :param stream: Stream for the asynchronous version.
-
-    The following methods are supported for the ``CV_8U`` depth images for now:
-
-    * ``CV_TM_SQDIFF``
-    * ``CV_TM_SQDIFF_NORMED``
-    * ``CV_TM_CCORR``
-    * ``CV_TM_CCORR_NORMED``
-    * ``CV_TM_CCOEFF``
-    * ``CV_TM_CCOEFF_NORMED``
-
-    The following methods are supported for the ``CV_32F`` images for now:
-
-    * ``CV_TM_SQDIFF``
-    * ``CV_TM_CCORR``
-
-.. seealso:: :ocv:func:`matchTemplate`
-
-
-gpu::remap
---------------
-Applies a generic geometrical transformation to an image.
-
-.. ocv:function:: void gpu::remap( const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
-
-    :param src: Source image.
-
-    :param dst: Destination image with the size the same as  ``xmap`` and the type the same as  ``src`` .
-
-    :param xmap: X values. Only  ``CV_32FC1`` type is supported.
-
-    :param ymap: Y values. Only  ``CV_32FC1`` type is supported.
-
-    :param interpolation: Interpolation method (see  :ocv:func:`resize` ). ``INTER_NEAREST`` , ``INTER_LINEAR`` and ``INTER_CUBIC`` are supported for now.
-
-    :param borderMode: Pixel extrapolation method (see  :ocv:func:`borderInterpolate` ). ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
-
-    :param borderValue: Value used in case of a constant border. By default, it is 0.
-
-    :param stream: Stream for the asynchronous version.
-
-The function transforms the source image using the specified map:
-
-.. math::
-
-    \texttt{dst} (x,y) =  \texttt{src} (xmap(x,y), ymap(x,y))
-
-Values of pixels with non-integer coordinates are computed using the bilinear interpolation.
-
-.. seealso:: :ocv:func:`remap`
-
-
-
-gpu::cvtColor
------------------
-Converts an image from one color space to another.
-
-.. ocv:function:: void gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null())
-
-    :param src: Source image with  ``CV_8U`` , ``CV_16U`` , or  ``CV_32F`` depth and 1, 3, or 4 channels.
-
-    :param dst: Destination image with the same size and depth as  ``src`` .
-
-    :param code: Color space conversion code. For details, see  :ocv:func:`cvtColor` . Conversion to/from Luv and Bayer color spaces is not supported.
-
-    :param dcn: Number of channels in the destination image. If the parameter is 0, the number of the channels is derived automatically from  ``src`` and the  ``code`` .
-
-    :param stream: Stream for the asynchronous version.
-
-3-channel color spaces (like ``HSV``, ``XYZ``, and so on) can be stored in a 4-channel image for better performance.
-
-.. seealso:: :ocv:func:`cvtColor`
-
-
-
-gpu::swapChannels
------------------
-Exchanges the color channels of an image in-place.
-
-.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null())
-
-    :param image: Source image. Supports only ``CV_8UC4`` type.
-
-    :param dstOrder: Integer array describing how channel values are permuted. The n-th entry of the array contains the number of the channel that is stored in the n-th channel of the output image. For example, given an RGBA image, ``dstOrder = [3,2,1,0]`` converts it to ABGR channel order.
-
-    :param stream: Stream for the asynchronous version.
-
-The methods support arbitrary permutations of the original channels, including replication.
-
-
-
-gpu::resize
----------------
-Resizes an image.
-
-.. ocv:function:: void gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
-
-    :param src: Source image.
-
-    :param dst: Destination image  with the same type as  ``src`` . The size is ``dsize`` (when it is non-zero) or the size is computed from  ``src.size()`` , ``fx`` , and  ``fy`` .
-
-    :param dsize: Destination image size. If it is zero, it is computed as:
-
-        .. math::
-            \texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}
-
-        Either  ``dsize`` or both  ``fx`` and  ``fy`` must be non-zero.
-
-    :param fx: Scale factor along the horizontal axis. If it is zero, it is computed as:
-
-        .. math::
-
-            \texttt{(double)dsize.width/src.cols}
-
-    :param fy: Scale factor along the vertical axis. If it is zero, it is computed as:
-
-        .. math::
-
-            \texttt{(double)dsize.height/src.rows}
-
-    :param interpolation: Interpolation method. ``INTER_NEAREST`` , ``INTER_LINEAR`` and ``INTER_CUBIC`` are supported for now.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`resize`
-
-
-
-gpu::warpAffine
--------------------
-Applies an affine transformation to an image.
-
-.. ocv:function:: void gpu::warpAffine( const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags=INTER_LINEAR, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8U`` , ``CV_16U`` , ``CV_32S`` , or  ``CV_32F`` depth and 1, 3, or 4 channels are supported.
-
-    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
-
-    :param M: *2x3*  transformation matrix.
-
-    :param dsize: Size of the destination image.
-
-    :param flags: Combination of interpolation methods (see  :ocv:func:`resize`) and the optional flag  ``WARP_INVERSE_MAP`` specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ). Only ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` interpolation methods are supported.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`warpAffine`
-
-
-
-gpu::buildWarpAffineMaps
-------------------------
-Builds transformation maps for affine transformation.
-
-.. ocv:function:: void gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null())
-
-    :param M: *2x3*  transformation matrix.
-
-    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
-
-    :param dsize: Size of the destination image.
-
-    :param xmap: X values with  ``CV_32FC1`` type.
-
-    :param ymap: Y values with  ``CV_32FC1`` type.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::warpAffine` , :ocv:func:`gpu::remap`
-
-
-
-gpu::warpPerspective
-------------------------
-Applies a perspective transformation to an image.
-
-.. ocv:function:: void gpu::warpPerspective( const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags=INTER_LINEAR, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8U`` , ``CV_16U`` , ``CV_32S`` , or  ``CV_32F`` depth and 1, 3, or 4 channels are supported.
-
-    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
-
-    :param M: *3x3* transformation matrix.
-
-    :param dsize: Size of the destination image.
-
-    :param flags: Combination of interpolation methods (see  :ocv:func:`resize` ) and the optional flag  ``WARP_INVERSE_MAP`` specifying that  ``M`` is the inverse transformation ( ``dst => src`` ). Only  ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` interpolation methods are supported.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`warpPerspective`
-
-
-
-gpu::buildWarpPerspectiveMaps
------------------------------
-Builds transformation maps for perspective transformation.
-
-.. ocv:function:: void gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null())
-
-    :param M: *3x3*  transformation matrix.
-
-    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
-
-    :param dsize: Size of the destination image.
-
-    :param xmap: X values with  ``CV_32FC1`` type.
-
-    :param ymap: Y values with  ``CV_32FC1`` type.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::warpPerspective` , :ocv:func:`gpu::remap`
-
-
-
-gpu::rotate
----------------
-Rotates an image around the origin (0,0) and then shifts it.
-
-.. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
-
-    :param src: Source image. Supports 1, 3 or 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32F`` depth.
-
-    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
-
-    :param dsize: Size of the destination image.
-
-    :param angle: Angle of rotation in degrees.
-
-    :param xShift: Shift along the horizontal axis.
-
-    :param yShift: Shift along the vertical axis.
-
-    :param interpolation: Interpolation method. Only  ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` are supported.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::warpAffine`
-
-
-
-gpu::copyMakeBorder
------------------------
-Forms a border around an image.
-
-.. ocv:function:: void gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and  ``CV_32FC1`` types are supported.
-
-    :param dst: Destination image with the same type as  ``src``. The size is  ``Size(src.cols+left+right, src.rows+top+bottom)`` .
-
-    :param top:
-
-    :param bottom:
-
-    :param left:
-
-    :param right: Number of pixels in each direction from the source image rectangle to extrapolate. For example:  ``top=1, bottom=1, left=1, right=1`` mean that 1 pixel-wide border needs to be built.
-
-    :param borderType: Border type. See  :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
-
-    :param value: Border value.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`copyMakeBorder`
-
-
-
-gpu::rectStdDev
--------------------
-Computes a standard deviation of integral images.
-
-.. ocv:function:: void gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only the ``CV_32SC1`` type is supported.
-
-    :param sqr: Squared source image. Only  the ``CV_32FC1`` type is supported.
-
-    :param dst: Destination image with the same type and size as  ``src`` .
-
-    :param rect: Rectangular window.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::evenLevels
--------------------
-Computes levels with even distribution.
-
-.. ocv:function:: void gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
-
-    :param levels: Destination array.  ``levels`` has 1 row, ``nLevels`` columns, and the ``CV_32SC1`` type.
-
-    :param nLevels: Number of computed levels.  ``nLevels`` must be at least 2.
-
-    :param lowerLevel: Lower boundary value of the lowest level.
-
-    :param upperLevel: Upper boundary value of the greatest level.
-
-
-
-gpu::histEven
------------------
-Calculates a histogram with evenly distributed bins.
-
-.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8U``, ``CV_16U``, or ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
-
-    :param hist: Destination histogram with one row, ``histSize`` columns, and the ``CV_32S`` type.
-
-    :param histSize: Size of the histogram.
-
-    :param lowerLevel: Lower boundary of lowest-level bin.
-
-    :param upperLevel: Upper boundary of highest-level bin.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::histRange
-------------------
-Calculates a histogram with bins determined by the ``levels`` array.
-
-.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8U`` , ``CV_16U`` , or  ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
-
-    :param hist: Destination histogram with one row, ``(levels.cols-1)`` columns, and the  ``CV_32SC1`` type.
-
-    :param levels: Number of levels in the histogram.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::calcHist
-------------------
-Calculates histogram for one channel 8-bit image.
-
-.. ocv:function:: void gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null())
-
-    :param src: Source image.
-
-    :param hist: Destination histogram with one row, 256 columns, and the  ``CV_32SC1`` type.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::equalizeHist
-------------------
-Equalizes the histogram of a grayscale image.
-
-.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null())
-
-    :param src: Source image.
-
-    :param dst: Destination image.
-
-    :param hist: Destination histogram with one row, 256 columns, and the  ``CV_32SC1`` type.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`equalizeHist`
-
-
-
-gpu::buildWarpPlaneMaps
------------------------
-Builds plane warping maps.
-
-.. ocv:function:: void gpu::buildWarpPlaneMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, const Mat & T, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::buildWarpCylindricalMaps
------------------------------
-Builds cylindrical warping maps.
-
-.. ocv:function:: void gpu::buildWarpCylindricalMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::buildWarpSphericalMaps
----------------------------
-Builds spherical warping maps.
-
-.. ocv:function:: void gpu::buildWarpSphericalMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::pyrDown
--------------------
-Smoothes an image and downsamples it.
-
-.. ocv:function:: void gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source image.
-
-    :param dst: Destination image. Will have ``Size((src.cols+1)/2, (src.rows+1)/2)`` size and the same type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`pyrDown`
-
-
-
-gpu::pyrUp
--------------------
-Upsamples an image and then smoothes it.
-
-.. ocv:function:: void gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source image.
-
-    :param dst: Destination image. Will have ``Size(src.cols*2, src.rows*2)`` size and the same type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`pyrUp`
-
-
-
-gpu::blendLinear
--------------------
-Performs linear blending of two images.
-
-.. ocv:function:: void gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, GpuMat& result, Stream& stream = Stream::Null())
-
-    :param img1: First image. Supports only ``CV_8U`` and ``CV_32F`` depth.
-
-    :param img2: Second image. Must have the same size and the same type as ``img1`` .
-
-    :param weights1: Weights for first image. Must have tha same size as ``img1`` . Supports only ``CV_32F`` type.
-
-    :param weights2: Weights for second image. Must have tha same size as ``img2`` . Supports only ``CV_32F`` type.
-
-    :param result: Destination image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-gpu::bilateralFilter
---------------------
-Performs bilateral filtering of passed image
-
-.. ocv:function:: void gpu::bilateralFilter( const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode=BORDER_DEFAULT, Stream& stream=Stream::Null() )
-
-    :param src: Source image. Supports only (channles != 2 && depth() != CV_8S && depth() != CV_32S && depth() != CV_64F).
-
-    :param dst: Destination imagwe.
-
-    :param kernel_size: Kernel window size.
-
-    :param sigma_color: Filter sigma in the color space.
-
-    :param sigma_spatial:  Filter sigma in the coordinate space.
-
-    :param borderMode:  Border type. See :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso::
-
-    :ocv:func:`bilateralFilter`,
-
-
-gpu::nonLocalMeans
--------------------
-Performs pure non local means denoising without any simplification, and thus it is not fast.
-
-.. ocv:function:: void gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null())
-
-    :param src: Source image. Supports only CV_8UC1, CV_8UC2 and CV_8UC3.
-
-    :param dst: Destination image.
-
-    :param h: Filter sigma regulating filter strength for color.
-
-    :param search_window: Size of search window.
-
-    :param block_size: Size of block used for computing weights.
-
-    :param borderMode:  Border type. See :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso::
-
-    :ocv:func:`fastNlMeansDenoising`
-
-gpu::FastNonLocalMeansDenoising
--------------------------------
-.. ocv:class:: gpu::FastNonLocalMeansDenoising
-
-    ::
-
-        class FastNonLocalMeansDenoising
-        {
-        public:
-            //! Simple method, recommended for grayscale images (though it supports multichannel images)
-            void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
-            //! Processes luminance and color components separatelly
-            void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
-        };
-
-The class implements fast approximate Non Local Means Denoising algorithm.
-
-gpu::FastNonLocalMeansDenoising::simpleMethod()
------------------------------------------------
-Perform image denoising using Non-local Means Denoising algorithm http://www.ipol.im/pub/algo/bcm_non_local_means_denoising with several computational optimizations. Noise expected to be a gaussian white noise
-
-.. ocv:function:: void gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
-
-    :param src: Input 8-bit 1-channel, 2-channel or 3-channel image.
-
-    :param dst: Output image with the same size and type as  ``src`` .
-
-    :param h: Parameter regulating filter strength. Big h value perfectly removes noise but also removes image details, smaller h value preserves details but also preserves some noise
-
-    :param search_window: Size in pixels of the window that is used to compute weighted average for given pixel. Should be odd. Affect performance linearly: greater search_window - greater denoising time. Recommended value 21 pixels
-
-    :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value 7 pixels
-
-    :param stream: Stream for the asynchronous invocations.
-
-This function expected to be applied to grayscale images. For colored images look at ``FastNonLocalMeansDenoising::labMethod``.
-
-.. seealso::
-
-    :ocv:func:`fastNlMeansDenoising`
-
-gpu::FastNonLocalMeansDenoising::labMethod()
---------------------------------------------
-Modification of ``FastNonLocalMeansDenoising::simpleMethod`` for color images
-
-.. ocv:function:: void gpu::FastNonLocalMeansDenoising::labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
-
-    :param src: Input 8-bit 3-channel image.
-
-    :param dst: Output image with the same size and type as  ``src`` .
-
-    :param h_luminance: Parameter regulating filter strength. Big h value perfectly removes noise but also removes image details, smaller h value preserves details but also preserves some noise
-
-    :param float: The same as h but for color components. For most images value equals 10 will be enought to remove colored noise and do not distort colors
-
-    :param search_window: Size in pixels of the window that is used to compute weighted average for given pixel. Should be odd. Affect performance linearly: greater search_window - greater denoising time. Recommended value 21 pixels
-
-    :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value 7 pixels
-
-    :param stream: Stream for the asynchronous invocations.
-
-The function converts image to CIELAB colorspace and then separately denoise L and AB components with given h parameters using ``FastNonLocalMeansDenoising::simpleMethod`` function.
-
-.. seealso::
-
-    :ocv:func:`fastNlMeansDenoisingColored`
-
-gpu::alphaComp
--------------------
-Composites two images using alpha opacity values contained in each image.
-
-.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null())
-
-    :param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types.
-
-    :param img2: Second image. Must have the same size and the same type as ``img1`` .
-
-    :param dst: Destination image.
-
-    :param alpha_op: Flag specifying the alpha-blending operation:
-
-            * **ALPHA_OVER**
-            * **ALPHA_IN**
-            * **ALPHA_OUT**
-            * **ALPHA_ATOP**
-            * **ALPHA_XOR**
-            * **ALPHA_PLUS**
-            * **ALPHA_OVER_PREMUL**
-            * **ALPHA_IN_PREMUL**
-            * **ALPHA_OUT_PREMUL**
-            * **ALPHA_ATOP_PREMUL**
-            * **ALPHA_XOR_PREMUL**
-            * **ALPHA_PLUS_PREMUL**
-            * **ALPHA_PREMUL**
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::Canny
--------------------
-Finds edges in an image using the [Canny86]_ algorithm.
-
-.. ocv:function:: void gpu::Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
-
-.. ocv:function:: void gpu::Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
-
-.. ocv:function:: void gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
-
-.. ocv:function:: void gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
-
-    :param image: Single-channel 8-bit input image.
-
-    :param dx: First derivative of image in the vertical direction. Support only ``CV_32S`` type.
-
-    :param dy: First derivative of image in the horizontal direction. Support only ``CV_32S`` type.
-
-    :param edges: Output edge map. It has the same size and type as  ``image`` .
-
-    :param low_thresh: First threshold for the hysteresis procedure.
-
-    :param high_thresh: Second threshold for the hysteresis procedure.
-
-    :param apperture_size: Aperture size for the  :ocv:func:`Sobel`  operator.
-
-    :param L2gradient: Flag indicating whether a more accurate  :math:`L_2`  norm  :math:`=\sqrt{(dI/dx)^2 + (dI/dy)^2}`  should be used to compute the image gradient magnitude ( ``L2gradient=true`` ), or a faster default  :math:`L_1`  norm  :math:`=|dI/dx|+|dI/dy|`  is enough ( ``L2gradient=false`` ).
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-.. seealso:: :ocv:func:`Canny`
-
-
-
-gpu::HoughLines
----------------
-Finds lines in a binary image using the classical Hough transform.
-
-.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096)
-
-.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096)
-
-    :param src: 8-bit, single-channel binary source image.
-
-    :param lines: Output vector of lines. Each line is represented by a two-element vector  :math:`(\rho, \theta)` .  :math:`\rho`  is the distance from the coordinate origin  :math:`(0,0)`  (top-left corner of the image).  :math:`\theta`  is the line rotation angle in radians ( :math:`0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}` ).
-
-    :param rho: Distance resolution of the accumulator in pixels.
-
-    :param theta: Angle resolution of the accumulator in radians.
-
-    :param threshold: Accumulator threshold parameter. Only those lines are returned that get enough votes ( :math:`>\texttt{threshold}` ).
-
-    :param doSort: Performs lines sort by votes.
-
-    :param maxLines: Maximum number of output lines.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-.. seealso:: :ocv:func:`HoughLines`
-
-
-
-gpu::HoughLinesDownload
------------------------
-Downloads results from :ocv:func:`gpu::HoughLines` to host memory.
-
-.. ocv:function:: void gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray())
-
-    :param d_lines: Result of :ocv:func:`gpu::HoughLines` .
-
-    :param h_lines: Output host array.
-
-    :param h_votes: Optional output array for line's votes.
-
-.. seealso:: :ocv:func:`gpu::HoughLines`
-
-
-
-gpu::HoughCircles
------------------
-Finds circles in a grayscale image using the Hough transform.
-
-.. ocv:function:: void gpu::HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
-
-.. ocv:function:: void gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
-
-    :param src: 8-bit, single-channel grayscale input image.
-
-    :param circles: Output vector of found circles. Each vector is encoded as a 3-element floating-point vector  :math:`(x, y, radius)` .
-
-    :param method: Detection method to use. Currently, the only implemented method is  ``CV_HOUGH_GRADIENT`` , which is basically  *21HT* , described in  [Yuen90]_.
-
-    :param dp: Inverse ratio of the accumulator resolution to the image resolution. For example, if  ``dp=1`` , the accumulator has the same resolution as the input image. If  ``dp=2`` , the accumulator has half as big width and height.
-
-    :param minDist: Minimum distance between the centers of the detected circles. If the parameter is too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is too large, some circles may be missed.
-
-    :param cannyThreshold: The higher threshold of the two passed to  the :ocv:func:`gpu::Canny`  edge detector (the lower one is twice smaller).
-
-    :param votesThreshold: The accumulator threshold for the circle centers at the detection stage. The smaller it is, the more false circles may be detected.
-
-    :param minRadius: Minimum circle radius.
-
-    :param maxRadius: Maximum circle radius.
-
-    :param maxCircles: Maximum number of output circles.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-.. seealso:: :ocv:func:`HoughCircles`
-
-
-
-gpu::HoughCirclesDownload
--------------------------
-Downloads results from :ocv:func:`gpu::HoughCircles` to host memory.
-
-.. ocv:function:: void gpu::HoughCirclesDownload(const GpuMat& d_circles, OutputArray h_circles)
-
-    :param d_circles: Result of :ocv:func:`gpu::HoughCircles` .
-
-    :param h_circles: Output host array.
-
-.. seealso:: :ocv:func:`gpu::HoughCircles`
diff --git a/modules/gpu/include/opencv2/gpu.hpp b/modules/gpu/include/opencv2/gpu.hpp
index 19fd7c93e9..7397321232 100644
--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -52,6 +52,8 @@
 #include "opencv2/core/gpumat.hpp"
 #include "opencv2/gpuarithm.hpp"
 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuimgproc.hpp"
+
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/features2d.hpp"
@@ -60,280 +62,7 @@ namespace cv { namespace gpu {
 ////////////////////////////// Image processing //////////////////////////////
 
 
-enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
-       ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
-
-//! Composite two images using alpha opacity values contained in each image
-//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
-CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());
-
-//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
-//! supports only CV_32FC1 map type
-CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
-                      int interpolation, int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(),
-                      Stream& stream = Stream::Null());
-
-//! Does mean shift filtering on GPU.
-CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
-                                   TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
-                                   Stream& stream = Stream::Null());
-
-//! Does mean shift procedure on GPU.
-CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,
-                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
-                              Stream& stream = Stream::Null());
-
-//! Does mean shift segmentation with elimination of small regions.
-CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,
-                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
-
-//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
-//! Supported types of input disparity: CV_8U, CV_16S.
-//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
-CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());
-
-//! Reprojects disparity image to 3D space.
-//! Supports CV_8U and CV_16S types of input disparity.
-//! The output is a 3- or 4-channel floating-point matrix.
-//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
-//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
-CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null());
-
-//! converts image from one color space to another
-CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
-
-enum
-{
-    // Bayer Demosaicing (Malvar, He, and Cutler)
-    COLOR_BayerBG2BGR_MHT = 256,
-    COLOR_BayerGB2BGR_MHT = 257,
-    COLOR_BayerRG2BGR_MHT = 258,
-    COLOR_BayerGR2BGR_MHT = 259,
-
-    COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT,
-    COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT,
-    COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT,
-    COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT,
-
-    COLOR_BayerBG2GRAY_MHT = 260,
-    COLOR_BayerGB2GRAY_MHT = 261,
-    COLOR_BayerRG2GRAY_MHT = 262,
-    COLOR_BayerGR2GRAY_MHT = 263
-};
-CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null());
-
-//! swap channels
-//! dstOrder - Integer array describing how channel values are permutated. The n-th entry
-//!            of the array contains the number of the channel that is stored in the n-th channel of
-//!            the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR
-//!            channel order.
-CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null());
-
-//! Routines for correcting image color gamma
-CV_EXPORTS void gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward = true, Stream& stream = Stream::Null());
-
-//! resizes the image
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
-CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
-
-//! warps the image using affine transformation
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
-    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
-
-CV_EXPORTS void buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());
-
-//! warps the image using perspective transformation
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
-    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
-
-CV_EXPORTS void buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());
-
-//! builds plane warping maps
-CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale,
-                                   GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
-
-//! builds cylindrical warping maps
-CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
-                                         GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
-
-//! builds spherical warping maps
-CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
-                                       GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
-
-//! rotates an image around the origin (0,0) and then shifts it
-//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
-CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
-                       int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
-
-//! computes Harris cornerness criteria at each image pixel
-CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
-CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
-CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k,
-                             int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());
-
-//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
-CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
-CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
-CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize,
-    int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());
-
-struct CV_EXPORTS MatchTemplateBuf
-{
-    Size user_block_size;
-    GpuMat imagef, templf;
-    std::vector<GpuMat> images;
-    std::vector<GpuMat> image_sums;
-    std::vector<GpuMat> image_sqsums;
-};
-
-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null());
-
-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null());
-
-//! smoothes the source image and downsamples it
-CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! upsamples the source image and then smoothes it
-CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! performs linear blending of two images
-//! to avoid accuracy errors sum of weigths shouldn't be very close to zero
-CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                            GpuMat& result, Stream& stream = Stream::Null());
-
-//! Performa bilateral filtering of passsed image
-CV_EXPORTS void bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial,
-                                int borderMode = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
-//! Brute force non-local means algorith (slow but universal)
-CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null());
-
-//! Fast (but approximate)version of non-local means algorith similar to CPU function (running sums technique)
-class CV_EXPORTS FastNonLocalMeansDenoising
-{
-public:
-    //! Simple method, recommended for grayscale images (though it supports multichannel images)
-    void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
-
-    //! Processes luminance and color components separatelly
-    void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
-
-private:
-
-    GpuMat buffer, extended_src_buffer;
-    GpuMat lab, l, ab;
-};
-
-struct CV_EXPORTS CannyBuf
-{
-    void create(const Size& image_size, int apperture_size = 3);
-    void release();
-
-    GpuMat dx, dy;
-    GpuMat mag;
-    GpuMat map;
-    GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
-};
-
-CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
-class CV_EXPORTS ImagePyramid
-{
-public:
-    inline ImagePyramid() : nLayers_(0) {}
-    inline ImagePyramid(const GpuMat& img, int nLayers, Stream& stream = Stream::Null())
-    {
-        build(img, nLayers, stream);
-    }
-
-    void build(const GpuMat& img, int nLayers, Stream& stream = Stream::Null());
-
-    void getLayer(GpuMat& outImg, Size outRoi, Stream& stream = Stream::Null()) const;
-
-    inline void release()
-    {
-        layer0_.release();
-        pyramid_.clear();
-        nLayers_ = 0;
-    }
-
-private:
-    GpuMat layer0_;
-    std::vector<GpuMat> pyramid_;
-    int nLayers_;
-};
-
-//! HoughLines
-
-struct HoughLinesBuf
-{
-    GpuMat accum;
-    GpuMat list;
-};
-
-CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
-CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
-CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
-
-//! HoughLinesP
-
-//! finds line segments in the black-n-white image using probabalistic Hough transform
-CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
-
-//! HoughCircles
-
-struct HoughCirclesBuf
-{
-    GpuMat edges;
-    GpuMat accum;
-    GpuMat list;
-    CannyBuf cannyBuf;
-};
-
-CV_EXPORTS void HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
-CV_EXPORTS void HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
-CV_EXPORTS void HoughCirclesDownload(const GpuMat& d_circles, OutputArray h_circles);
-
-//! finds arbitrary template in the grayscale image using Generalized Hough Transform
-//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
-//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
-class CV_EXPORTS GeneralizedHough_GPU : public cv::Algorithm
-{
-public:
-    static Ptr<GeneralizedHough_GPU> create(int method);
-
-    virtual ~GeneralizedHough_GPU();
 
-    //! set template to search
-    void setTemplate(const GpuMat& templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1));
-    void setTemplate(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter = Point(-1, -1));
-
-    //! find template on image
-    void detect(const GpuMat& image, GpuMat& positions, int cannyThreshold = 100);
-    void detect(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions);
-
-    void download(const GpuMat& d_positions, OutputArray h_positions, OutputArray h_votes = noArray());
-
-    void release();
-
-protected:
-    virtual void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter) = 0;
-    virtual void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions) = 0;
-    virtual void releaseImpl() = 0;
-
-private:
-    GpuMat edges_;
-    CannyBuf cannyBuf_;
-};
 
 ///////////////////////////// Calibration 3D //////////////////////////////////
 
@@ -351,68 +80,11 @@ CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& c
 
 //////////////////////////////// Image Labeling ////////////////////////////////
 
-//!performs labeling via graph cuts of a 2D regular 4-connected graph.
-CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
-                         GpuMat& buf, Stream& stream = Stream::Null());
 
-//!performs labeling via graph cuts of a 2D regular 8-connected graph.
-CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
-                         GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
-                         GpuMat& labels,
-                         GpuMat& buf, Stream& stream = Stream::Null());
-
-//! compute mask for Generalized Flood fill componetns labeling.
-CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
-
-//! performs connected componnents labeling.
-CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, int flags = 0, Stream& stream = Stream::Null());
 
 ////////////////////////////////// Histograms //////////////////////////////////
 
-//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
-CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel);
-//! Calculates histogram with evenly distributed bins for signle channel source.
-//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.
-//! Output hist will have one row and histSize cols and CV_32SC1 type.
-CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
-CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
-//! Calculates histogram with evenly distributed bins for four-channel source.
-//! All channels of source are processed separately.
-//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.
-//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.
-CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
-CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
-//! Calculates histogram with bins determined by levels array.
-//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
-//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.
-//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.
-CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null());
-CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null());
-//! Calculates histogram with bins determined by levels array.
-//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
-//! All channels of source are processed separately.
-//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
-//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
-CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
-CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null());
-
-//! Calculates histogram for 8u one channel image
-//! Output hist will have one row, 256 cols and CV32SC1 type.
-CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
-CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
-
-//! normalizes the grayscale image brightness and contrast by normalizing its histogram
-CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
-CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
-
-class CV_EXPORTS CLAHE : public cv::CLAHE
-{
-public:
-    using cv::CLAHE::apply;
-    virtual void apply(InputArray src, OutputArray dst, Stream& stream) = 0;
-};
-CV_EXPORTS Ptr<cv::gpu::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+
 
 //////////////////////////////// StereoBM_GPU ////////////////////////////////
 
@@ -1097,52 +769,7 @@ public:
     GpuMat buf;
 };
 
-class CV_EXPORTS GoodFeaturesToTrackDetector_GPU
-{
-public:
-    explicit GoodFeaturesToTrackDetector_GPU(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
-        int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
-
-    //! return 1 rows matrix with CV_32FC2 type
-    void operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask = GpuMat());
 
-    int maxCorners;
-    double qualityLevel;
-    double minDistance;
-
-    int blockSize;
-    bool useHarrisDetector;
-    double harrisK;
-
-    void releaseMemory()
-    {
-        Dx_.release();
-        Dy_.release();
-        buf_.release();
-        eig_.release();
-        minMaxbuf_.release();
-        tmpCorners_.release();
-    }
-
-private:
-    GpuMat Dx_;
-    GpuMat Dy_;
-    GpuMat buf_;
-    GpuMat eig_;
-    GpuMat minMaxbuf_;
-    GpuMat tmpCorners_;
-};
-
-inline GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU(int maxCorners_, double qualityLevel_, double minDistance_,
-        int blockSize_, bool useHarrisDetector_, double harrisK_)
-{
-    maxCorners = maxCorners_;
-    qualityLevel = qualityLevel_;
-    minDistance = minDistance_;
-    blockSize = blockSize_;
-    useHarrisDetector = useHarrisDetector_;
-    harrisK = harrisK_;
-}
 
 
 class CV_EXPORTS PyrLKOpticalFlow
diff --git a/modules/gpu/perf/perf_denoising.cpp b/modules/gpu/perf/perf_denoising.cpp
deleted file mode 100644
index 1e33601d60..0000000000
--- a/modules/gpu/perf/perf_denoising.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace std;
-using namespace testing;
-using namespace perf;
-
-#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)
-
-//////////////////////////////////////////////////////////////////////
-// BilateralFilter
-
-DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);
-
-PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
-            Combine(GPU_DENOISING_IMAGE_SIZES,
-                    Values(CV_8U, CV_32F),
-                    GPU_CHANNELS_1_3,
-                    Values(3, 5, 9)))
-{
-    declare.time(60.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int kernel_size = GET_PARAM(3);
-
-    const float sigma_color = 7;
-    const float sigma_spatial = 5;
-    const int borderMode = cv::BORDER_REFLECT101;
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::bilateralFilter(d_src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// nonLocalMeans
-
-DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
-
-PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES,
-                    Values<MatDepth>(CV_8U),
-                    GPU_CHANNELS_1_3,
-                    Values(21),
-                    Values(5)))
-{
-    declare.time(600.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int search_widow_size = GET_PARAM(3);
-    const int block_size = GET_PARAM(4);
-
-    const float h = 10;
-    const int borderMode = cv::BORDER_REFLECT101;
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::nonLocalMeans(d_src, dst, h, search_widow_size, block_size, borderMode);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-
-//////////////////////////////////////////////////////////////////////
-// fastNonLocalMeans
-
-DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
-
-PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES,
-                    Values<MatDepth>(CV_8U),
-                    GPU_CHANNELS_1_3,
-                    Values(21),
-                    Values(7)))
-{
-    declare.time(60.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int search_widow_size = GET_PARAM(2);
-    const int block_size = GET_PARAM(3);
-
-    const float h = 10;
-    const int type = CV_MAKE_TYPE(depth, 1);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::FastNonLocalMeansDenoising fnlmd;
-
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// fastNonLocalMeans (colored)
-
-DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);
-
-PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
-            Combine(GPU_DENOISING_IMAGE_SIZES,
-                    Values<MatDepth>(CV_8U),
-                    Values(21),
-                    Values(7)))
-{
-    declare.time(60.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int search_widow_size = GET_PARAM(2);
-    const int block_size = GET_PARAM(3);
-
-    const float h = 10;
-    const int type = CV_MAKE_TYPE(depth, 3);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::FastNonLocalMeansDenoising fnlmd;
-
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
deleted file mode 100644
index 5f8e9b297f..0000000000
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ /dev/null
@@ -1,1631 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace std;
-using namespace testing;
-using namespace perf;
-
-//////////////////////////////////////////////////////////////////////
-// Remap
-
-enum { HALF_SIZE=0, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH };
-CV_ENUM(RemapMode, HALF_SIZE, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH);
-
-void generateMap(cv::Mat& map_x, cv::Mat& map_y, int remapMode)
-{
-    for (int j = 0; j < map_x.rows; ++j)
-    {
-        for (int i = 0; i < map_x.cols; ++i)
-        {
-            switch (remapMode)
-            {
-            case HALF_SIZE:
-                if (i > map_x.cols*0.25 && i < map_x.cols*0.75 && j > map_x.rows*0.25 && j < map_x.rows*0.75)
-                {
-                    map_x.at<float>(j,i) = 2.f * (i - map_x.cols * 0.25f) + 0.5f;
-                    map_y.at<float>(j,i) = 2.f * (j - map_x.rows * 0.25f) + 0.5f;
-                }
-                else
-                {
-                    map_x.at<float>(j,i) = 0.f;
-                    map_y.at<float>(j,i) = 0.f;
-                }
-                break;
-            case UPSIDE_DOWN:
-                map_x.at<float>(j,i) = static_cast<float>(i);
-                map_y.at<float>(j,i) = static_cast<float>(map_x.rows - j);
-                break;
-            case REFLECTION_X:
-                map_x.at<float>(j,i) = static_cast<float>(map_x.cols - i);
-                map_y.at<float>(j,i) = static_cast<float>(j);
-                break;
-            case REFLECTION_BOTH:
-                map_x.at<float>(j,i) = static_cast<float>(map_x.cols - i);
-                map_y.at<float>(j,i) = static_cast<float>(map_x.rows - j);
-                break;
-            } // end of switch
-        }
-    }
-}
-
-DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border_Mode, cv::Size, MatDepth, MatCn, Interpolation, BorderMode, RemapMode);
-
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES,
-                    RemapMode::all()))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = GET_PARAM(3);
-    const int borderMode = GET_PARAM(4);
-    const int remapMode = GET_PARAM(5);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat xmap(size, CV_32FC1);
-    cv::Mat ymap(size, CV_32FC1);
-    generateMap(xmap, ymap, remapMode);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        const cv::gpu::GpuMat d_xmap(xmap);
-        const cv::gpu::GpuMat d_ymap(ymap);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::remap(d_src, dst, d_xmap, d_ymap, interpolation, borderMode);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// Resize
-
-DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Scale, cv::Size, MatDepth, MatCn, Interpolation, double);
-
-PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    Values(0.5, 0.3, 2.0)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = GET_PARAM(3);
-    const double f = GET_PARAM(4);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::resize(d_src, dst, cv::Size(), f, f, interpolation);
-
-        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// ResizeArea
-
-DEF_PARAM_TEST(Sz_Depth_Cn_Scale, cv::Size, MatDepth, MatCn, double);
-
-PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(0.2, 0.1, 0.05)))
-{
-    declare.time(1.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = cv::INTER_AREA;
-    const double f = GET_PARAM(3);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::resize(d_src, dst, cv::Size(), f, f, interpolation);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// WarpAffine
-
-DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border, cv::Size, MatDepth, MatCn, Interpolation, BorderMode);
-
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = GET_PARAM(3);
-    const int borderMode = GET_PARAM(4);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    const double aplha = CV_PI / 4;
-    const double mat[2 * 3] =
-    {
-        std::cos(aplha), -std::sin(aplha), src.cols / 2,
-        std::sin(aplha),  std::cos(aplha), 0
-    };
-    const cv::Mat M(2, 3, CV_64F, (void*) mat);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::warpAffine(d_src, dst, M, size, interpolation, borderMode);
-
-        GPU_SANITY_CHECK(dst, 1);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::warpAffine(src, dst, M, size, interpolation, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// WarpPerspective
-
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = GET_PARAM(3);
-    const int borderMode = GET_PARAM(4);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    const double aplha = CV_PI / 4;
-    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
-                         {std::sin(aplha),  std::cos(aplha), 0},
-                         {0.0,              0.0,             1.0}};
-    const cv::Mat M(3, 3, CV_64F, (void*) mat);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::warpPerspective(d_src, dst, M, size, interpolation, borderMode);
-
-        GPU_SANITY_CHECK(dst, 1);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// Threshold
-
-CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV)
-
-DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp);
-
-PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-            Values(CV_8U, CV_16U, CV_32F, CV_64F),
-            ThreshOp::all()))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int threshOp = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::threshold(d_src, dst, 100.0, 255.0, threshOp);
-
-        GPU_SANITY_CHECK(dst, 1e-10);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::threshold(src, dst, 100.0, 255.0, threshOp);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// HistEvenC1
-
-PERF_TEST_P(Sz_Depth, ImgProc_HistEvenC1,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_16S)))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-
-    cv::Mat src(size, depth);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::histEven(d_src, dst, d_buf, 30, 0, 180);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        const int hbins = 30;
-        const float hranges[] = {0.0f, 180.0f};
-        const int histSize[] = {hbins};
-        const float* ranges[] = {hranges};
-        const int channels[] = {0};
-
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::calcHist(&src, 1, channels, cv::Mat(), dst, 1, histSize, ranges);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// HistEvenC4
-
-PERF_TEST_P(Sz_Depth, ImgProc_HistEvenC4,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_16S)))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-
-    cv::Mat src(size, CV_MAKE_TYPE(depth, 4));
-    declare.in(src, WARMUP_RNG);
-
-    int histSize[] = {30, 30, 30, 30};
-    int lowerLevel[] = {0, 0, 0, 0};
-    int upperLevel[] = {180, 180, 180, 180};
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_hist[4];
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel);
-
-        cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3;
-        d_hist[0].download(cpu_hist0);
-        d_hist[1].download(cpu_hist1);
-        d_hist[2].download(cpu_hist2);
-        d_hist[3].download(cpu_hist3);
-        SANITY_CHECK(cpu_hist0);
-        SANITY_CHECK(cpu_hist1);
-        SANITY_CHECK(cpu_hist2);
-        SANITY_CHECK(cpu_hist3);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// CalcHist
-
-PERF_TEST_P(Sz, ImgProc_CalcHist,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::calcHist(d_src, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// EqualizeHist
-
-PERF_TEST_P(Sz, ImgProc_EqualizeHist,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_hist;
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::equalizeHist(d_src, dst, d_hist, d_buf);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::equalizeHist(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double);
-
-PERF_TEST_P(Sz_ClipLimit, ImgProc_CLAHE,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(0.0, 40.0)))
-{
-    const cv::Size size = GET_PARAM(0);
-    const double clipLimit = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::Ptr<cv::gpu::CLAHE> clahe = cv::gpu::createCLAHE(clipLimit);
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() clahe->apply(d_src, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Ptr<cv::CLAHE> clahe = cv::createCLAHE(clipLimit);
-        cv::Mat dst;
-
-        TEST_CYCLE() clahe->apply(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// Canny
-
-DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);
-
-PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny,
-            Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
-                    Values(3, 5),
-                    Bool()))
-{
-    const string fileName = GET_PARAM(0);
-    const int apperture_size = GET_PARAM(1);
-    const bool useL2gradient = GET_PARAM(2);
-
-    const cv::Mat image = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    const double low_thresh = 50.0;
-    const double high_thresh = 100.0;
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_image(image);
-        cv::gpu::GpuMat dst;
-        cv::gpu::CannyBuf d_buf;
-
-        TEST_CYCLE() cv::gpu::Canny(d_image, d_buf, dst, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::Canny(image, dst, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// MeanShiftFiltering
-
-DEF_PARAM_TEST_1(Image, string);
-
-PERF_TEST_P(Image, ImgProc_MeanShiftFiltering,
-            Values<string>("gpu/meanshift/cones.png"))
-{
-    declare.time(300.0);
-
-    const cv::Mat img = readImage(GetParam());
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat rgba;
-    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);
-
-    const int sp = 50;
-    const int sr = 50;
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(rgba);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::meanShiftFiltering(d_src, dst, sp, sr);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::pyrMeanShiftFiltering(img, dst, sp, sr);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// MeanShiftProc
-
-PERF_TEST_P(Image, ImgProc_MeanShiftProc,
-            Values<string>("gpu/meanshift/cones.png"))
-{
-    declare.time(300.0);
-
-    const cv::Mat img = readImage(GetParam());
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat rgba;
-    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);
-
-    const int sp = 50;
-    const int sr = 50;
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(rgba);
-        cv::gpu::GpuMat dstr;
-        cv::gpu::GpuMat dstsp;
-
-        TEST_CYCLE() cv::gpu::meanShiftProc(d_src, dstr, dstsp, sp, sr);
-
-        GPU_SANITY_CHECK(dstr);
-        GPU_SANITY_CHECK(dstsp);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// MeanShiftSegmentation
-
-PERF_TEST_P(Image, ImgProc_MeanShiftSegmentation,
-            Values<string>("gpu/meanshift/cones.png"))
-{
-    declare.time(300.0);
-
-    const cv::Mat img = readImage(GetParam());
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat rgba;
-    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);
-
-    const int sp = 10;
-    const int sr = 10;
-    const int minsize = 20;
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(rgba);
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::gpu::meanShiftSegmentation(d_src, dst, sp, sr, minsize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BlendLinear
-
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_BlendLinear,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat img1(size, type);
-    cv::Mat img2(size, type);
-    declare.in(img1, img2, WARMUP_RNG);
-
-    const cv::Mat weights1(size, CV_32FC1, cv::Scalar::all(0.5));
-    const cv::Mat weights2(size, CV_32FC1, cv::Scalar::all(0.5));
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_img1(img1);
-        const cv::gpu::GpuMat d_img2(img2);
-        const cv::gpu::GpuMat d_weights1(weights1);
-        const cv::gpu::GpuMat d_weights2(weights2);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::blendLinear(d_img1, d_img2, d_weights1, d_weights2, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate8U
-
-CV_ENUM(TemplateMethod, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
-
-DEF_PARAM_TEST(Sz_TemplateSz_Cn_Method, cv::Size, cv::Size, MatCn, TemplateMethod);
-
-PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate8U,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(cv::Size(5, 5), cv::Size(16, 16), cv::Size(30, 30)),
-                    GPU_CHANNELS_1_3_4,
-                    TemplateMethod::all()))
-{
-    declare.time(300.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const cv::Size templ_size = GET_PARAM(1);
-    const int cn = GET_PARAM(2);
-    const int method = GET_PARAM(3);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_8U, cn));
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_8U, cn));
-    declare.in(image, templ, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_image(image);
-        const cv::gpu::GpuMat d_templ(templ);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
-
-        GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::matchTemplate(image, templ, dst, method);
-
-        CPU_SANITY_CHECK(dst);
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate32F
-
-PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(cv::Size(5, 5), cv::Size(16, 16), cv::Size(30, 30)),
-                    GPU_CHANNELS_1_3_4,
-                    Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))))
-{
-    declare.time(300.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const cv::Size templ_size = GET_PARAM(1);
-    const int cn = GET_PARAM(2);
-    int method = GET_PARAM(3);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_32F, cn));
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_32F, cn));
-    declare.in(image, templ, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_image(image);
-        const cv::gpu::GpuMat d_templ(templ);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
-
-        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::matchTemplate(image, templ, dst, method);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// CornerHarris
-
-DEF_PARAM_TEST(Image_Type_Border_BlockSz_ApertureSz, string, MatType, BorderMode, int, int);
-
-PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerHarris,
-            Combine(Values<string>("gpu/stereobm/aloe-L.png"),
-                    Values(CV_8UC1, CV_32FC1),
-                    Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-                    Values(3, 5, 7),
-                    Values(0, 3, 5, 7)))
-{
-    const string fileName = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int borderMode = GET_PARAM(2);
-    const int blockSize = GET_PARAM(3);
-    const int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    const double k = 0.5;
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_Dx;
-        cv::gpu::GpuMat d_Dy;
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode);
-
-        GPU_SANITY_CHECK(dst, 1e-4);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// CornerMinEigenVal
-
-PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerMinEigenVal,
-            Combine(Values<string>("gpu/stereobm/aloe-L.png"),
-                    Values(CV_8UC1, CV_32FC1),
-                    Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-                    Values(3, 5, 7),
-                    Values(0, 3, 5, 7)))
-{
-    const string fileName = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int borderMode = GET_PARAM(2);
-    const int blockSize = GET_PARAM(3);
-    const int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_Dx;
-        cv::gpu::GpuMat d_Dy;
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode);
-
-        GPU_SANITY_CHECK(dst, 1e-4);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderMode);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BuildWarpPlaneMaps
-
-PERF_TEST_P(Sz, ImgProc_BuildWarpPlaneMaps,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-    const cv::Mat T = cv::Mat::zeros(1, 3, CV_32F);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat map_x;
-        cv::gpu::GpuMat map_y;
-
-        TEST_CYCLE() cv::gpu::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, T, 1.0, map_x, map_y);
-
-        GPU_SANITY_CHECK(map_x);
-        GPU_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BuildWarpCylindricalMaps
-
-PERF_TEST_P(Sz, ImgProc_BuildWarpCylindricalMaps,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat map_x;
-        cv::gpu::GpuMat map_y;
-
-        TEST_CYCLE() cv::gpu::buildWarpCylindricalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
-
-        GPU_SANITY_CHECK(map_x);
-        GPU_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BuildWarpSphericalMaps
-
-PERF_TEST_P(Sz, ImgProc_BuildWarpSphericalMaps,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat map_x;
-        cv::gpu::GpuMat map_y;
-
-        TEST_CYCLE() cv::gpu::buildWarpSphericalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
-
-        GPU_SANITY_CHECK(map_x);
-        GPU_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// Rotate
-
-DEF_PARAM_TEST(Sz_Depth_Cn_Inter, cv::Size, MatDepth, MatCn, Interpolation);
-
-PERF_TEST_P(Sz_Depth_Cn_Inter, ImgProc_Rotate,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-    const int interpolation = GET_PARAM(3);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::rotate(d_src, dst, size, 30.0, 0, 0, interpolation);
-
-        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// PyrDown
-
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::pyrDown(d_src, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::pyrDown(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// PyrUp
-
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::pyrUp(d_src, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::pyrUp(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// CvtColor
-
-DEF_PARAM_TEST(Sz_Depth_Code, cv::Size, MatDepth, CvtColorInfo);
-
-PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColor,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_32F),
-                    Values(CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA),
-                           CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY),
-                           CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2XYZ),
-                           CvtColorInfo(3, 3, cv::COLOR_XYZ2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2YCrCb),
-                           CvtColorInfo(3, 3, cv::COLOR_YCrCb2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2YUV),
-                           CvtColorInfo(3, 3, cv::COLOR_YUV2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
-                           CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
-                           CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
-                           CvtColorInfo(3, 3, cv::COLOR_LBGR2Lab),
-                           CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
-                           CvtColorInfo(3, 3, cv::COLOR_LBGR2Luv),
-                           CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
-                           CvtColorInfo(3, 3, cv::COLOR_Lab2LBGR),
-                           CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
-                           CvtColorInfo(3, 3, cv::COLOR_Luv2LRGB))))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const CvtColorInfo info = GET_PARAM(2);
-
-    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
-    cv::randu(src, 0, depth == CV_8U ? 255.0 : 1.0);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::cvtColor(d_src, dst, info.code, info.dcn);
-
-        GPU_SANITY_CHECK(dst, 1e-4);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::cvtColor(src, dst, info.code, info.dcn);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U),
-                    Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
-                           CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
-                           CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
-                           CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
-
-                           CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY),
-                           CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY),
-                           CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY),
-                           CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY))))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const CvtColorInfo info = GET_PARAM(2);
-
-    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::cvtColor(d_src, dst, info.code, info.dcn);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::cvtColor(src, dst, info.code, info.dcn);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-CV_ENUM(DemosaicingCode,
-        COLOR_BayerBG2BGR, COLOR_BayerGB2BGR, COLOR_BayerRG2BGR, COLOR_BayerGR2BGR,
-        COLOR_BayerBG2GRAY, COLOR_BayerGB2GRAY, COLOR_BayerRG2GRAY, COLOR_BayerGR2GRAY,
-        COLOR_BayerBG2BGR_MHT, COLOR_BayerGB2BGR_MHT, COLOR_BayerRG2BGR_MHT, COLOR_BayerGR2BGR_MHT,
-        COLOR_BayerBG2GRAY_MHT, COLOR_BayerGB2GRAY_MHT, COLOR_BayerRG2GRAY_MHT, COLOR_BayerGR2GRAY_MHT)
-
-DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode);
-
-PERF_TEST_P(Sz_Code, ImgProc_Demosaicing,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    DemosaicingCode::all()))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int code = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        if (code >= cv::COLOR_COLORCVT_MAX)
-        {
-            FAIL_NO_CPU();
-        }
-        else
-        {
-            cv::Mat dst;
-
-            TEST_CYCLE() cv::cvtColor(src, dst, code);
-
-            CPU_SANITY_CHECK(dst);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// SwapChannels
-
-PERF_TEST_P(Sz, ImgProc_SwapChannels,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC4);
-    declare.in(src, WARMUP_RNG);
-
-    const int dstOrder[] = {2, 1, 0, 3};
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat dst(src);
-
-        TEST_CYCLE() cv::gpu::swapChannels(dst, dstOrder);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// AlphaComp
-
-CV_ENUM(AlphaOp, ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL, ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL)
-
-DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, AlphaOp);
-
-PERF_TEST_P(Sz_Type_Op, ImgProc_AlphaComp,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4),
-                    AlphaOp::all()))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int alpha_op = GET_PARAM(2);
-
-    cv::Mat img1(size, type);
-    cv::Mat img2(size, type);
-    declare.in(img1, img2, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_img1(img1);
-        const cv::gpu::GpuMat d_img2(img2);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::alphaComp(d_img1, d_img2, dst, alpha_op);
-
-        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// ImagePyramidBuild
-
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidBuild,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    const int nLayers = 5;
-    const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-
-        cv::gpu::ImagePyramid d_pyr;
-
-        TEST_CYCLE() d_pyr.build(d_src, nLayers);
-
-        cv::gpu::GpuMat dst;
-        d_pyr.getLayer(dst, dstSize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// ImagePyramidGetLayer
-
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    const int nLayers = 3;
-    const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        cv::gpu::ImagePyramid d_pyr(d_src, nLayers);
-
-        TEST_CYCLE() d_pyr.getLayer(dst, dstSize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// HoughLines
-
-namespace
-{
-    struct Vec4iComparator
-    {
-        bool operator()(const cv::Vec4i& a, const cv::Vec4i b) const
-        {
-            if (a[0] != b[0]) return a[0] < b[0];
-            else if(a[1] != b[1]) return a[1] < b[1];
-            else if(a[2] != b[2]) return a[2] < b[2];
-            else return a[3] < b[3];
-        }
-    };
-    struct Vec3fComparator
-    {
-        bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const
-        {
-            if(a[0] != b[0]) return a[0] < b[0];
-            else if(a[1] != b[1]) return a[1] < b[1];
-            else return a[2] < b[2];
-        }
-    };
-    struct Vec2fComparator
-    {
-        bool operator()(const cv::Vec2f& a, const cv::Vec2f b) const
-        {
-            if(a[0] != b[0]) return a[0] < b[0];
-            else return a[1] < b[1];
-        }
-    };
-}
-
-PERF_TEST_P(Sz, ImgProc_HoughLines,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    declare.time(30.0);
-
-    const cv::Size size = GetParam();
-
-    const float rho = 1.0f;
-    const float theta = static_cast<float>(CV_PI / 180.0);
-    const int threshold = 300;
-
-    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
-    cv::line(src, cv::Point(0, 100), cv::Point(src.cols, 100), cv::Scalar::all(255), 1);
-    cv::line(src, cv::Point(0, 200), cv::Point(src.cols, 200), cv::Scalar::all(255), 1);
-    cv::line(src, cv::Point(0, 400), cv::Point(src.cols, 400), cv::Scalar::all(255), 1);
-    cv::line(src, cv::Point(100, 0), cv::Point(100, src.rows), cv::Scalar::all(255), 1);
-    cv::line(src, cv::Point(200, 0), cv::Point(200, src.rows), cv::Scalar::all(255), 1);
-    cv::line(src, cv::Point(400, 0), cv::Point(400, src.rows), cv::Scalar::all(255), 1);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_lines;
-        cv::gpu::HoughLinesBuf d_buf;
-
-        TEST_CYCLE() cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
-
-        cv::Mat gpu_lines(d_lines.row(0));
-        cv::Vec2f* begin = gpu_lines.ptr<cv::Vec2f>(0);
-        cv::Vec2f* end = begin + gpu_lines.cols;
-        std::sort(begin, end, Vec2fComparator());
-        SANITY_CHECK(gpu_lines);
-    }
-    else
-    {
-        std::vector<cv::Vec2f> cpu_lines;
-
-        TEST_CYCLE() cv::HoughLines(src, cpu_lines, rho, theta, threshold);
-
-        SANITY_CHECK(cpu_lines);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// HoughLinesP
-
-DEF_PARAM_TEST_1(Image, std::string);
-
-PERF_TEST_P(Image, ImgProc_HoughLinesP,
-            testing::Values("cv/shared/pic5.png", "stitching/a1.png"))
-{
-    declare.time(30.0);
-
-    const std::string fileName = getDataPath(GetParam());
-
-    const float rho = 1.0f;
-    const float theta = static_cast<float>(CV_PI / 180.0);
-    const int threshold = 100;
-    const int minLineLenght = 50;
-    const int maxLineGap = 5;
-
-    const cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat mask;
-    cv::Canny(image, mask, 50, 100);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_mask(mask);
-        cv::gpu::GpuMat d_lines;
-        cv::gpu::HoughLinesBuf d_buf;
-
-        TEST_CYCLE() cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
-
-        cv::Mat gpu_lines(d_lines);
-        cv::Vec4i* begin = gpu_lines.ptr<cv::Vec4i>();
-        cv::Vec4i* end = begin + gpu_lines.cols;
-        std::sort(begin, end, Vec4iComparator());
-        SANITY_CHECK(gpu_lines);
-    }
-    else
-    {
-        std::vector<cv::Vec4i> cpu_lines;
-
-        TEST_CYCLE() cv::HoughLinesP(mask, cpu_lines, rho, theta, threshold, minLineLenght, maxLineGap);
-
-        SANITY_CHECK(cpu_lines);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// HoughCircles
-
-DEF_PARAM_TEST(Sz_Dp_MinDist, cv::Size, float, float);
-
-PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(1.0f, 2.0f, 4.0f),
-                    Values(1.0f)))
-{
-    declare.time(30.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const float dp = GET_PARAM(1);
-    const float minDist = GET_PARAM(2);
-
-    const int minRadius = 10;
-    const int maxRadius = 30;
-    const int cannyThreshold = 100;
-    const int votesThreshold = 15;
-
-    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
-    cv::circle(src, cv::Point(100, 100), 20, cv::Scalar::all(255), -1);
-    cv::circle(src, cv::Point(200, 200), 25, cv::Scalar::all(255), -1);
-    cv::circle(src, cv::Point(200, 100), 25, cv::Scalar::all(255), -1);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_circles;
-        cv::gpu::HoughCirclesBuf d_buf;
-
-        TEST_CYCLE() cv::gpu::HoughCircles(d_src, d_circles, d_buf, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
-
-        cv::Mat gpu_circles(d_circles);
-        cv::Vec3f* begin = gpu_circles.ptr<cv::Vec3f>(0);
-        cv::Vec3f* end = begin + gpu_circles.cols;
-        std::sort(begin, end, Vec3fComparator());
-        SANITY_CHECK(gpu_circles);
-    }
-    else
-    {
-        std::vector<cv::Vec3f> cpu_circles;
-
-        TEST_CYCLE() cv::HoughCircles(src, cpu_circles, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
-
-        SANITY_CHECK(cpu_circles);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// GeneralizedHough
-
-enum { GHT_POSITION = cv::GeneralizedHough::GHT_POSITION,
-       GHT_SCALE    = cv::GeneralizedHough::GHT_SCALE,
-       GHT_ROTATION = cv::GeneralizedHough::GHT_ROTATION
-     };
-
-CV_FLAGS(GHMethod, GHT_POSITION, GHT_SCALE, GHT_ROTATION);
-
-DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size);
-
-PERF_TEST_P(Method_Sz, ImgProc_GeneralizedHough,
-            Combine(Values(GHMethod(GHT_POSITION), GHMethod(GHT_POSITION | GHT_SCALE), GHMethod(GHT_POSITION | GHT_ROTATION), GHMethod(GHT_POSITION | GHT_SCALE | GHT_ROTATION)),
-                    GPU_TYPICAL_MAT_SIZES))
-{
-    declare.time(10);
-
-    const int method = GET_PARAM(0);
-    const cv::Size imageSize = GET_PARAM(1);
-
-    const cv::Mat templ = readImage("cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(templ.empty());
-
-    cv::Mat image(imageSize, CV_8UC1, cv::Scalar::all(0));
-    templ.copyTo(image(cv::Rect(50, 50, templ.cols, templ.rows)));
-
-    cv::RNG rng(123456789);
-    const int objCount = rng.uniform(5, 15);
-    for (int i = 0; i < objCount; ++i)
-    {
-        double scale = rng.uniform(0.7, 1.3);
-        bool rotate = 1 == rng.uniform(0, 2);
-
-        cv::Mat obj;
-        cv::resize(templ, obj, cv::Size(), scale, scale);
-        if (rotate)
-            obj = obj.t();
-
-        cv::Point pos;
-
-        pos.x = rng.uniform(0, image.cols - obj.cols);
-        pos.y = rng.uniform(0, image.rows - obj.rows);
-
-        cv::Mat roi = image(cv::Rect(pos, obj.size()));
-        cv::add(roi, obj, roi);
-    }
-
-    cv::Mat edges;
-    cv::Canny(image, edges, 50, 100);
-
-    cv::Mat dx, dy;
-    cv::Sobel(image, dx, CV_32F, 1, 0);
-    cv::Sobel(image, dy, CV_32F, 0, 1);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_edges(edges);
-        const cv::gpu::GpuMat d_dx(dx);
-        const cv::gpu::GpuMat d_dy(dy);
-        cv::gpu::GpuMat posAndVotes;
-
-        cv::Ptr<cv::gpu::GeneralizedHough_GPU> d_hough = cv::gpu::GeneralizedHough_GPU::create(method);
-        if (method & GHT_ROTATION)
-        {
-            d_hough->set("maxAngle", 90.0);
-            d_hough->set("angleStep", 2.0);
-        }
-
-        d_hough->setTemplate(cv::gpu::GpuMat(templ));
-
-        TEST_CYCLE() d_hough->detect(d_edges, d_dx, d_dy, posAndVotes);
-
-        const cv::gpu::GpuMat positions(1, posAndVotes.cols, CV_32FC4, posAndVotes.data);
-        GPU_SANITY_CHECK(positions);
-    }
-    else
-    {
-        cv::Mat positions;
-
-        cv::Ptr<cv::GeneralizedHough> hough = cv::GeneralizedHough::create(method);
-        if (method & GHT_ROTATION)
-        {
-            hough->set("maxAngle", 90.0);
-            hough->set("angleStep", 2.0);
-        }
-
-        hough->setTemplate(templ);
-
-        TEST_CYCLE() hough->detect(edges, dx, dy, positions);
-
-        CPU_SANITY_CHECK(positions);
-    }
-}
diff --git a/modules/gpu/perf/perf_labeling.cpp b/modules/gpu/perf/perf_labeling.cpp
deleted file mode 100644
index 0484da9d59..0000000000
--- a/modules/gpu/perf/perf_labeling.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace std;
-using namespace testing;
-using namespace perf;
-
-DEF_PARAM_TEST_1(Image, string);
-
-struct GreedyLabeling
-{
-    struct dot
-    {
-        int x;
-        int y;
-
-        static dot make(int i, int j)
-        {
-            dot d; d.x = i; d.y = j;
-            return d;
-        }
-    };
-
-    struct InInterval
-    {
-        InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
-        const int lo, hi;
-
-        bool operator() (const unsigned char a, const unsigned char b) const
-        {
-            int d = a - b;
-            return lo <= d && d <= hi;
-        }
-
-    private:
-        InInterval& operator=(const InInterval&);
-
-
-    };
-
-    GreedyLabeling(cv::Mat img)
-    : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
-
-    ~GreedyLabeling(){delete[] stack;}
-
-    void operator() (cv::Mat labels) const
-    {
-        labels.setTo(cv::Scalar::all(-1));
-        InInterval inInt(0, 2);
-        int cc = -1;
-
-        int* dist_labels = (int*)labels.data;
-        int pitch = static_cast<int>(labels.step1());
-
-        unsigned char* source = (unsigned char*)image.data;
-        int width = image.cols;
-        int height = image.rows;
-
-        for (int j = 0; j < image.rows; ++j)
-            for (int i = 0; i < image.cols; ++i)
-            {
-                if (dist_labels[j * pitch + i] != -1) continue;
-
-                dot* top = stack;
-                dot p = dot::make(i, j);
-                cc++;
-
-                dist_labels[j * pitch + i] = cc;
-
-                while (top >= stack)
-                {
-                    int*  dl = &dist_labels[p.y * pitch + p.x];
-                    unsigned char* sp = &source[p.y * image.step1() + p.x];
-
-                    dl[0] = cc;
-
-                    //right
-                    if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
-                        *top++ = dot::make(p.x + 1, p.y);
-
-                    //left
-                    if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
-                        *top++ = dot::make(p.x - 1, p.y);
-
-                    //bottom
-                    if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
-                        *top++ = dot::make(p.x, p.y + 1);
-
-                    //top
-                    if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-static_cast<int>(image.step1())]))
-                        *top++ = dot::make(p.x, p.y - 1);
-
-                    p = *--top;
-                }
-            }
-    }
-
-    cv::Mat image;
-    cv::Mat _labels;
-    dot* stack;
-};
-
-PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
-            Values<string>("gpu/labeling/aloe-disp.png"))
-{
-    declare.time(1.0);
-
-    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_image(image);
-        cv::gpu::GpuMat mask;
-
-        TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
-
-        GPU_SANITY_CHECK(mask);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
-            Values<string>("gpu/labeling/aloe-disp.png"))
-{
-    declare.time(1.0);
-
-    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_mask;
-        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
-
-        cv::gpu::GpuMat components;
-
-        TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);
-
-        GPU_SANITY_CHECK(components);
-    }
-    else
-    {
-        GreedyLabeling host(image);
-
-        TEST_CYCLE() host(host._labels);
-
-        cv::Mat components = host._labels;
-        CPU_SANITY_CHECK(components);
-    }
-}
diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp
deleted file mode 100644
index ef5be018da..0000000000
--- a/modules/gpu/src/bilateral_filter.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace disp_bilateral_filter
-    {
-        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
-
-        template<typename T>
-        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::disp_bilateral_filter;
-
-namespace
-{
-    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
-    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
-    const float DEFAULT_SIGMA_RANGE = 10.0f;
-
-    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
-    {
-        Mat cpu_table_color(1, len, CV_32F);
-
-        float* line = cpu_table_color.ptr<float>();
-
-        for(int i = 0; i < len; i++)
-            line[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
-
-        table_color.upload(cpu_table_color);
-    }
-
-    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
-    {
-        int half = (win_size >> 1);
-
-        Mat cpu_table_space(half + 1, half + 1, CV_32F);
-
-        for (int y = 0; y <= half; ++y)
-        {
-            float* row = cpu_table_space.ptr<float>(y);
-            for (int x = 0; x <= half; ++x)
-                row[x] = exp(-sqrt(float(y * y) + float(x * x)) / dist_space);
-        }
-
-        table_space.upload(cpu_table_space);
-    }
-
-    template <typename T>
-    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
-                                   GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-    {
-        short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
-        short max_disc = short(ndisp * max_disc_threshold + 0.5);
-
-        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
-
-        if (&dst != &disp)
-        {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
-        }
-
-        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
-    }
-
-    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
-                                                GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
-
-    const bilateral_filter_operator_t operators[] =
-        {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
-      sigma_range(DEFAULT_SIGMA_RANGE)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
-                                                     float max_disc_threshold_, float sigma_range_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
-      sigma_range(sigma_range_)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp
deleted file mode 100644
index 3fd6507810..0000000000
--- a/modules/gpu/src/blend.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace blend
-    {
-        template <typename T>
-        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
-
-        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::blend;
-
-void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                          GpuMat& result, Stream& stream)
-{
-    CV_Assert(img1.size() == img2.size());
-    CV_Assert(img1.type() == img2.type());
-    CV_Assert(weights1.size() == img1.size());
-    CV_Assert(weights2.size() == img2.size());
-    CV_Assert(weights1.type() == CV_32F);
-    CV_Assert(weights2.type() == CV_32F);
-
-    const Size size = img1.size();
-    const int depth = img1.depth();
-    const int cn = img1.channels();
-
-    result.create(size, CV_MAKE_TYPE(depth, cn));
-
-    switch (depth)
-    {
-    case CV_8U:
-        if (cn != 4)
-            blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-        else
-            blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-        break;
-    case CV_32F:
-        blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-        break;
-    default:
-        CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
-    }
-}
-
-#endif
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp
deleted file mode 100644
index dc35823486..0000000000
--- a/modules/gpu/src/color.cpp
+++ /dev/null
@@ -1,1989 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_no_cuda(); }
-void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-#include "cvt_color_internal.h"
-
-namespace cv { namespace gpu {
-    namespace cudev
-    {
-        template <int cn>
-        void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        template <int cn>
-        void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-
-        template <int cn>
-        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    }
-}}
-
-using namespace ::cv::gpu::cudev;
-
-namespace
-{
-    typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-    void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgba_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgba_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr555_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC3);
-
-        cudev::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr565_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC3);
-
-        cudev::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr555_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC3);
-
-        cudev::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr565_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC3);
-
-        cudev::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr555_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC4);
-
-        cudev::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr565_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC4);
-
-        cudev::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr555_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC4);
-
-        cudev::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr565_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC4);
-
-        cudev::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 1);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 1);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void gray_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 1);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void gray_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 1);
-
-        dst.create(src.size(), CV_8UC2);
-
-        cudev::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr555_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC1);
-
-        cudev::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr565_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        CV_Assert(src.depth() == CV_8U);
-        CV_Assert(src.channels() == 2);
-
-        dst.create(src.size(), CV_8UC1);
-
-        cudev::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
-
-        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
-                {rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
-            },
-            {
-                {rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
-                {rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
-                {bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
-            },
-            {
-                {bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
-                {bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void yuv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
-                {yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
-            },
-            {
-                {yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
-                {yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void yuv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
-                {yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
-            },
-            {
-                {yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
-                {yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
-                {rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
-            },
-            {
-                {rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
-                {rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
-                {bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
-            },
-            {
-                {bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
-                {bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void YCrCb_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},
-                {YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}
-            },
-            {
-                {YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},
-                {YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void YCrCb_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},
-                {YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}
-            },
-            {
-                {YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},
-                {YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},
-                {rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}
-            },
-            {
-                {rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},
-                {rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},
-                {bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}
-            },
-            {
-                {bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},
-                {bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void xyz_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},
-                {xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}
-            },
-            {
-                {xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},
-                {xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void xyz_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},
-                {xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}
-            },
-            {
-                {xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},
-                {xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_hsv_8u, 0, 0, 0, 0, rgb_to_hsv_32f},
-                {rgba_to_hsv_8u, 0, 0, 0, 0, rgba_to_hsv_32f},
-            },
-            {
-                {rgb_to_hsv4_8u, 0, 0, 0, 0, rgb_to_hsv4_32f},
-                {rgba_to_hsv4_8u, 0, 0, 0, 0, rgba_to_hsv4_32f},
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_hsv_8u, 0, 0, 0, 0, bgr_to_hsv_32f},
-                {bgra_to_hsv_8u, 0, 0, 0, 0, bgra_to_hsv_32f}
-            },
-            {
-                {bgr_to_hsv4_8u, 0, 0, 0, 0, bgr_to_hsv4_32f},
-                {bgra_to_hsv4_8u, 0, 0, 0, 0, bgra_to_hsv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hsv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hsv_to_rgb_8u, 0, 0, 0, 0, hsv_to_rgb_32f},
-                {hsv4_to_rgb_8u, 0, 0, 0, 0, hsv4_to_rgb_32f}
-            },
-            {
-                {hsv_to_rgba_8u, 0, 0, 0, 0, hsv_to_rgba_32f},
-                {hsv4_to_rgba_8u, 0, 0, 0, 0, hsv4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hsv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hsv_to_bgr_8u, 0, 0, 0, 0, hsv_to_bgr_32f},
-                {hsv4_to_bgr_8u, 0, 0, 0, 0, hsv4_to_bgr_32f}
-            },
-            {
-                {hsv_to_bgra_8u, 0, 0, 0, 0, hsv_to_bgra_32f},
-                {hsv4_to_bgra_8u, 0, 0, 0, 0, hsv4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_hls_8u, 0, 0, 0, 0, rgb_to_hls_32f},
-                {rgba_to_hls_8u, 0, 0, 0, 0, rgba_to_hls_32f},
-            },
-            {
-                {rgb_to_hls4_8u, 0, 0, 0, 0, rgb_to_hls4_32f},
-                {rgba_to_hls4_8u, 0, 0, 0, 0, rgba_to_hls4_32f},
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_hls_8u, 0, 0, 0, 0, bgr_to_hls_32f},
-                {bgra_to_hls_8u, 0, 0, 0, 0, bgra_to_hls_32f}
-            },
-            {
-                {bgr_to_hls4_8u, 0, 0, 0, 0, bgr_to_hls4_32f},
-                {bgra_to_hls4_8u, 0, 0, 0, 0, bgra_to_hls4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hls_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hls_to_rgb_8u, 0, 0, 0, 0, hls_to_rgb_32f},
-                {hls4_to_rgb_8u, 0, 0, 0, 0, hls4_to_rgb_32f}
-            },
-            {
-                {hls_to_rgba_8u, 0, 0, 0, 0, hls_to_rgba_32f},
-                {hls4_to_rgba_8u, 0, 0, 0, 0, hls4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hls_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hls_to_bgr_8u, 0, 0, 0, 0, hls_to_bgr_32f},
-                {hls4_to_bgr_8u, 0, 0, 0, 0, hls4_to_bgr_32f}
-            },
-            {
-                {hls_to_bgra_8u, 0, 0, 0, 0, hls_to_bgra_32f},
-                {hls4_to_bgra_8u, 0, 0, 0, 0, hls4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_hsv_full_8u, 0, 0, 0, 0, rgb_to_hsv_full_32f},
-                {rgba_to_hsv_full_8u, 0, 0, 0, 0, rgba_to_hsv_full_32f},
-            },
-            {
-                {rgb_to_hsv4_full_8u, 0, 0, 0, 0, rgb_to_hsv4_full_32f},
-                {rgba_to_hsv4_full_8u, 0, 0, 0, 0, rgba_to_hsv4_full_32f},
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_hsv_full_8u, 0, 0, 0, 0, bgr_to_hsv_full_32f},
-                {bgra_to_hsv_full_8u, 0, 0, 0, 0, bgra_to_hsv_full_32f}
-            },
-            {
-                {bgr_to_hsv4_full_8u, 0, 0, 0, 0, bgr_to_hsv4_full_32f},
-                {bgra_to_hsv4_full_8u, 0, 0, 0, 0, bgra_to_hsv4_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hsv_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hsv_to_rgb_full_8u, 0, 0, 0, 0, hsv_to_rgb_full_32f},
-                {hsv4_to_rgb_full_8u, 0, 0, 0, 0, hsv4_to_rgb_full_32f}
-            },
-            {
-                {hsv_to_rgba_full_8u, 0, 0, 0, 0, hsv_to_rgba_full_32f},
-                {hsv4_to_rgba_full_8u, 0, 0, 0, 0, hsv4_to_rgba_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hsv_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hsv_to_bgr_full_8u, 0, 0, 0, 0, hsv_to_bgr_full_32f},
-                {hsv4_to_bgr_full_8u, 0, 0, 0, 0, hsv4_to_bgr_full_32f}
-            },
-            {
-                {hsv_to_bgra_full_8u, 0, 0, 0, 0, hsv_to_bgra_full_32f},
-                {hsv4_to_bgra_full_8u, 0, 0, 0, 0, hsv4_to_bgra_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {rgb_to_hls_full_8u, 0, 0, 0, 0, rgb_to_hls_full_32f},
-                {rgba_to_hls_full_8u, 0, 0, 0, 0, rgba_to_hls_full_32f},
-            },
-            {
-                {rgb_to_hls4_full_8u, 0, 0, 0, 0, rgb_to_hls4_full_32f},
-                {rgba_to_hls4_full_8u, 0, 0, 0, 0, rgba_to_hls4_full_32f},
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {bgr_to_hls_full_8u, 0, 0, 0, 0, bgr_to_hls_full_32f},
-                {bgra_to_hls_full_8u, 0, 0, 0, 0, bgra_to_hls_full_32f}
-            },
-            {
-                {bgr_to_hls4_full_8u, 0, 0, 0, 0, bgr_to_hls4_full_32f},
-                {bgra_to_hls4_full_8u, 0, 0, 0, 0, bgra_to_hls4_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hls_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hls_to_rgb_full_8u, 0, 0, 0, 0, hls_to_rgb_full_32f},
-                {hls4_to_rgb_full_8u, 0, 0, 0, 0, hls4_to_rgb_full_32f}
-            },
-            {
-                {hls_to_rgba_full_8u, 0, 0, 0, 0, hls_to_rgba_full_32f},
-                {hls4_to_rgba_full_8u, 0, 0, 0, 0, hls4_to_rgba_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void hls_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][6] =
-        {
-            {
-                {hls_to_bgr_full_8u, 0, 0, 0, 0, hls_to_bgr_full_32f},
-                {hls4_to_bgr_full_8u, 0, 0, 0, 0, hls4_to_bgr_full_32f}
-            },
-            {
-                {hls_to_bgra_full_8u, 0, 0, 0, 0, hls_to_bgra_full_32f},
-                {hls4_to_bgra_full_8u, 0, 0, 0, 0, hls4_to_bgra_full_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {bgr_to_lab_8u, bgr_to_lab_32f},
-                {bgra_to_lab_8u, bgra_to_lab_32f}
-            },
-            {
-                {bgr_to_lab4_8u, bgr_to_lab4_32f},
-                {bgra_to_lab4_8u, bgra_to_lab4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {rgb_to_lab_8u, rgb_to_lab_32f},
-                {rgba_to_lab_8u, rgba_to_lab_32f}
-            },
-            {
-                {rgb_to_lab4_8u, rgb_to_lab4_32f},
-                {rgba_to_lab4_8u, rgba_to_lab4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lbgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lbgr_to_lab_8u, lbgr_to_lab_32f},
-                {lbgra_to_lab_8u, lbgra_to_lab_32f}
-            },
-            {
-                {lbgr_to_lab4_8u, lbgr_to_lab4_32f},
-                {lbgra_to_lab4_8u, lbgra_to_lab4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lrgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lrgb_to_lab_8u, lrgb_to_lab_32f},
-                {lrgba_to_lab_8u, lrgba_to_lab_32f}
-            },
-            {
-                {lrgb_to_lab4_8u, lrgb_to_lab4_32f},
-                {lrgba_to_lab4_8u, lrgba_to_lab4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lab_to_bgr_8u, lab_to_bgr_32f},
-                {lab4_to_bgr_8u, lab4_to_bgr_32f}
-            },
-            {
-                {lab_to_bgra_8u, lab_to_bgra_32f},
-                {lab4_to_bgra_8u, lab4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lab_to_rgb_8u, lab_to_rgb_32f},
-                {lab4_to_rgb_8u, lab4_to_rgb_32f}
-            },
-            {
-                {lab_to_rgba_8u, lab_to_rgba_32f},
-                {lab4_to_rgba_8u, lab4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lab_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lab_to_lbgr_8u, lab_to_lbgr_32f},
-                {lab4_to_lbgr_8u, lab4_to_lbgr_32f}
-            },
-            {
-                {lab_to_lbgra_8u, lab_to_lbgra_32f},
-                {lab4_to_lbgra_8u, lab4_to_lbgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lab_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lab_to_lrgb_8u, lab_to_lrgb_32f},
-                {lab4_to_lrgb_8u, lab4_to_lrgb_32f}
-            },
-            {
-                {lab_to_lrgba_8u, lab_to_lrgba_32f},
-                {lab4_to_lrgba_8u, lab4_to_lrgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {bgr_to_luv_8u, bgr_to_luv_32f},
-                {bgra_to_luv_8u, bgra_to_luv_32f}
-            },
-            {
-                {bgr_to_luv4_8u, bgr_to_luv4_32f},
-                {bgra_to_luv4_8u, bgra_to_luv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {rgb_to_luv_8u, rgb_to_luv_32f},
-                {rgba_to_luv_8u, rgba_to_luv_32f}
-            },
-            {
-                {rgb_to_luv4_8u, rgb_to_luv4_32f},
-                {rgba_to_luv4_8u, rgba_to_luv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lbgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lbgr_to_luv_8u, lbgr_to_luv_32f},
-                {lbgra_to_luv_8u, lbgra_to_luv_32f}
-            },
-            {
-                {lbgr_to_luv4_8u, lbgr_to_luv4_32f},
-                {lbgra_to_luv4_8u, lbgra_to_luv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void lrgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {lrgb_to_luv_8u, lrgb_to_luv_32f},
-                {lrgba_to_luv_8u, lrgba_to_luv_32f}
-            },
-            {
-                {lrgb_to_luv4_8u, lrgb_to_luv4_32f},
-                {lrgba_to_luv4_8u, lrgba_to_luv4_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {luv_to_bgr_8u, luv_to_bgr_32f},
-                {luv4_to_bgr_8u, luv4_to_bgr_32f}
-            },
-            {
-                {luv_to_bgra_8u, luv_to_bgra_32f},
-                {luv4_to_bgra_8u, luv4_to_bgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {luv_to_rgb_8u, luv_to_rgb_32f},
-                {luv4_to_rgb_8u, luv4_to_rgb_32f}
-            },
-            {
-                {luv_to_rgba_8u, luv_to_rgba_32f},
-                {luv4_to_rgba_8u, luv4_to_rgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void luv_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {luv_to_lbgr_8u, luv_to_lbgr_32f},
-                {luv4_to_lbgr_8u, luv4_to_lbgr_32f}
-            },
-            {
-                {luv_to_lbgra_8u, luv_to_lbgra_32f},
-                {luv4_to_lbgra_8u, luv4_to_lbgra_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void luv_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        using namespace cv::gpu::cudev;
-        static const gpu_func_t funcs[2][2][2] =
-        {
-            {
-                {luv_to_lrgb_8u, luv_to_lrgb_32f},
-                {luv4_to_lrgb_8u, luv4_to_lrgb_32f}
-            },
-            {
-                {luv_to_lrgba_8u, luv_to_lrgba_32f},
-                {luv4_to_lrgba_8u, luv4_to_lrgba_32f}
-            }
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
-        CV_Assert(src.channels() == 3 || src.channels() == 4);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
-
-        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
-    }
-
-    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& st)
-    {
-    #if (CUDA_VERSION < 5000)
-        (void)src;
-        (void)dst;
-        (void)st;
-        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
-    #else
-        CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4);
-
-        dst.create(src.size(), src.type());
-
-        cudaStream_t stream = StreamAccessor::getStream(st);
-        NppStreamHandler h(stream);
-
-        NppiSize oSizeROI;
-        oSizeROI.width = src.cols;
-        oSizeROI.height = src.rows;
-
-        if (src.depth() == CV_8U)
-            nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
-        else
-            nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr<Npp16u>(), static_cast<int>(src.step), dst.ptr<Npp16u>(), static_cast<int>(dst.step), oSizeROI) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    #endif
-    }
-
-    void bayer_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, bool blue_last, bool start_with_green, Stream& stream)
-    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        static const func_t funcs[3][4] =
-        {
-            {0,0,Bayer2BGR_8u_gpu<3>, Bayer2BGR_8u_gpu<4>},
-            {0,0,0,0},
-            {0,0,Bayer2BGR_16u_gpu<3>, Bayer2BGR_16u_gpu<4>}
-        };
-
-        if (dcn <= 0) dcn = 3;
-
-        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
-        CV_Assert(src.rows > 2 && src.cols > 2);
-        CV_Assert(dcn == 3 || dcn == 4);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
-
-        funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
-    }
-    void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        bayer_to_bgr(src, dst, dcn, false, false, stream);
-    }
-    void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        bayer_to_bgr(src, dst, dcn, false, true, stream);
-    }
-    void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        bayer_to_bgr(src, dst, dcn, true, false, stream);
-    }
-    void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
-    {
-        bayer_to_bgr(src, dst, dcn, true, true, stream);
-    }
-
-    void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream)
-    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-        static const func_t funcs[3] =
-        {
-            Bayer2BGR_8u_gpu<1>,
-            0,
-            Bayer2BGR_16u_gpu<1>,
-        };
-
-        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
-        CV_Assert(src.rows > 2 && src.cols > 2);
-
-        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
-
-        funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
-    }
-    void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
-    {
-        bayer_to_gray(src, dst, false, false, stream);
-    }
-    void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
-    {
-        bayer_to_gray(src, dst, false, true, stream);
-    }
-    void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
-    {
-        bayer_to_gray(src, dst, true, false, stream);
-    }
-    void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
-    {
-        bayer_to_gray(src, dst, true, true, stream);
-    }
-}
-
-void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
-{
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream);
-    static const func_t funcs[] =
-    {
-        bgr_to_bgra,            // CV_BGR2BGRA    =0
-        bgra_to_bgr,            // CV_BGRA2BGR    =1
-        bgr_to_rgba,            // CV_BGR2RGBA    =2
-        bgra_to_rgb,            // CV_RGBA2BGR    =3
-        bgr_to_rgb,             // CV_BGR2RGB     =4
-        bgra_to_rgba,           // CV_BGRA2RGBA   =5
-
-        bgr_to_gray,            // CV_BGR2GRAY    =6
-        rgb_to_gray,            // CV_RGB2GRAY    =7
-        gray_to_bgr,            // CV_GRAY2BGR    =8
-        gray_to_bgra,           // CV_GRAY2BGRA   =9
-        bgra_to_gray,           // CV_BGRA2GRAY   =10
-        rgba_to_gray,           // CV_RGBA2GRAY   =11
-
-        bgr_to_bgr565,          // CV_BGR2BGR565  =12
-        rgb_to_bgr565,          // CV_RGB2BGR565  =13
-        bgr565_to_bgr,          // CV_BGR5652BGR  =14
-        bgr565_to_rgb,          // CV_BGR5652RGB  =15
-        bgra_to_bgr565,         // CV_BGRA2BGR565 =16
-        rgba_to_bgr565,         // CV_RGBA2BGR565 =17
-        bgr565_to_bgra,         // CV_BGR5652BGRA =18
-        bgr565_to_rgba,         // CV_BGR5652RGBA =19
-
-        gray_to_bgr565,         // CV_GRAY2BGR565 =20
-        bgr565_to_gray,         // CV_BGR5652GRAY =21
-
-        bgr_to_bgr555,          // CV_BGR2BGR555  =22
-        rgb_to_bgr555,          // CV_RGB2BGR555  =23
-        bgr555_to_bgr,          // CV_BGR5552BGR  =24
-        bgr555_to_rgb,          // CV_BGR5552RGB  =25
-        bgra_to_bgr555,         // CV_BGRA2BGR555 =26
-        rgba_to_bgr555,         // CV_RGBA2BGR555 =27
-        bgr555_to_bgra,         // CV_BGR5552BGRA =28
-        bgr555_to_rgba,         // CV_BGR5552RGBA =29
-
-        gray_to_bgr555,         // CV_GRAY2BGR555 =30
-        bgr555_to_gray,         // CV_BGR5552GRAY =31
-
-        bgr_to_xyz,             // CV_BGR2XYZ     =32
-        rgb_to_xyz,             // CV_RGB2XYZ     =33
-        xyz_to_bgr,             // CV_XYZ2BGR     =34
-        xyz_to_rgb,             // CV_XYZ2RGB     =35
-
-        bgr_to_YCrCb,           // CV_BGR2YCrCb   =36
-        rgb_to_YCrCb,           // CV_RGB2YCrCb   =37
-        YCrCb_to_bgr,           // CV_YCrCb2BGR   =38
-        YCrCb_to_rgb,           // CV_YCrCb2RGB   =39
-
-        bgr_to_hsv,             // CV_BGR2HSV     =40
-        rgb_to_hsv,             // CV_RGB2HSV     =41
-
-        0,                      //                =42
-        0,                      //                =43
-
-        bgr_to_lab,             // CV_BGR2Lab     =44
-        rgb_to_lab,             // CV_RGB2Lab     =45
-
-        bayerBG_to_bgr,         // CV_BayerBG2BGR =46
-        bayerGB_to_bgr,         // CV_BayerGB2BGR =47
-        bayerRG_to_bgr,         // CV_BayerRG2BGR =48
-        bayerGR_to_bgr,         // CV_BayerGR2BGR =49
-
-        bgr_to_luv,             // CV_BGR2Luv     =50
-        rgb_to_luv,             // CV_RGB2Luv     =51
-
-        bgr_to_hls,             // CV_BGR2HLS     =52
-        rgb_to_hls,             // CV_RGB2HLS     =53
-
-        hsv_to_bgr,             // CV_HSV2BGR     =54
-        hsv_to_rgb,             // CV_HSV2RGB     =55
-
-        lab_to_bgr,             // CV_Lab2BGR     =56
-        lab_to_rgb,             // CV_Lab2RGB     =57
-        luv_to_bgr,             // CV_Luv2BGR     =58
-        luv_to_rgb,             // CV_Luv2RGB     =59
-
-        hls_to_bgr,             // CV_HLS2BGR     =60
-        hls_to_rgb,             // CV_HLS2RGB     =61
-
-        0,                      // CV_BayerBG2BGR_VNG =62
-        0,                      // CV_BayerGB2BGR_VNG =63
-        0,                      // CV_BayerRG2BGR_VNG =64
-        0,                      // CV_BayerGR2BGR_VNG =65
-
-        bgr_to_hsv_full,        // CV_BGR2HSV_FULL = 66
-        rgb_to_hsv_full,        // CV_RGB2HSV_FULL = 67
-        bgr_to_hls_full,        // CV_BGR2HLS_FULL = 68
-        rgb_to_hls_full,        // CV_RGB2HLS_FULL = 69
-
-        hsv_to_bgr_full,        // CV_HSV2BGR_FULL = 70
-        hsv_to_rgb_full,        // CV_HSV2RGB_FULL = 71
-        hls_to_bgr_full,        // CV_HLS2BGR_FULL = 72
-        hls_to_rgb_full,        // CV_HLS2RGB_FULL = 73
-
-        lbgr_to_lab,            // CV_LBGR2Lab     = 74
-        lrgb_to_lab,            // CV_LRGB2Lab     = 75
-        lbgr_to_luv,            // CV_LBGR2Luv     = 76
-        lrgb_to_luv,            // CV_LRGB2Luv     = 77
-
-        lab_to_lbgr,            // CV_Lab2LBGR     = 78
-        lab_to_lrgb,            // CV_Lab2LRGB     = 79
-        luv_to_lbgr,            // CV_Luv2LBGR     = 80
-        luv_to_lrgb,            // CV_Luv2LRGB     = 81
-
-        bgr_to_yuv,             // CV_BGR2YUV      = 82
-        rgb_to_yuv,             // CV_RGB2YUV      = 83
-        yuv_to_bgr,             // CV_YUV2BGR      = 84
-        yuv_to_rgb,             // CV_YUV2RGB      = 85
-
-        bayerBG_to_gray,        // CV_BayerBG2GRAY = 86
-        bayerGB_to_gray,        // CV_BayerGB2GRAY = 87
-        bayerRG_to_gray,        // CV_BayerRG2GRAY = 88
-        bayerGR_to_gray,        // CV_BayerGR2GRAY = 89
-
-        //YUV 4:2:0 formats family
-        0,                      // CV_YUV2RGB_NV12 = 90,
-        0,                      // CV_YUV2BGR_NV12 = 91,
-        0,                      // CV_YUV2RGB_NV21 = 92,
-        0,                      // CV_YUV2BGR_NV21 = 93,
-
-        0,                      // CV_YUV2RGBA_NV12 = 94,
-        0,                      // CV_YUV2BGRA_NV12 = 95,
-        0,                      // CV_YUV2RGBA_NV21 = 96,
-        0,                      // CV_YUV2BGRA_NV21 = 97,
-
-        0,                      // CV_YUV2RGB_YV12 = 98,
-        0,                      // CV_YUV2BGR_YV12 = 99,
-        0,                      // CV_YUV2RGB_IYUV = 100,
-        0,                      // CV_YUV2BGR_IYUV = 101,
-
-        0,                      // CV_YUV2RGBA_YV12 = 102,
-        0,                      // CV_YUV2BGRA_YV12 = 103,
-        0,                      // CV_YUV2RGBA_IYUV = 104,
-        0,                      // CV_YUV2BGRA_IYUV = 105,
-
-        0,                      // CV_YUV2GRAY_420 = 106,
-
-        //YUV 4:2:2 formats family
-        0,                      // CV_YUV2RGB_UYVY = 107,
-        0,                      // CV_YUV2BGR_UYVY = 108,
-        0,                      // //CV_YUV2RGB_VYUY = 109,
-        0,                      // //CV_YUV2BGR_VYUY = 110,
-
-        0,                      // CV_YUV2RGBA_UYVY = 111,
-        0,                      // CV_YUV2BGRA_UYVY = 112,
-        0,                      // //CV_YUV2RGBA_VYUY = 113,
-        0,                      // //CV_YUV2BGRA_VYUY = 114,
-
-        0,                      // CV_YUV2RGB_YUY2 = 115,
-        0,                      // CV_YUV2BGR_YUY2 = 116,
-        0,                      // CV_YUV2RGB_YVYU = 117,
-        0,                      // CV_YUV2BGR_YVYU = 118,
-
-        0,                      // CV_YUV2RGBA_YUY2 = 119,
-        0,                      // CV_YUV2BGRA_YUY2 = 120,
-        0,                      // CV_YUV2RGBA_YVYU = 121,
-        0,                      // CV_YUV2BGRA_YVYU = 122,
-
-        0,                      // CV_YUV2GRAY_UYVY = 123,
-        0,                      // CV_YUV2GRAY_YUY2 = 124,
-
-        // alpha premultiplication
-        rgba_to_mbgra,          // CV_RGBA2mRGBA = 125,
-        0,                      // CV_mRGBA2RGBA = 126,
-
-        0,                      // CV_COLORCVT_MAX  = 127
-    };
-
-    CV_Assert(code < 128);
-
-    func_t func = funcs[code];
-
-    if (func == 0)
-        CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" );
-
-    func(src, dst, dcn, stream);
-}
-
-void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
-{
-    const int depth = src.depth();
-
-    CV_Assert( src.channels() == 1 );
-
-    switch (code)
-    {
-    case cv::COLOR_BayerBG2GRAY: case cv::COLOR_BayerGB2GRAY: case cv::COLOR_BayerRG2GRAY: case cv::COLOR_BayerGR2GRAY:
-        bayer_to_gray(src, dst, code == cv::COLOR_BayerBG2GRAY || code == cv::COLOR_BayerGB2GRAY, code == cv::COLOR_BayerGB2GRAY || code == cv::COLOR_BayerGR2GRAY, stream);
-        break;
-
-    case cv::COLOR_BayerBG2BGR: case cv::COLOR_BayerGB2BGR: case cv::COLOR_BayerRG2BGR: case cv::COLOR_BayerGR2BGR:
-        bayer_to_bgr(src, dst, dcn, code == cv::COLOR_BayerBG2BGR || code == cv::COLOR_BayerGB2BGR, code == cv::COLOR_BayerGB2BGR || code == cv::COLOR_BayerGR2BGR, stream);
-        break;
-
-    case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT:
-    {
-        if (dcn <= 0)
-            dcn = 3;
-
-        CV_Assert( depth == CV_8U );
-        CV_Assert( dcn == 3 || dcn == 4 );
-
-        dst.create(src.size(), CV_MAKETYPE(depth, dcn));
-        dst.setTo(Scalar::all(0));
-
-        Size wholeSize;
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
-
-        const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
-                                        code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
-
-        if (dcn == 3)
-            cudev::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
-        else
-            cudev::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
-
-        break;
-    }
-
-    case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT:
-    {
-        CV_Assert( depth == CV_8U );
-
-        dst.create(src.size(), CV_MAKETYPE(depth, 1));
-        dst.setTo(Scalar::all(0));
-
-        Size wholeSize;
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
-
-        const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
-                                        code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
-
-        cudev::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
-
-        break;
-    }
-
-    default:
-        CV_Error( cv::Error::StsBadFlag, "Unknown / unsupported color conversion code" );
-    }
-}
-
-void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
-{
-    CV_Assert(image.type() == CV_8UC4);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStreamHandler h(stream);
-
-    NppiSize sz;
-    sz.width  = image.cols;
-    sz.height = image.rows;
-
-    nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr<Npp8u>(), static_cast<int>(image.step), sz, dstOrder) );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-void cv::gpu::gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward, Stream& stream)
-{
-#if (CUDA_VERSION < 5000)
-    (void)src;
-    (void)dst;
-    (void)forward;
-    (void)stream;
-    CV_Error( cv::Error::StsNotImplemented, "This function works only with CUDA 5.0 or higher" );
-#else
-    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI);
-    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
-
-    static const func_t funcs[2][5] =
-    {
-        {0, 0, 0, nppiGammaInv_8u_C3R, nppiGammaInv_8u_AC4R},
-        {0, 0, 0, nppiGammaFwd_8u_C3R, nppiGammaFwd_8u_AC4R}
-    };
-    static const func_inplace_t funcs_inplace[2][5] =
-    {
-        {0, 0, 0, nppiGammaInv_8u_C3IR, nppiGammaInv_8u_AC4IR},
-        {0, 0, 0, nppiGammaFwd_8u_C3IR, nppiGammaFwd_8u_AC4IR}
-    };
-
-    CV_Assert(src.type() == CV_8UC3 || src.type() == CV_8UC4);
-
-    dst.create(src.size(), src.type());
-
-    NppStreamHandler h(StreamAccessor::getStream(stream));
-
-    NppiSize oSizeROI;
-    oSizeROI.width = src.cols;
-    oSizeROI.height = src.rows;
-
-    if (dst.data == src.data)
-        funcs_inplace[forward][src.channels()](dst.ptr<Npp8u>(), static_cast<int>(src.step), oSizeROI);
-    else
-        funcs[forward][src.channels()](src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI);
-
-#endif
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu
deleted file mode 100644
index 4449274548..0000000000
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ /dev/null
@@ -1,199 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-
-//////////////////////////////////////////////////////////////////////////////////
-/// Bilateral filtering
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        __device__ __forceinline__ float norm_l1(const float& a)  { return ::fabs(a); }
-        __device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
-        __device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
-        __device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
-
-        __device__ __forceinline__ float sqr(const float& a)  { return a * a; }
-
-        template<typename T, typename B>
-        __global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
-
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x >= src.cols || y >= src.rows)
-                return;
-
-            value_type center = saturate_cast<value_type>(src(y, x));
-
-            value_type sum1 = VecTraits<value_type>::all(0);
-            float sum2 = 0;
-
-            int r = ksz / 2;
-            float r2 = (float)(r * r);
-
-            int tx = x - r + ksz;
-            int ty = y - r + ksz;
-
-            if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
-            {
-                for (int cy = y - r; cy < ty; ++cy)
-                    for (int cx = x - r; cx < tx; ++cx)
-                    {
-                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
-                        if (space2 > r2)
-                            continue;
-
-                        value_type value = saturate_cast<value_type>(src(cy, cx));
-
-                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
-                        sum1 = sum1 + weight * value;
-                        sum2 = sum2 + weight;
-                    }
-            }
-            else
-            {
-                for (int cy = y - r; cy < ty; ++cy)
-                    for (int cx = x - r; cx < tx; ++cx)
-                    {
-                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
-                        if (space2 > r2)
-                            continue;
-
-                        value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
-
-                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
-
-                        sum1 = sum1 + weight * value;
-                        sum2 = sum2 + weight;
-                    }
-            }
-            dst(y, x) = saturate_cast<T>(sum1 / sum2);
-        }
-
-        template<typename T, template <typename> class B>
-        void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
-        {
-            dim3 block (32, 8);
-            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
-
-            B<T> b(src.rows, src.cols);
-
-            float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
-             float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
-
-            cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
-            bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
-            cudaSafeCall ( cudaGetLastError () );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template<typename T>
-        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
-
-            static caller_t funcs[] =
-            {
-                bilateral_caller<T, BrdReflect101>,
-                bilateral_caller<T, BrdReplicate>,
-                bilateral_caller<T, BrdConstant>,
-                bilateral_caller<T, BrdReflect>,
-                bilateral_caller<T, BrdWrap>,
-            };
-            funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
-        }
-    }
-}}}
-
-
-#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
-    template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
-
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
-//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
-
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(short)
-//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
-OCV_INSTANTIATE_BILATERAL_FILTER(short3)
-OCV_INSTANTIATE_BILATERAL_FILTER(short4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
-//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
-
-//OCV_INSTANTIATE_BILATERAL_FILTER(int)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(float)
-//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
-OCV_INSTANTIATE_BILATERAL_FILTER(float3)
-OCV_INSTANTIATE_BILATERAL_FILTER(float4)
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu
deleted file mode 100644
index be8c0b2f35..0000000000
--- a/modules/gpu/src/cuda/blend.cu
+++ /dev/null
@@ -1,121 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace blend
-    {
-        template <typename T>
-        __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
-                                          const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < rows && x < cols)
-            {
-                int x_ = x / cn;
-                float w1 = weights1.ptr(y)[x_];
-                float w2 = weights2.ptr(y)[x_];
-                T p1 = img1.ptr(y)[x];
-                T p2 = img2.ptr(y)[x];
-                result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
-            }
-        }
-
-        template <typename T>
-        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
-        {
-            dim3 threads(16, 16);
-            dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
-
-            blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-        template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
-        template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
-
-
-        __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
-                                              const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < rows && x < cols)
-            {
-                float w1 = weights1.ptr(y)[x];
-                float w2 = weights2.ptr(y)[x];
-                float sum_inv = 1.f / (w1 + w2 + 1e-5f);
-                w1 *= sum_inv;
-                w2 *= sum_inv;
-                uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
-                uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
-                ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
-                                                          p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
-            }
-        }
-
-        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
-        {
-            dim3 threads(16, 16);
-            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-    } // namespace blend
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
deleted file mode 100644
index 042e9afcc6..0000000000
--- a/modules/gpu/src/cuda/canny.cu
+++ /dev/null
@@ -1,494 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <utility>
-#include <algorithm>//std::swap
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace canny
-{
-    struct L1 : binary_function<int, int, float>
-    {
-        __device__ __forceinline__ float operator ()(int x, int y) const
-        {
-            return ::abs(x) + ::abs(y);
-        }
-
-        __device__ __forceinline__ L1() {}
-        __device__ __forceinline__ L1(const L1&) {}
-    };
-    struct L2 : binary_function<int, int, float>
-    {
-        __device__ __forceinline__ float operator ()(int x, int y) const
-        {
-            return ::sqrtf(x * x + y * y);
-        }
-
-        __device__ __forceinline__ L2() {}
-        __device__ __forceinline__ L2(const L2&) {}
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
-    {
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
-    {
-        enum { smart_shift = 4 };
-    };
-}}}
-
-namespace canny
-{
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
-    struct SrcTex
-    {
-        const int xoff;
-        const int yoff;
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
-
-        __device__ __forceinline__ int operator ()(int y, int x) const
-        {
-            return tex2D(tex_src, x + xoff, y + yoff);
-        }
-    };
-
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (y >= mag.rows || x >= mag.cols)
-            return;
-
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
-
-        dx(y, x) = dxVal;
-        dy(y, x) = dyVal;
-
-        mag(y, x) = norm(dxVal, dyVal);
-    }
-
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
-    {
-        const dim3 block(16, 16);
-        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
-
-        bindTexture(&tex_src, srcWhole);
-        SrcTex src(xoff, yoff);
-
-        if (L2Grad)
-        {
-            L2 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
-        }
-        else
-        {
-            L1 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
-        }
-
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall(cudaThreadSynchronize());
-    }
-
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
-    {
-        if (L2Grad)
-        {
-            L2 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
-        }
-        else
-        {
-            L1 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
-    {
-        const int CANNY_SHIFT = 15;
-        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
-            return;
-
-        int dxVal = dx(y, x);
-        int dyVal = dy(y, x);
-
-        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D(tex_mag, x, y);
-
-        dxVal = ::abs(dxVal);
-        dyVal = ::abs(dyVal);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-
-        if (m > low_thresh)
-        {
-            const int tg22x = dxVal * TG22;
-            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-
-            dyVal <<= CANNY_SHIFT;
-
-            if (dyVal < tg22x)
-            {
-                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else if(dyVal > tg67x)
-            {
-                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else
-            {
-                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-        }
-
-        map(y, x) = edge_type;
-    }
-
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
-    {
-        const dim3 block(16, 16);
-        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
-
-        bindTexture(&tex_mag, mag);
-
-        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    __device__ int counter = 0;
-
-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
-    {
-        __shared__ volatile int smem[18][18];
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
-        if (threadIdx.y == 0)
-            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
-        if (threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
-        if (threadIdx.x == 0)
-            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1)
-            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
-            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
-        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
-
-        __syncthreads();
-
-        if (x >= map.cols || y >= map.rows)
-            return;
-
-        int n;
-
-        #pragma unroll
-        for (int k = 0; k < 16; ++k)
-        {
-            n = 0;
-
-            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
-            {
-                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
-
-                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
-
-                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
-            }
-
-            if (n > 0)
-                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
-        }
-
-        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-        map(y, x) = e;
-
-        n = 0;
-
-        if (e == 2)
-        {
-            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
-
-            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
-
-            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
-        }
-
-        if (n > 0)
-        {
-            const int ind =  ::atomicAdd(&counter, 1);
-            st[ind] = make_ushort2(x, y);
-        }
-    }
-
-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
-    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
-
-        const dim3 block(16, 16);
-        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
-
-        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
-    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
-
-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
-    {
-        const int stack_size = 512;
-
-        __shared__ int s_counter;
-        __shared__ int s_ind;
-        __shared__ ushort2 s_st[stack_size];
-
-        if (threadIdx.x == 0)
-            s_counter = 0;
-
-        __syncthreads();
-
-        int ind = blockIdx.y * gridDim.x + blockIdx.x;
-
-        if (ind >= count)
-            return;
-
-        ushort2 pos = st1[ind];
-
-        if (threadIdx.x < 8)
-        {
-            pos.x += c_dx[threadIdx.x];
-            pos.y += c_dy[threadIdx.x];
-
-            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
-            {
-                map(pos.y, pos.x) = 2;
-
-                ind = Emulation::smem::atomicAdd(&s_counter, 1);
-
-                s_st[ind] = pos;
-            }
-        }
-
-        __syncthreads();
-
-        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
-        {
-            const int subTaskIdx = threadIdx.x >> 3;
-            const int portion = ::min(s_counter, blockDim.x >> 3);
-
-            if (subTaskIdx < portion)
-                pos = s_st[s_counter - 1 - subTaskIdx];
-
-            __syncthreads();
-
-            if (threadIdx.x == 0)
-                s_counter -= portion;
-
-            __syncthreads();
-
-            if (subTaskIdx < portion)
-            {
-                pos.x += c_dx[threadIdx.x & 7];
-                pos.y += c_dy[threadIdx.x & 7];
-
-                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
-                {
-                    map(pos.y, pos.x) = 2;
-
-                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
-
-                    s_st[ind] = pos;
-                }
-            }
-
-            __syncthreads();
-        }
-
-        if (s_counter > 0)
-        {
-            if (threadIdx.x == 0)
-            {
-                ind = ::atomicAdd(&counter, s_counter);
-                s_ind = ind - s_counter;
-            }
-
-            __syncthreads();
-
-            ind = s_ind;
-
-            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
-                st2[ind + i] = s_st[i];
-        }
-    }
-
-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
-    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
-
-        int count;
-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-        while (count > 0)
-        {
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
-
-            const dim3 block(128);
-            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
-
-            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            std::swap(st1, st2);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    struct GetEdges : unary_function<int, uchar>
-    {
-        __device__ __forceinline__ uchar operator ()(int e) const
-        {
-            return (uchar)(-(e >> 1));
-        }
-
-        __device__ __forceinline__ GetEdges() {}
-        __device__ __forceinline__ GetEdges(const GetEdges&) {}
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
-    {
-        enum { smart_shift = 4 };
-    };
-}}}
-
-namespace canny
-{
-    void getEdges(PtrStepSzi map, PtrStepSzb dst)
-    {
-        transform(map, dst, GetEdges(), WithOutMask(), 0);
-    }
-}
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu
deleted file mode 100644
index 9552f1b06f..0000000000
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ /dev/null
@@ -1,534 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <opencv2/core/cuda/common.hpp>
-#include <opencv2/core/cuda/vec_traits.hpp>
-#include <opencv2/core/cuda/vec_math.hpp>
-#include <opencv2/core/cuda/emulation.hpp>
-
-#include <iostream>
-#include <stdio.h>
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace ccl
-    {
-        enum
-        {
-            WARP_SIZE  = 32,
-            WARP_LOG   = 5,
-
-            CTA_SIZE_X = 32,
-            CTA_SIZE_Y = 8,
-
-            STA_SIZE_MERGE_Y = 4,
-            STA_SIZE_MERGE_X = 32,
-
-            TPB_X = 1,
-            TPB_Y = 4,
-
-            TILE_COLS = CTA_SIZE_X * TPB_X,
-            TILE_ROWS = CTA_SIZE_Y * TPB_Y
-        };
-
-        template<typename T> struct IntervalsTraits
-        {
-            typedef T elem_type;
-        };
-
-        template<> struct IntervalsTraits<unsigned char>
-        {
-            typedef int dist_type;
-            enum {ch = 1};
-        };
-
-        template<> struct IntervalsTraits<uchar3>
-        {
-            typedef int3 dist_type;
-            enum {ch = 3};
-        };
-
-        template<> struct IntervalsTraits<uchar4>
-        {
-            typedef int4 dist_type;
-            enum {ch = 4};
-        };
-
-        template<> struct IntervalsTraits<unsigned short>
-        {
-            typedef int dist_type;
-            enum {ch = 1};
-        };
-
-        template<> struct IntervalsTraits<ushort3>
-        {
-            typedef int3 dist_type;
-            enum {ch = 3};
-        };
-
-        template<> struct IntervalsTraits<ushort4>
-        {
-            typedef int4 dist_type;
-            enum {ch = 4};
-        };
-
-        template<> struct IntervalsTraits<float>
-        {
-            typedef float dist_type;
-            enum {ch = 1};
-        };
-
-        template<> struct IntervalsTraits<int>
-        {
-            typedef int dist_type;
-            enum {ch = 1};
-        };
-
-        typedef unsigned char component;
-        enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
-
-        template<typename T, int CH> struct InInterval {};
-
-        template<typename T> struct InInterval<T, 1>
-        {
-            typedef typename VecTraits<T>::elem_type E;
-            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
-            T lo, hi;
-
-            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
-            {
-                I d = a - b;
-                return lo <= d && d <= hi;
-            }
-        };
-
-
-        template<typename T> struct InInterval<T, 3>
-        {
-            typedef typename VecTraits<T>::elem_type E;
-            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
-            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
-            T lo, hi;
-
-            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
-            {
-                I d = a - b;
-                return lo.x <= d.x && d.x <= hi.x &&
-                       lo.y <= d.y && d.y <= hi.y &&
-                       lo.z <= d.z && d.z <= hi.z;
-            }
-        };
-
-        template<typename T> struct InInterval<T, 4>
-        {
-            typedef typename VecTraits<T>::elem_type E;
-            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
-            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
-            T lo, hi;
-
-            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
-            {
-                I d = a - b;
-                return lo.x <= d.x && d.x <= hi.x &&
-                       lo.y <= d.y && d.y <= hi.y &&
-                       lo.z <= d.z && d.z <= hi.z &&
-                       lo.w <= d.w && d.w <= hi.w;
-            }
-        };
-
-
-        template<typename T, typename F>
-        __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x >= image.cols || y >= image.rows) return;
-
-            T intensity = image(y, x);
-            component c = 0;
-
-            if ( x > 0 && connected(intensity, image(y, x - 1)))
-                c |= LEFT;
-
-            if ( y > 0 && connected(intensity, image(y - 1, x)))
-                c |= UP;
-
-            if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
-                c |= RIGHT;
-
-            if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
-                c |= DOWN;
-
-            components(y, x) = c;
-        }
-
-        template< typename T>
-        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
-        {
-            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
-            dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
-
-            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
-
-            Int_t inInt(lo, hi);
-            computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
-
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-        template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-
-        __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
-        {
-            int x = threadIdx.x + blockIdx.x * TILE_COLS;
-            int y = threadIdx.y + blockIdx.y * TILE_ROWS;
-
-            if (x >= edges.cols || y >= edges.rows) return;
-
-            //currently x is 1
-            int bounds = ((y + TPB_Y) < edges.rows);
-
-            __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
-            __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
-
-            int new_labels[TPB_Y][TPB_X];
-            int old_labels[TPB_Y][TPB_X];
-
-            #pragma unroll
-            for (int i = 0; i < TPB_Y; ++i)
-                #pragma unroll
-                for (int j = 0; j < TPB_X; ++j)
-                {
-                    int yloc = threadIdx.y + CTA_SIZE_Y * i;
-                    int xloc = threadIdx.x + CTA_SIZE_X * j;
-                    component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
-
-                    if (!xloc) c &= ~LEFT;
-                    if (!yloc) c &= ~UP;
-
-                    if (xloc == TILE_COLS -1) c &= ~RIGHT;
-                    if (yloc == TILE_ROWS -1) c &= ~DOWN;
-
-                    new_labels[i][j] = yloc * TILE_COLS + xloc;
-                    edgesTile[yloc][xloc] = c;
-                }
-
-            for (int k = 0; ;++k)
-            {
-                //1. backup
-                #pragma unroll
-                for (int i = 0; i < TPB_Y; ++i)
-                    #pragma unroll
-                    for (int j = 0; j < TPB_X; ++j)
-                    {
-                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
-                        int xloc = threadIdx.x + CTA_SIZE_X * j;
-
-                        old_labels[i][j]       = new_labels[i][j];
-                        labelsTile[yloc][xloc] = new_labels[i][j];
-                    }
-
-                __syncthreads();
-
-                //2. compare local arrays
-                #pragma unroll
-                for (int i = 0; i < TPB_Y; ++i)
-                    #pragma unroll
-                    for (int j = 0; j < TPB_X; ++j)
-                    {
-                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
-                        int xloc = threadIdx.x + CTA_SIZE_X * j;
-
-                        component c = edgesTile[yloc][xloc];
-                        int label = new_labels[i][j];
-
-                        if (c & UP)
-                           label = ::min(label, labelsTile[yloc - 1][xloc]);
-
-                        if (c &  DOWN)
-                           label = ::min(label, labelsTile[yloc + 1][xloc]);
-
-                        if (c & LEFT)
-                           label = ::min(label, labelsTile[yloc][xloc - 1]);
-
-                        if (c & RIGHT)
-                           label = ::min(label, labelsTile[yloc][xloc + 1]);
-
-                       new_labels[i][j] = label;
-                    }
-
-                __syncthreads();
-
-                //3. determine: Is any value changed?
-                int changed = 0;
-                #pragma unroll
-                for (int i = 0; i < TPB_Y; ++i)
-                    #pragma unroll
-                    for (int j = 0; j < TPB_X; ++j)
-                    {
-                        if (new_labels[i][j] < old_labels[i][j])
-                        {
-                            changed = 1;
-                            Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
-                        }
-                    }
-
-                changed = Emulation::syncthreadsOr(changed);
-
-                if (!changed)
-                    break;
-
-                //4. Compact paths
-                const int *labels = &labelsTile[0][0];
-                #pragma unroll
-                for (int i = 0; i < TPB_Y; ++i)
-                    #pragma unroll
-                    for (int j = 0; j < TPB_X; ++j)
-                    {
-                        int label = new_labels[i][j];
-
-                        while( labels[label] < label ) label = labels[label];
-
-                        new_labels[i][j] = label;
-                    }
-                __syncthreads();
-            }
-
-            #pragma unroll
-            for (int i = 0; i < TPB_Y; ++i)
-            #pragma unroll
-                for (int j = 0; j < TPB_X; ++j)
-                {
-                    int label = new_labels[i][j];
-                    int yloc = label / TILE_COLS;
-                    int xloc = label - yloc * TILE_COLS;
-
-                    xloc += blockIdx.x * TILE_COLS;
-                    yloc += blockIdx.y * TILE_ROWS;
-
-                    label = yloc * edges.cols + xloc;
-                    // do it for x too.
-                    if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
-                }
-        }
-
-        __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
-        {
-            while(1)
-            {
-                int y = label / comps.cols;
-                int x = label - y * comps.cols;
-
-                int parent = comps(y, x);
-
-                if (label == parent) break;
-
-                label = parent;
-            }
-            return label;
-        }
-
-        __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
-        {
-            int r1 = root(comps, l1);
-            int r2 = root(comps, l2);
-
-            if (r1 == r2) return;
-
-            int mi = ::min(r1, r2);
-            int ma = ::max(r1, r2);
-
-            int y = ma / comps.cols;
-            int x = ma - y * comps.cols;
-
-            atomicMin(&comps.ptr(y)[x], mi);
-            changed = true;
-        }
-
-        __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
-            const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
-        {
-            int tid = threadIdx.y * blockDim.x + threadIdx.x;
-            int stride = blockDim.y * blockDim.x;
-
-            int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
-            int yend   = ybegin + tilesNumY * tileSizeY;
-
-            if (blockIdx.y == gridDim.y - 1)
-            {
-                yend -= yIncomplete * tileSizeY;
-                yend -= tileSizeY;
-                tileSizeY = (edges.rows % tileSizeY);
-
-                yend += tileSizeY;
-            }
-
-            int xbegin = blockIdx.x * tilesNumX * tileSizeX;
-            int xend   = xbegin + tilesNumX * tileSizeX;
-
-            if (blockIdx.x == gridDim.x - 1)
-            {
-                if (xIncomplete) yend = ybegin;
-                xend -= xIncomplete * tileSizeX;
-                xend -= tileSizeX;
-                tileSizeX = (edges.cols % tileSizeX);
-
-                xend += tileSizeX;
-            }
-
-            if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
-            {
-                xend = xbegin;
-            }
-
-            int tasksV = (tilesNumX - 1) * (yend - ybegin);
-            int tasksH = (tilesNumY - 1) * (xend - xbegin);
-
-            int total = tasksH + tasksV;
-
-            bool changed;
-            do
-            {
-                changed = false;
-                for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
-                {
-                    if (taskIdx < tasksH)
-                    {
-                        int indexH = taskIdx;
-
-                        int row = indexH / (xend - xbegin);
-                        int col = indexH - row * (xend - xbegin);
-
-                        int y = ybegin + (row + 1) * tileSizeY;
-                        int x = xbegin + col;
-
-                        component e = edges( x, y);
-                        if (e & UP)
-                        {
-                            int lc = comps(y,x);
-                            int lu = comps(y - 1, x);
-
-                            isConnected(comps, lc, lu, changed);
-                        }
-                    }
-                    else
-                    {
-                        int indexV = taskIdx - tasksH;
-
-                        int col = indexV / (yend - ybegin);
-                        int row = indexV - col * (yend - ybegin);
-
-                        int x = xbegin + (col + 1) * tileSizeX;
-                        int y = ybegin + row;
-
-                        component e = edges(x, y);
-                        if (e & LEFT)
-                        {
-                            int lc = comps(y, x);
-                            int ll = comps(y, x - 1);
-
-                            isConnected(comps, lc, ll, changed);
-                        }
-                    }
-                }
-            } while (Emulation::syncthreadsOr(changed));
-        }
-
-        __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if( x < comps.cols && y < comps.rows)
-                comps(y, x) = root(comps, comps(y, x));
-        }
-
-        enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
-
-        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
-        {
-            (void) flags;
-            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
-            dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
-
-            lableTiles<<<grid, block, 0, stream>>>(edges, comps);
-            cudaSafeCall( cudaGetLastError() );
-
-            int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
-            while (grid.x > 1 || grid.y > 1)
-            {
-                dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
-                dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
-                // debug log
-                // std::cout << "merging: " << grid.y  << " x " << grid.x << " ---> " << mergeGrid.y <<  " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
-                crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
-                tileSizeX <<= 1;
-                tileSizeY <<= 1;
-                grid = mergeGrid;
-
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            grid.x = divUp(edges.cols, block.x);
-            grid.y = divUp(edges.rows, block.y);
-            flatten<<<grid, block, 0, stream>>>(edges, comps);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-} } }
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/clahe.cu b/modules/gpu/src/cuda/clahe.cu
deleted file mode 100644
index 7c6645749b..0000000000
--- a/modules/gpu/src/cuda/clahe.cu
+++ /dev/null
@@ -1,186 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace clahe
-{
-    __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
-                                  const int2 tileSize, const int tilesX,
-                                  const int clipLimit, const float lutScale)
-    {
-        __shared__ int smem[512];
-
-        const int tx = blockIdx.x;
-        const int ty = blockIdx.y;
-        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        smem[tid] = 0;
-        __syncthreads();
-
-        for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
-        {
-            const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
-            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
-            {
-                const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
-            }
-        }
-
-        __syncthreads();
-
-        int tHistVal = smem[tid];
-
-        __syncthreads();
-
-        if (clipLimit > 0)
-        {
-            // clip histogram bar
-
-            int clipped = 0;
-            if (tHistVal > clipLimit)
-            {
-                clipped = tHistVal - clipLimit;
-                tHistVal = clipLimit;
-            }
-
-            // find number of overall clipped samples
-
-            reduce<256>(smem, clipped, tid, plus<int>());
-
-            // broadcast evaluated value
-
-            __shared__ int totalClipped;
-
-            if (tid == 0)
-                totalClipped = clipped;
-            __syncthreads();
-
-            // redistribute clipped samples evenly
-
-            int redistBatch = totalClipped / 256;
-            tHistVal += redistBatch;
-
-            int residual = totalClipped - redistBatch * 256;
-            if (tid < residual)
-                ++tHistVal;
-        }
-
-        const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
-
-        lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
-    }
-
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(tilesX, tilesY);
-
-        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
-
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    __global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= src.cols || y >= src.rows)
-            return;
-
-        const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
-        int ty1 = __float2int_rd(tyf);
-        int ty2 = ty1 + 1;
-        const float ya = tyf - ty1;
-        ty1 = ::max(ty1, 0);
-        ty2 = ::min(ty2, tilesY - 1);
-
-        const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
-        int tx1 = __float2int_rd(txf);
-        int tx2 = tx1 + 1;
-        const float xa = txf - tx1;
-        tx1 = ::max(tx1, 0);
-        tx2 = ::min(tx2, tilesX - 1);
-
-        const int srcVal = src(y, x);
-
-        float res = 0;
-
-        res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
-        res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
-        res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
-        res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
-
-        dst(y, x) = saturate_cast<uchar>(res);
-    }
-
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-        cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
-
-        tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-#endif // CUDA_DISABLER
diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu
deleted file mode 100644
index 1a5d4865ed..0000000000
--- a/modules/gpu/src/cuda/color.cu
+++ /dev/null
@@ -1,461 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/color.hpp"
-#include "cvt_color_internal.h"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
-    {
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
-    {
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
-    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
-    { \
-        traits::functor_type functor = traits::create_functor(); \
-        typedef typename traits::functor_type::argument_type src_t; \
-        typedef typename traits::functor_type::result_type   dst_t; \
-        cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
-    }
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
-
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/debayer.cu b/modules/gpu/src/cuda/debayer.cu
deleted file mode 100644
index 46a1c14ef4..0000000000
--- a/modules/gpu/src/cuda/debayer.cu
+++ /dev/null
@@ -1,544 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/color.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T> struct Bayer2BGR;
-
-    template <> struct Bayer2BGR<uchar>
-    {
-        uchar3 res0;
-        uchar3 res1;
-        uchar3 res2;
-        uchar3 res3;
-
-        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
-        {
-            uchar4 patch[3][3];
-            patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
-            patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
-            patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
-
-            patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
-            patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
-            patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
-
-            patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
-            patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
-            patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
-
-            if ((s_y & 1) ^ start_with_green)
-            {
-                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
-                const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
-
-                const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
-                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
-
-                const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
-                const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
-
-                const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
-                const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
-
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = t1;
-                    res0.y = patch[1][1].x;
-                    res0.z = t0;
-
-                    res1.x = patch[1][1].y;
-                    res1.y = t3;
-                    res1.z = t2;
-
-                    res2.x = t5;
-                    res2.y = patch[1][1].z;
-                    res2.z = t4;
-
-                    res3.x = patch[1][1].w;
-                    res3.y = t7;
-                    res3.z = t6;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = patch[1][1].x;
-                    res0.z = t1;
-
-                    res1.x = t2;
-                    res1.y = t3;
-                    res1.z = patch[1][1].y;
-
-                    res2.x = t4;
-                    res2.y = patch[1][1].z;
-                    res2.z = t5;
-
-                    res3.x = t6;
-                    res3.y = t7;
-                    res3.z = patch[1][1].w;
-                }
-            }
-            else
-            {
-                const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
-                const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
-
-                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
-                const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
-
-                const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
-                const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
-
-                const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
-                const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
-
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = patch[1][1].x;
-                    res0.y = t1;
-                    res0.z = t0;
-
-                    res1.x = t3;
-                    res1.y = patch[1][1].y;
-                    res1.z = t2;
-
-                    res2.x = patch[1][1].z;
-                    res2.y = t5;
-                    res2.z = t4;
-
-                    res3.x = t7;
-                    res3.y = patch[1][1].w;
-                    res3.z = t6;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = t1;
-                    res0.z = patch[1][1].x;
-
-                    res1.x = t2;
-                    res1.y = patch[1][1].y;
-                    res1.z = t3;
-
-                    res2.x = t4;
-                    res2.y = t5;
-                    res2.z = patch[1][1].z;
-
-                    res3.x = t6;
-                    res3.y = patch[1][1].w;
-                    res3.z = t7;
-                }
-            }
-        }
-    };
-
-    template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
-    template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
-    {
-        typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
-        return f(pix);
-    }
-    template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
-    {
-        return pix;
-    }
-    template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
-    {
-        return make_uchar4(pix.x, pix.y, pix.z, 255);
-    }
-
-    template <typename D>
-    __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
-    {
-        const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
-        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (s_y >= src.rows || (s_x << 2) >= src.cols)
-            return;
-
-        s_y = ::min(::max(s_y, 1), src.rows - 2);
-
-        Bayer2BGR<uchar> bayer;
-        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
-
-        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
-        const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        dst(d_y, d_x) = toDst<D>(bayer.res0);
-        if (d_x + 1 < src.cols)
-            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
-        if (d_x + 2 < src.cols)
-            dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
-        if (d_x + 3 < src.cols)
-            dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
-    }
-
-    template <> struct Bayer2BGR<ushort>
-    {
-        ushort3 res0;
-        ushort3 res1;
-
-        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
-        {
-            ushort2 patch[3][3];
-            patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
-            patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
-            patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
-
-            patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
-            patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
-            patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
-
-            patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
-            patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
-            patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
-
-            if ((s_y & 1) ^ start_with_green)
-            {
-                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
-                const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
-
-                const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
-                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
-
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = t1;
-                    res0.y = patch[1][1].x;
-                    res0.z = t0;
-
-                    res1.x = patch[1][1].y;
-                    res1.y = t3;
-                    res1.z = t2;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = patch[1][1].x;
-                    res0.z = t1;
-
-                    res1.x = t2;
-                    res1.y = t3;
-                    res1.z = patch[1][1].y;
-                }
-            }
-            else
-            {
-                const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
-                const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
-
-                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
-                const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
-
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = patch[1][1].x;
-                    res0.y = t1;
-                    res0.z = t0;
-
-                    res1.x = t3;
-                    res1.y = patch[1][1].y;
-                    res1.z = t2;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = t1;
-                    res0.z = patch[1][1].x;
-
-                    res1.x = t2;
-                    res1.y = patch[1][1].y;
-                    res1.z = t3;
-                }
-            }
-        }
-    };
-
-    template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
-    template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
-    {
-        typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
-        return f(pix);
-    }
-    template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
-    {
-        return pix;
-    }
-    template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
-    {
-        return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
-    }
-
-    template <typename D>
-    __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
-    {
-        const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
-        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (s_y >= src.rows || (s_x << 1) >= src.cols)
-            return;
-
-        s_y = ::min(::max(s_y, 1), src.rows - 2);
-
-        Bayer2BGR<ushort> bayer;
-        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
-
-        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
-        const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        dst(d_y, d_x) = toDst<D>(bayer.res0);
-        if (d_x + 1 < src.cols)
-            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
-    }
-
-    template <int cn>
-    void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
-    {
-        typedef typename TypeVec<uchar, cn>::vec_type dst_t;
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
-
-        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
-
-        Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <int cn>
-    void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
-    {
-        typedef typename TypeVec<ushort, cn>::vec_type dst_t;
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
-
-        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
-
-        Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-
-    template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////
-    // Bayer Demosaicing (Malvar, He, and Cutler)
-    //
-    // by Morgan McGuire, Williams College
-    // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
-    //
-    // ported to CUDA
-
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    template <typename DstType>
-    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
-    {
-        const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
-        const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f  ;
-        const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/;
-        const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/;
-        const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/   kEw =  0.5f / 8.0f  ;
-        const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
-            return;
-
-        int2 center;
-        center.x = x + sourceOffset.x;
-        center.y = y + sourceOffset.y;
-
-        int4 xCoord;
-        xCoord.x = center.x - 2;
-        xCoord.y = center.x - 1;
-        xCoord.z = center.x + 1;
-        xCoord.w = center.x + 2;
-
-        int4 yCoord;
-        yCoord.x = center.y - 2;
-        yCoord.y = center.y - 1;
-        yCoord.z = center.y + 1;
-        yCoord.w = center.y + 2;
-
-        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
-
-        float4 Dvec;
-        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
-        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
-        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
-        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
-
-        float4 value;
-        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
-        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
-        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
-        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
-
-        // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
-        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
-        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
-        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
-        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
-
-        float4 PATTERN;
-        PATTERN.x = kCx * C;
-        PATTERN.y = kCy * C;
-        PATTERN.z = kCz * C;
-        PATTERN.w = PATTERN.z;
-
-        float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
-
-        // There are five filter patterns (identity, cross, checker,
-        // theta, phi). Precompute the terms from all of them and then
-        // use swizzles to assign to color channels.
-        //
-        // Channel Matches
-        // x cross (e.g., EE G)
-        // y checker (e.g., EE B)
-        // z theta (e.g., EO R)
-        // w phi (e.g., EO B)
-
-        #define A value.x  // A0 + A1
-        #define B value.y  // B0 + B1
-        #define E value.z  // E0 + E1
-        #define F value.w  // F0 + F1
-
-        float3 temp;
-
-        // PATTERN.yzw += (kD.yz * D).xyy;
-        temp.x = kDy * D;
-        temp.y = kDz * D;
-        PATTERN.y += temp.x;
-        PATTERN.z += temp.y;
-        PATTERN.w += temp.y;
-
-        // PATTERN += (kA.xyz * A).xyzx;
-        temp.x = kAx * A;
-        temp.y = kAy * A;
-        temp.z = kAz * A;
-        PATTERN.x += temp.x;
-        PATTERN.y += temp.y;
-        PATTERN.z += temp.z;
-        PATTERN.w += temp.x;
-
-        // PATTERN += (kE.xyw * E).xyxz;
-        temp.x = kEx * E;
-        temp.y = kEy * E;
-        temp.z = kEw * E;
-        PATTERN.x += temp.x;
-        PATTERN.y += temp.y;
-        PATTERN.z += temp.x;
-        PATTERN.w += temp.z;
-
-        // PATTERN.xw += kB.xw * B;
-        PATTERN.x += kBx * B;
-        PATTERN.w += kBw * B;
-
-        // PATTERN.xz += kF.xz * F;
-        PATTERN.x += kFx * F;
-        PATTERN.z += kFz * F;
-
-        // Determine which of four types of pixels we are on.
-        int2 alternate;
-        alternate.x = (x + firstRed.x) % 2;
-        alternate.y = (y + firstRed.y) % 2;
-
-        // in BGR sequence;
-        uchar3 pixelColor =
-            (alternate.y == 0) ?
-                ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
-                    make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
-                ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
-                    make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
-
-        dst(y, x) = toDst<DstType>(pixelColor);
-    }
-
-    template <int cn>
-    void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
-    {
-        typedef typename TypeVec<uchar, cn>::vec_type dst_t;
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-        bindTexture(&sourceTex, src);
-
-        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-}}}
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/gftt.cu b/modules/gpu/src/cuda/gftt.cu
deleted file mode 100644
index b4af9e5dbc..0000000000
--- a/modules/gpu/src/cuda/gftt.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace gfft
-    {
-        texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __device__ int g_counter = 0;
-
-        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
-        {
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
-            {
-                float val = tex2D(eigTex, j, i);
-
-                if (val > threshold)
-                {
-                    float maxVal = val;
-
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j    , i - 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
-
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
-
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j    , i + 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
-
-                    if (val == maxVal)
-                    {
-                        const int ind = ::atomicAdd(&g_counter, 1);
-
-                        if (ind < max_count)
-                            corners[ind] = make_float2(j, i);
-                    }
-                }
-            }
-        }
-
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
-        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
-
-            bindTexture(&eigTex, eig);
-
-            dim3 block(16, 16);
-            dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
-
-            if (mask.data)
-                findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
-            else
-                findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return std::min(count, max_count);
-        }
-
-        class EigGreater
-        {
-        public:
-            __device__ __forceinline__ bool operator()(float2 a, float2 b) const
-            {
-                return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
-            }
-        };
-
-
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
-        {
-            bindTexture(&eigTex, eig);
-
-            thrust::device_ptr<float2> ptr(corners);
-
-            thrust::sort(ptr, ptr + count, EigGreater());
-        }
-    } // namespace optical_flow
-}}}
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
deleted file mode 100644
index 474c27cf76..0000000000
--- a/modules/gpu/src/cuda/hist.cu
+++ /dev/null
@@ -1,153 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace hist
-{
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
-    {
-        __shared__ int shist[256];
-
-        const int y = blockIdx.x * blockDim.y + threadIdx.y;
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        shist[tid] = 0;
-        __syncthreads();
-
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-
-                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
-            }
-
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
-            }
-        }
-
-        __syncthreads();
-
-        const int histVal = shist[tid];
-        if (histVal > 0)
-            ::atomicAdd(hist + tid, histVal);
-    }
-
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-
-namespace hist
-{
-    __constant__ int c_lut[256];
-
-    struct EqualizeHist : unary_function<uchar, uchar>
-    {
-        float scale;
-
-        __host__ EqualizeHist(float _scale) : scale(_scale) {}
-
-        __device__ __forceinline__ uchar operator ()(uchar val) const
-        {
-            const int lut = c_lut[val];
-            return __float2int_rn(scale * lut);
-        }
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
-    {
-        enum { smart_shift = 4 };
-    };
-}}}
-
-namespace hist
-{
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
-    {
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        const float scale = 255.0f / (src.cols * src.rows);
-
-        cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
-    }
-}
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
deleted file mode 100644
index 5a4481b6e5..0000000000
--- a/modules/gpu/src/cuda/hough.cu
+++ /dev/null
@@ -1,1709 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/dynamic_smem.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        __device__ int g_counter;
-
-        ////////////////////////////////////////////////////////////////////////
-        // buildPointList
-
-        template <int PIXELS_PER_THREAD>
-        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
-        {
-            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
-            __shared__ int s_qsize[4];
-            __shared__ int s_globStart[4];
-
-            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (threadIdx.x == 0)
-                s_qsize[threadIdx.y] = 0;
-            __syncthreads();
-
-            if (y < src.rows)
-            {
-                // fill the queue
-                const uchar* srcRow = src.ptr(y);
-                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
-                {
-                    if (srcRow[xx])
-                    {
-                        const unsigned int val = (y << 16) | xx;
-                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
-                        s_queues[threadIdx.y][qidx] = val;
-                    }
-                }
-            }
-
-            __syncthreads();
-
-            // let one thread reserve the space required in the global list
-            if (threadIdx.x == 0 && threadIdx.y == 0)
-            {
-                // find how many items are stored in each list
-                int totalSize = 0;
-                for (int i = 0; i < blockDim.y; ++i)
-                {
-                    s_globStart[i] = totalSize;
-                    totalSize += s_qsize[i];
-                }
-
-                // calculate the offset in the global list
-                const int globalOffset = atomicAdd(&g_counter, totalSize);
-                for (int i = 0; i < blockDim.y; ++i)
-                    s_globStart[i] += globalOffset;
-            }
-
-            __syncthreads();
-
-            // copy local queues to global queue
-            const int qsize = s_qsize[threadIdx.y];
-            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
-                list[gidx] = s_queues[threadIdx.y][i];
-        }
-
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
-        {
-            const int PIXELS_PER_THREAD = 16;
-
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 4);
-            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
-
-            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return totalCount;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // linesAccum
-
-        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
-        {
-            const int n = blockIdx.x;
-            const float ang = n * theta;
-
-            float sinVal;
-            float cosVal;
-            sincosf(ang, &sinVal, &cosVal);
-            sinVal *= irho;
-            cosVal *= irho;
-
-            const int shift = (numrho - 1) / 2;
-
-            int* accumRow = accum.ptr(n + 1);
-            for (int i = threadIdx.x; i < count; i += blockDim.x)
-            {
-                const unsigned int val = list[i];
-
-                const int x = (val & 0xFFFF);
-                const int y = (val >> 16) & 0xFFFF;
-
-                int r = __float2int_rn(x * cosVal + y * sinVal);
-                r += shift;
-
-                ::atomicAdd(accumRow + r + 1, 1);
-            }
-        }
-
-        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
-        {
-            int* smem = DynamicSharedMem<int>();
-
-            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
-                smem[i] = 0;
-
-            __syncthreads();
-
-            const int n = blockIdx.x;
-            const float ang = n * theta;
-
-            float sinVal;
-            float cosVal;
-            sincosf(ang, &sinVal, &cosVal);
-            sinVal *= irho;
-            cosVal *= irho;
-
-            const int shift = (numrho - 1) / 2;
-
-            for (int i = threadIdx.x; i < count; i += blockDim.x)
-            {
-                const unsigned int val = list[i];
-
-                const int x = (val & 0xFFFF);
-                const int y = (val >> 16) & 0xFFFF;
-
-                int r = __float2int_rn(x * cosVal + y * sinVal);
-                r += shift;
-
-                Emulation::smem::atomicAdd(&smem[r + 1], 1);
-            }
-
-            __syncthreads();
-
-            int* accumRow = accum.ptr(n + 1);
-            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
-                accumRow[i] = smem[i];
-        }
-
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
-        {
-            const dim3 block(has20 ? 1024 : 512);
-            const dim3 grid(accum.rows - 2);
-
-            size_t smemSize = (accum.cols - 1) * sizeof(int);
-
-            if (smemSize < sharedMemPerBlock - 1000)
-                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
-            else
-                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // linesGetResult
-
-        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
-        {
-            const int r = blockIdx.x * blockDim.x + threadIdx.x;
-            const int n = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (r >= accum.cols - 2 || n >= accum.rows - 2)
-                return;
-
-            const int curVotes = accum(n + 1, r + 1);
-
-            if (curVotes > threshold &&
-                curVotes >  accum(n + 1, r) &&
-                curVotes >= accum(n + 1, r + 2) &&
-                curVotes >  accum(n, r + 1) &&
-                curVotes >= accum(n + 2, r + 1))
-            {
-                const float radius = (r - (numrho - 1) * 0.5f) * rho;
-                const float angle = n * theta;
-
-                const int ind = ::atomicAdd(&g_counter, 1);
-                if (ind < maxSize)
-                {
-                    out[ind] = make_float2(radius, angle);
-                    votes[ind] = curVotes;
-                }
-            }
-        }
-
-        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
-
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            totalCount = ::min(totalCount, maxSize);
-
-            if (doSort && totalCount > 0)
-            {
-                thrust::device_ptr<float2> outPtr(out);
-                thrust::device_ptr<int> votesPtr(votes);
-                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
-            }
-
-            return totalCount;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // houghLinesProbabilistic
-
-        texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void houghLinesProbabilistic(const PtrStepSzi accum,
-                                                int4* out, const int maxSize,
-                                                const float rho, const float theta,
-                                                const int lineGap, const int lineLength,
-                                                const int rows, const int cols)
-        {
-            const int r = blockIdx.x * blockDim.x + threadIdx.x;
-            const int n = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (r >= accum.cols - 2 || n >= accum.rows - 2)
-                return;
-
-            const int curVotes = accum(n + 1, r + 1);
-
-            if (curVotes >= lineLength &&
-                curVotes > accum(n, r) &&
-                curVotes > accum(n, r + 1) &&
-                curVotes > accum(n, r + 2) &&
-                curVotes > accum(n + 1, r) &&
-                curVotes > accum(n + 1, r + 2) &&
-                curVotes > accum(n + 2, r) &&
-                curVotes > accum(n + 2, r + 1) &&
-                curVotes > accum(n + 2, r + 2))
-            {
-                const float radius = (r - (accum.cols - 2 - 1) * 0.5f) * rho;
-                const float angle = n * theta;
-
-                float cosa;
-                float sina;
-                sincosf(angle, &sina, &cosa);
-
-                float2 p0 = make_float2(cosa * radius, sina * radius);
-                float2 dir = make_float2(-sina, cosa);
-
-                float2 pb[4] = {make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1)};
-                float a;
-
-                if (dir.x != 0)
-                {
-                    a = -p0.x / dir.x;
-                    pb[0].x = 0;
-                    pb[0].y = p0.y + a * dir.y;
-
-                    a = (cols - 1 - p0.x) / dir.x;
-                    pb[1].x = cols - 1;
-                    pb[1].y = p0.y + a * dir.y;
-                }
-                if (dir.y != 0)
-                {
-                    a = -p0.y / dir.y;
-                    pb[2].x = p0.x + a * dir.x;
-                    pb[2].y = 0;
-
-                    a = (rows - 1 - p0.y) / dir.y;
-                    pb[3].x = p0.x + a * dir.x;
-                    pb[3].y = rows - 1;
-                }
-
-                if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < rows))
-                {
-                    p0 = pb[0];
-                    if (dir.x < 0)
-                        dir = -dir;
-                }
-                else if (pb[1].x == cols - 1 && (pb[0].y >= 0 && pb[0].y < rows))
-                {
-                    p0 = pb[1];
-                    if (dir.x > 0)
-                        dir = -dir;
-                }
-                else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < cols))
-                {
-                    p0 = pb[2];
-                    if (dir.y < 0)
-                        dir = -dir;
-                }
-                else if (pb[3].y == rows - 1 && (pb[3].x >= 0 && pb[3].x < cols))
-                {
-                    p0 = pb[3];
-                    if (dir.y > 0)
-                        dir = -dir;
-                }
-
-                float2 d;
-                if (::fabsf(dir.x) > ::fabsf(dir.y))
-                {
-                    d.x = dir.x > 0 ? 1 : -1;
-                    d.y = dir.y / ::fabsf(dir.x);
-                }
-                else
-                {
-                    d.x = dir.x / ::fabsf(dir.y);
-                    d.y = dir.y > 0 ? 1 : -1;
-                }
-
-                float2 line_end[2];
-                int gap;
-                bool inLine = false;
-
-                float2 p1 = p0;
-                if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
-                    return;
-
-                for (;;)
-                {
-                    if (tex2D(tex_mask, p1.x, p1.y))
-                    {
-                        gap = 0;
-
-                        if (!inLine)
-                        {
-                            line_end[0] = p1;
-                            line_end[1] = p1;
-                            inLine = true;
-                        }
-                        else
-                        {
-                            line_end[1] = p1;
-                        }
-                    }
-                    else if (inLine)
-                    {
-                        if (++gap > lineGap)
-                        {
-                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
-                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
-
-                            if (good_line)
-                            {
-                                const int ind = ::atomicAdd(&g_counter, 1);
-                                if (ind < maxSize)
-                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
-                            }
-
-                            gap = 0;
-                            inLine = false;
-                        }
-                    }
-
-                    p1 = p1 + d;
-                    if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
-                    {
-                        if (inLine)
-                        {
-                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
-                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
-
-                            if (good_line)
-                            {
-                                const int ind = ::atomicAdd(&g_counter, 1);
-                                if (ind < maxSize)
-                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
-                            }
-
-                        }
-                        break;
-                    }
-                }
-            }
-        }
-
-        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
-
-            bindTexture(&tex_mask, mask);
-
-            houghLinesProbabilistic<<<grid, block>>>(accum,
-                                                     out, maxSize,
-                                                     rho, theta,
-                                                     lineGap, lineLength,
-                                                     mask.rows, mask.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            totalCount = ::min(totalCount, maxSize);
-
-            return totalCount;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // circlesAccumCenters
-
-        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
-                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
-        {
-            const int SHIFT = 10;
-            const int ONE = 1 << SHIFT;
-
-            const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (tid >= count)
-                return;
-
-            const unsigned int val = list[tid];
-
-            const int x = (val & 0xFFFF);
-            const int y = (val >> 16) & 0xFFFF;
-
-            const int vx = dx(y, x);
-            const int vy = dy(y, x);
-
-            if (vx == 0 && vy == 0)
-                return;
-
-            const float mag = ::sqrtf(vx * vx + vy * vy);
-
-            const int x0 = __float2int_rn((x * idp) * ONE);
-            const int y0 = __float2int_rn((y * idp) * ONE);
-
-            int sx = __float2int_rn((vx * idp) * ONE / mag);
-            int sy = __float2int_rn((vy * idp) * ONE / mag);
-
-            // Step from minRadius to maxRadius in both directions of the gradient
-            for (int k1 = 0; k1 < 2; ++k1)
-            {
-                int x1 = x0 + minRadius * sx;
-                int y1 = y0 + minRadius * sy;
-
-                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
-                {
-                    const int x2 = x1 >> SHIFT;
-                    const int y2 = y1 >> SHIFT;
-
-                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
-                        break;
-
-                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
-                }
-
-                sx = -sx;
-                sy = -sy;
-            }
-        }
-
-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
-        {
-            const dim3 block(256);
-            const dim3 grid(divUp(count, block.x));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
-
-            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // buildCentersList
-
-        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < accum.cols - 2 && y < accum.rows - 2)
-            {
-                const int top = accum(y, x + 1);
-
-                const int left = accum(y + 1, x);
-                const int cur = accum(y + 1, x + 1);
-                const int right = accum(y + 1, x + 2);
-
-                const int bottom = accum(y + 2, x + 1);
-
-                if (cur > threshold && cur > top && cur >= bottom && cur >  left && cur >= right)
-                {
-                    const unsigned int val = (y << 16) | x;
-                    const int idx = ::atomicAdd(&g_counter, 1);
-                    centers[idx] = val;
-                }
-            }
-        }
-
-        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
-
-            buildCentersList<<<grid, block>>>(accum, centers, threshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return totalCount;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // circlesAccumRadius
-
-        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
-                                           float3* circles, const int maxCircles, const float dp,
-                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
-        {
-            int* smem = DynamicSharedMem<int>();
-
-            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
-                smem[i] = 0;
-            __syncthreads();
-
-            unsigned int val = centers[blockIdx.x];
-
-            float cx = (val & 0xFFFF);
-            float cy = (val >> 16) & 0xFFFF;
-
-            cx = (cx + 0.5f) * dp;
-            cy = (cy + 0.5f) * dp;
-
-            for (int i = threadIdx.x; i < count; i += blockDim.x)
-            {
-                val = list[i];
-
-                const int x = (val & 0xFFFF);
-                const int y = (val >> 16) & 0xFFFF;
-
-                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
-                if (rad >= minRadius && rad <= maxRadius)
-                {
-                    const int r = __float2int_rn(rad - minRadius);
-
-                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
-                }
-            }
-
-            __syncthreads();
-
-            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
-            {
-                const int curVotes = smem[i + 1];
-
-                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
-                {
-                    const int ind = ::atomicAdd(&g_counter, 1);
-                    if (ind < maxCircles)
-                        circles[ind] = make_float3(cx, cy, i + minRadius);
-                }
-            }
-        }
-
-        // Host launcher for the circlesAccumRadius kernel.
-        // Zeroes the device-side g_counter, launches centersCount blocks, waits for
-        // completion and returns the number of circles reported by the counter,
-        // capped at maxCircles (the capacity of the circles buffer).
-        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
-                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
-        {
-            // Resolve the device address of g_counter and reset it before the launch.
-            void* d_counter;
-            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
-            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
-
-            // One histogram bin per integer radius; the kernel also reads one slot
-            // before the first bin and one after the last, hence the "+ 2".
-            const int histSize = maxRadius - minRadius + 1;
-            const size_t smemSize = (histSize + 2) * sizeof(int);
-
-            // NOTE(review): has20 presumably flags compute capability >= 2.0 - confirm with caller.
-            const int threadsPerBlock = has20 ? 1024 : 512;
-            const dim3 block(threadsPerBlock);
-            const dim3 grid(centersCount);
-
-            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // The counter may exceed maxCircles because it is incremented even when
-            // the result is not stored; report only what fits in the buffer.
-            int found;
-            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return ::min(found, maxCircles);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // Generalized Hough
-
-        // Collects every edge pixel that has a non-zero gradient into two parallel
-        // global lists: coordList (packed position) and thetaList (gradient direction).
-        //
-        // Each row of threads (threadIdx.y) first gathers its hits in a private
-        // shared-memory queue; one thread then reserves a contiguous range in the
-        // global lists through g_counter, and the queues are copied out.
-        // The shared arrays are hard-coded for at most 4 thread rows of 32 threads,
-        // which matches the dim3(32, 4) block used by buildEdgePointList_gpu.
-        template <typename T, int PIXELS_PER_THREAD>
-        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
-        {
-            // Per-row queues: each thread can contribute up to PIXELS_PER_THREAD entries.
-            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
-            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
-            // Number of entries currently stored in each row queue.
-            __shared__ int s_sizes[4];
-            // Start offset of each row queue inside the global lists.
-            __shared__ int s_globStart[4];
-
-            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // The first thread of every row clears that row's queue size.
-            if (threadIdx.x == 0)
-                s_sizes[threadIdx.y] = 0;
-            __syncthreads();
-
-            if (y < edges.rows)
-            {
-                // fill the queue
-                const uchar* edgesRow = edges.ptr(y);
-                const T* dxRow = dx.ptr(y);
-                const T* dyRow = dy.ptr(y);
-
-                // Each thread visits up to PIXELS_PER_THREAD pixels of the row, blockDim.x apart.
-                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
-                {
-                    const T dxVal = dxRow[xx];
-                    const T dyVal = dyRow[xx];
-
-                    // Keep only edge pixels whose gradient is not the zero vector.
-                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
-                    {
-                        // Pack the position: y in the high 16 bits, x in the low 16 bits.
-                        const unsigned int coord = (y << 16) | xx;
-
-                        // Gradient direction, shifted so that negative angles become non-negative.
-                        float theta = ::atan2f(dyVal, dxVal);
-                        if (theta < 0)
-                            theta += 2.0f * CV_PI_F;
-
-                        // Reserve the next free slot of this row's queue.
-                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);
-
-                        s_coordLists[threadIdx.y][qidx] = coord;
-                        s_thetaLists[threadIdx.y][qidx] = theta;
-                    }
-                }
-            }
-
-            // All queues must be complete before their sizes are summed.
-            __syncthreads();
-
-            // let one thread reserve the space required in the global list
-            if (threadIdx.x == 0 && threadIdx.y == 0)
-            {
-                // find how many items are stored in each list
-                // (s_globStart[i] temporarily holds the offset relative to the block's range)
-                int totalSize = 0;
-                for (int i = 0; i < blockDim.y; ++i)
-                {
-                    s_globStart[i] = totalSize;
-                    totalSize += s_sizes[i];
-                }
-
-                // calculate the offset in the global list
-                // (g_counter ends up holding the total number of entries appended by all blocks)
-                const int globalOffset = atomicAdd(&g_counter, totalSize);
-                for (int i = 0; i < blockDim.y; ++i)
-                    s_globStart[i] += globalOffset;
-            }
-
-            // Make the reserved offsets visible to every thread of the block.
-            __syncthreads();
-
-            // copy local queues to global queue
-            // (the threads of a row cooperate, each copying every blockDim.x-th entry)
-            const int qsize = s_sizes[threadIdx.y];
-            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
-            {
-                coordList[gidx] = s_coordLists[threadIdx.y][i];
-                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
-            }
-        }
-
-        // Host launcher for buildEdgePointList<T, 8>.
-        // dx and dy are passed as byte images and re-typed to element type T for the kernel.
-        // Returns the value of g_counter after the kernel finished, i.e. the number of
-        // entries appended to coordList / thetaList.
-        template <typename T>
-        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
-        {
-            // Number of pixels of a row handled by a single thread (kernel template argument).
-            const int pixelsPerThread = 8;
-
-            // 32x4 threads per block; the kernel's shared queues are sized for this shape.
-            const dim3 block(32, 4);
-            const dim3 grid(divUp(edges.cols, block.x * pixelsPerThread), divUp(edges.rows, block.y));
-
-            // Reset the global append counter before any block reserves space.
-            void* d_counter;
-            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
-            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
-
-            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, pixelsPerThread>, cudaFuncCachePreferShared) );
-
-            buildEdgePointList<T, pixelsPerThread><<<grid, block>>>(edges, PtrStepSz<T>(dx), PtrStepSz<T>(dy), coordList, thetaList);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int appended;
-            cudaSafeCall( cudaMemcpy(&appended, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return appended;
-        }
-
-        // Explicit instantiations of buildEdgePointList_gpu for gradient images with
-        // short, int and float elements.
-        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
-        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
-        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
-
-        // Builds the R-table from a list of edge points, one thread per point.
-        // The point's gradient direction selects a table row; the point's offset from
-        // templCenter is appended to that row. r_sizes[row] counts every attempted
-        // insertion, including the ones that did not fit into maxSize columns.
-        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
-                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
-                                    const short2 templCenter, const float thetaScale)
-        {
-            const int pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (pointIdx < pointsCount)
-            {
-                // Unpack the position: x from the low 16 bits, y from the high 16 bits.
-                const unsigned int packed = coordList[pointIdx];
-                short2 pt;
-                pt.x = (packed & 0xFFFF);
-                pt.y = (packed >> 16) & 0xFFFF;
-
-                // Quantize the gradient direction into a row index.
-                const int level = __float2int_rn(thetaList[pointIdx] * thetaScale);
-
-                // Reserve a slot in the row; the store is skipped once the row is full.
-                const int slot = ::atomicAdd(&r_sizes[level], 1);
-                if (slot < maxSize)
-                    r_table(level, slot) = pt - templCenter;
-            }
-        }
-
-        // Host launcher for buildRTable: one thread per edge point, 256 threads per block.
-        // The row capacity handed to the kernel is the column count of r_table.
-        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                             PtrStepSz<short2> r_table, int* r_sizes,
-                             short2 templCenter, int levels)
-        {
-            // Maps an angle in radians onto the [0, levels] quantization range.
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-
-            const dim3 threads(256);
-            const dim3 blocks(divUp(pointsCount, threads.x));
-
-            const int rowCapacity = r_table.cols;
-
-            buildRTable<<<blocks, threads>>>(coordList, thetaList, pointsCount, r_table, r_sizes, rowCapacity, templCenter, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // GHT_Ballard_Pos
-
-        // Position-only Ballard voting, one thread per image edge point.
-        // For every R-table entry stored under the point's gradient direction the
-        // candidate center (point - entry) is scaled by idp and, if it lies inside the
-        // accumulator interior, receives one vote. The accumulator carries a one-cell
-        // border on each side, hence the "- 2" bounds and the "+ 1" index shift.
-        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
-                                                 const PtrStep<short2> r_table, const int* r_sizes,
-                                                 PtrStepSzi hist,
-                                                 const float idp, const float thetaScale)
-        {
-            const int pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (pointIdx < pointsCount)
-            {
-                // Unpack the position: x from the low 16 bits, y from the high 16 bits.
-                const unsigned int packed = coordList[pointIdx];
-                short2 pt;
-                pt.x = (packed & 0xFFFF);
-                pt.y = (packed >> 16) & 0xFFFF;
-
-                // Select the R-table row that matches the gradient direction.
-                const int level = __float2int_rn(thetaList[pointIdx] * thetaScale);
-                const short2* const offsets = r_table.ptr(level);
-                const int offsetCount = r_sizes[level];
-
-                const int innerCols = hist.cols - 2;
-                const int innerRows = hist.rows - 2;
-
-                for (int k = 0; k < offsetCount; ++k)
-                {
-                    const short2 diff = pt - offsets[k];
-
-                    // Candidate center in accumulator units (stored as short, as in the table).
-                    const short cx = __float2int_rn(diff.x * idp);
-                    const short cy = __float2int_rn(diff.y * idp);
-
-                    const bool inside = cx >= 0 && cx < innerCols && cy >= 0 && cy < innerRows;
-                    if (inside)
-                        ::atomicAdd(hist.ptr(cy + 1) + cx + 1, 1);
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_Pos_calcHist: one thread per edge point,
-        // 256 threads per block. dp is passed to the kernel as its reciprocal.
-        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                          PtrStepSz<short2> r_table, const int* r_sizes,
-                                          PtrStepSzi hist,
-                                          float dp, int levels)
-        {
-            // Maps an angle in radians onto the [0, levels] quantization range.
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-            const float invDp = 1.0f / dp;
-
-            const dim3 threads(256);
-            const dim3 blocks(divUp(pointsCount, threads.x));
-
-            GHT_Ballard_Pos_calcHist<<<blocks, threads>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, invDp, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Scans the interior of the bordered accumulator, one thread per cell, and
-        // reports every cell whose vote count is above threshold and is a local
-        // maximum of its 4-neighbourhood (strictly greater than the left/upper
-        // neighbour, not smaller than the right/lower one).
-        // Results are appended through g_counter; entries beyond maxSize are counted
-        // but not stored.
-        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (col < hist.cols - 2 && row < hist.rows - 2)
-            {
-                // Interior cell (col, row) lives at (col + 1, row + 1) because of the border.
-                const int center = hist(row + 1, col + 1);
-
-                const bool isPeak = center > threshold &&
-                                    center >  hist(row + 1, col) &&
-                                    center >= hist(row + 1, col + 2) &&
-                                    center >  hist(row, col + 1) &&
-                                    center >= hist(row + 2, col + 1);
-
-                if (isPeak)
-                {
-                    const int slot = ::atomicAdd(&g_counter, 1);
-
-                    if (slot < maxSize)
-                    {
-                        // Position in image units, scale fixed to 1, angle fixed to 0.
-                        out[slot] = make_float4(col * dp, row * dp, 1.0f, 0.0f);
-                        votes[slot] = make_int3(center, 0, 0);
-                    }
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_Pos_findPosInHist.
-        // Covers the accumulator interior (hist minus its one-cell border) with 32x8
-        // thread blocks and returns the number of detections, capped at maxSize.
-        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
-        {
-            const dim3 threads(32, 8);
-            const dim3 blocks(divUp(hist.cols - 2, threads.x), divUp(hist.rows - 2, threads.y));
-
-            // Reset the global detection counter.
-            void* d_counter;
-            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
-            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
-
-            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );
-
-            GHT_Ballard_Pos_findPosInHist<<<blocks, threads>>>(hist, out, votes, maxSize, dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // The counter keeps growing past maxSize; only maxSize entries were stored.
-            int found;
-            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return ::min(found, maxSize);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // GHT_Ballard_PosScale
-
-        // Position + scale Ballard voting. One block per image edge point; the
-        // threads of the block share the scale levels between them.
-        // The accumulator is a stack of (scaleRange + 2) slices of (rows + 2) rows
-        // each; slice s + 1 belongs to scale level s and every slice has a one-cell border.
-        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
-                                                      PtrStep<short2> r_table, const int* r_sizes,
-                                                      PtrStepi hist, const int rows, const int cols,
-                                                      const float minScale, const float scaleStep, const int scaleRange,
-                                                      const float idp, const float thetaScale)
-        {
-            // Unpack the position: x from the low 16 bits, y from the high 16 bits.
-            const unsigned int packed = coordList[blockIdx.x];
-            float2 pt;
-            pt.x = (packed & 0xFFFF);
-            pt.y = (packed >> 16) & 0xFFFF;
-
-            // Select the R-table row that matches the gradient direction.
-            const int level = __float2int_rn(thetaList[blockIdx.x] * thetaScale);
-            const short2* const offsets = r_table.ptr(level);
-            const int offsetCount = r_sizes[level];
-
-            const int sliceRows = rows + 2;
-
-            for (int k = 0; k < offsetCount; ++k)
-            {
-                const float2 d = saturate_cast<float2>(offsets[k]);
-
-                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
-                {
-                    const float scale = minScale + s * scaleStep;
-
-                    // Candidate center for this scale, in accumulator units.
-                    float2 c = pt - scale * d;
-                    c.x *= idp;
-                    c.y *= idp;
-
-                    const bool inside = c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows;
-                    if (!inside)
-                        continue;
-
-                    ::atomicAdd(hist.ptr((s + 1) * sliceRows + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_PosScale_calcHist: one block of 256 threads
-        // per edge point. dp is passed to the kernel as its reciprocal.
-        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                               PtrStepSz<short2> r_table, const int* r_sizes,
-                                               PtrStepi hist, int rows, int cols,
-                                               float minScale, float scaleStep, int scaleRange,
-                                               float dp, int levels)
-        {
-            // Maps an angle in radians onto the [0, levels] quantization range.
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-            const float invDp = 1.0f / dp;
-
-            const dim3 threads(256);
-            const dim3 blocks(pointsCount);
-
-            GHT_Ballard_PosScale_calcHist<<<blocks, threads>>>(coordList, thetaList,
-                                                               r_table, r_sizes,
-                                                               hist, rows, cols,
-                                                               minScale, scaleStep, scaleRange,
-                                                               invDp, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Peak search in the position + scale accumulator. One thread per (x, y)
-        // cell; each thread walks through all scale levels. A cell is reported when
-        // its votes exceed threshold and it is a local maximum against its four
-        // in-slice neighbours and the same cell in the previous and next scale slice.
-        // Results are appended through g_counter; entries beyond maxSize are counted
-        // but not stored.
-        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
-                                                           float4* out, int3* votes, const int maxSize,
-                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (col >= cols || row >= rows)
-                return;
-
-            // Every slice has rows + 2 rows (one-cell border on each side).
-            const int sliceRows = rows + 2;
-
-            for (int s = 0; s < scaleRange; ++s)
-            {
-                // First row of the slice for scale level s and of its two neighbours.
-                const int curBase = (s + 1) * sliceRows;
-                const int prevBase = (s) * sliceRows;
-                const int nextBase = (s + 2) * sliceRows;
-
-                const int center = hist(curBase + row + 1, col + 1);
-
-                const bool isPeak = center > threshold &&
-                                    center >  hist(curBase + row + 1, col) &&
-                                    center >= hist(curBase + row + 1, col + 2) &&
-                                    center >  hist(curBase + row, col + 1) &&
-                                    center >= hist(curBase + row + 2, col + 1) &&
-                                    center >  hist(prevBase + row + 1, col + 1) &&
-                                    center >= hist(nextBase + row + 1, col + 1);
-
-                if (!isPeak)
-                    continue;
-
-                const int slot = ::atomicAdd(&g_counter, 1);
-
-                if (slot < maxSize)
-                {
-                    const float scale = minScale + s * scaleStep;
-
-                    // Position in image units, detected scale, angle fixed to 0.
-                    out[slot] = make_float4(col * dp, row * dp, scale, 0.0f);
-                    votes[slot] = make_int3(center, center, 0);
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_PosScale_findPosInHist.
-        // Covers the cols x rows accumulator interior with 32x8 thread blocks and
-        // returns the number of detections, capped at maxSize.
-        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
-                                                   float minScale, float scaleStep, float dp, int threshold)
-        {
-            const dim3 threads(32, 8);
-            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            // Reset the global detection counter.
-            void* d_counter;
-            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
-            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
-
-            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );
-
-            GHT_Ballard_PosScale_findPosInHist<<<blocks, threads>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // The counter keeps growing past maxSize; only maxSize entries were stored.
-            int found;
-            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return ::min(found, maxSize);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // GHT_Ballard_PosRotation
-
-        // Position + rotation Ballard voting. One block per image edge point; the
-        // threads of the block share the rotation levels between them.
-        //
-        // For rotation level a the point's gradient direction is rotated back by the
-        // candidate angle, the matching R-table row is fetched, every stored offset
-        // is rotated by the candidate angle and the resulting center receives one
-        // vote. The accumulator is a stack of slices of (rows + 2) rows each; slice
-        // a + 1 belongs to rotation level a and every slice has a one-cell border.
-        //
-        // minAngle / angleStep are in degrees; thetaList and thetaScale work in radians.
-        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
-                                                         PtrStep<short2> r_table, const int* r_sizes,
-                                                         PtrStepi hist, const int rows, const int cols,
-                                                         const float minAngle, const float angleStep, const int angleRange,
-                                                         const float idp, const float thetaScale)
-        {
-            // Unpack the position: x from the low 16 bits, y from the high 16 bits.
-            const unsigned int coord = coordList[blockIdx.x];
-            float2 p;
-            p.x = (coord & 0xFFFF);
-            p.y = (coord >> 16) & 0xFFFF;
-
-            const float thetaVal = thetaList[blockIdx.x];
-
-            const float twoPi = 2.0f * CV_PI_F;
-
-            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
-            {
-                // Candidate rotation in radians.
-                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
-                float sinA, cosA;
-                sincosf(angle, &sinA, &cosA);
-
-                // Gradient direction expressed in the template's frame. It has to be
-                // wrapped into [0, 2*pi] in BOTH directions: with a negative candidate
-                // angle (or an angle range beyond 360 degrees) the difference can leave
-                // that interval on either side, and an unwrapped value would produce a
-                // row index outside the quantization range used to build the R-table,
-                // i.e. out-of-bounds reads of r_table / r_sizes.
-                // For differences in [-2*pi, 2*pi] this is identical to the previous
-                // single "add 2*pi if negative" correction.
-                float theta = thetaVal - angle;
-                while (theta < 0)
-                    theta += twoPi;
-                while (theta > twoPi)
-                    theta -= twoPi;
-
-                const int n = __float2int_rn(theta * thetaScale);
-
-                const short2* r_row = r_table.ptr(n);
-                const int r_row_size = r_sizes[n];
-
-                for (int j = 0; j < r_row_size; ++j)
-                {
-                    const float2 d = saturate_cast<float2>(r_row[j]);
-
-                    // Rotate the stored offset by the candidate angle.
-                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);
-
-                    // Candidate center in accumulator units.
-                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
-                    c.x *= idp;
-                    c.y *= idp;
-
-                    // "+ 1" skips the one-cell border of the slice.
-                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
-                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_PosRotation_calcHist: one block of 256
-        // threads per edge point. dp is passed to the kernel as its reciprocal.
-        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                  PtrStepSz<short2> r_table, const int* r_sizes,
-                                                  PtrStepi hist, int rows, int cols,
-                                                  float minAngle, float angleStep, int angleRange,
-                                                  float dp, int levels)
-        {
-            // Maps an angle in radians onto the [0, levels] quantization range.
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-            const float invDp = 1.0f / dp;
-
-            const dim3 threads(256);
-            const dim3 blocks(pointsCount);
-
-            GHT_Ballard_PosRotation_calcHist<<<blocks, threads>>>(coordList, thetaList,
-                                                                  r_table, r_sizes,
-                                                                  hist, rows, cols,
-                                                                  minAngle, angleStep, angleRange,
-                                                                  invDp, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Peak search in the position + rotation accumulator. One thread per (x, y)
-        // cell; each thread walks through all rotation levels. A cell is reported
-        // when its votes exceed threshold and it is a local maximum against its four
-        // in-slice neighbours and the same cell in the previous and next angle slice.
-        // Results are appended through g_counter; entries beyond maxSize are counted
-        // but not stored.
-        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
-                                                              float4* out, int3* votes, const int maxSize,
-                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (col >= cols || row >= rows)
-                return;
-
-            // Every slice has rows + 2 rows (one-cell border on each side).
-            const int sliceRows = rows + 2;
-
-            for (int a = 0; a < angleRange; ++a)
-            {
-                // First row of the slice for rotation level a and of its two neighbours.
-                const int curBase = (a + 1) * sliceRows;
-                const int prevBase = (a) * sliceRows;
-                const int nextBase = (a + 2) * sliceRows;
-
-                const int center = hist(curBase + row + 1, col + 1);
-
-                const bool isPeak = center > threshold &&
-                                    center >  hist(curBase + row + 1, col) &&
-                                    center >= hist(curBase + row + 1, col + 2) &&
-                                    center >  hist(curBase + row, col + 1) &&
-                                    center >= hist(curBase + row + 2, col + 1) &&
-                                    center >  hist(prevBase + row + 1, col + 1) &&
-                                    center >= hist(nextBase + row + 1, col + 1);
-
-                if (!isPeak)
-                    continue;
-
-                const int slot = ::atomicAdd(&g_counter, 1);
-
-                if (slot < maxSize)
-                {
-                    const float angle = minAngle + a * angleStep;
-
-                    // Position in image units, scale fixed to 1, detected angle.
-                    out[slot] = make_float4(col * dp, row * dp, 1.0f, angle);
-                    votes[slot] = make_int3(center, 0, center);
-                }
-            }
-        }
-
-        // Host launcher for GHT_Ballard_PosRotation_findPosInHist.
-        // Covers the cols x rows accumulator interior with 32x8 thread blocks and
-        // returns the number of detections, capped at maxSize.
-        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
-                                                      float minAngle, float angleStep, float dp, int threshold)
-        {
-            const dim3 threads(32, 8);
-            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            // Reset the global detection counter.
-            void* d_counter;
-            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
-            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
-
-            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );
-
-            GHT_Ballard_PosRotation_findPosInHist<<<blocks, threads>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // The counter keeps growing past maxSize; only maxSize entries were stored.
-            int found;
-            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return ::min(found, maxSize);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // GHT_Guil_Full
-
-        // Raw description of the six pitched 2D buffers that make up one feature
-        // table. For every buffer the base pointer and the row step in bytes are
-        // stored (rows are addressed as data + n * step by the accessor structs).
-        // The struct is copied as a whole into constant memory, so the member order
-        // defines the device-side layout.
-        struct FeatureTable
-        {
-            // Position of the first point of each feature (read as float2).
-            uchar* p1_pos_data;
-            size_t p1_pos_step;
-
-            // Gradient direction at the first point (read as float).
-            uchar* p1_theta_data;
-            size_t p1_theta_step;
-
-            // Position of the second point of each feature (read as float2).
-            uchar* p2_pos_data;
-            size_t p2_pos_step;
-
-            // Distance between the two points (read as float).
-            uchar* d12_data;
-            size_t d12_step;
-
-            // Offset of the first point from the reference center (read as float2).
-            uchar* r1_data;
-            size_t r1_step;
-
-            // Offset of the second point from the reference center (read as float2).
-            uchar* r2_data;
-            size_t r2_step;
-        };
-
-        // Constant-memory feature tables: one for the template, one for the image.
-        // They are filled by GHT_Guil_Full_setTemplFeatures / GHT_Guil_Full_setImageFeatures.
-        __constant__ FeatureTable c_templFeatures;
-        __constant__ FeatureTable c_imageFeatures;
-
-        // Publishes the template feature buffers to the device: base pointer and row
-        // step of each of the six buffers are gathered into a FeatureTable, which is
-        // then copied into the constant-memory symbol c_templFeatures.
-        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
-        {
-            FeatureTable table;
-
-            // Base pointers.
-            table.p1_pos_data   = p1_pos.data;
-            table.p1_theta_data = p1_theta.data;
-            table.p2_pos_data   = p2_pos.data;
-            table.d12_data      = d12.data;
-            table.r1_data       = r1.data;
-            table.r2_data       = r2.data;
-
-            // Row steps in bytes.
-            table.p1_pos_step   = p1_pos.step;
-            table.p1_theta_step = p1_theta.step;
-            table.p2_pos_step   = p2_pos.step;
-            table.d12_step      = d12.step;
-            table.r1_step       = r1.step;
-            table.r2_step       = r2.step;
-
-            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &table, sizeof(FeatureTable)) );
-        }
-        // Publishes the image feature buffers to the device: base pointer and row
-        // step of each of the six buffers are gathered into a FeatureTable, which is
-        // then copied into the constant-memory symbol c_imageFeatures.
-        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
-        {
-            FeatureTable table;
-
-            // Base pointers.
-            table.p1_pos_data   = p1_pos.data;
-            table.p1_theta_data = p1_theta.data;
-            table.p2_pos_data   = p2_pos.data;
-            table.d12_data      = d12.data;
-            table.r1_data       = r1.data;
-            table.r2_data       = r2.data;
-
-            // Row steps in bytes.
-            table.p1_pos_step   = p1_pos.step;
-            table.p1_theta_step = p1_theta.step;
-            table.p2_pos_step   = p2_pos.step;
-            table.d12_step      = d12.step;
-            table.r1_step       = r1.step;
-            table.r2_step       = r2.step;
-
-            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &table, sizeof(FeatureTable)) );
-        }
-
-        // Typed, device-side view of the template feature table (c_templFeatures).
-        // Every accessor returns a pointer to the start of row n of one buffer,
-        // computed as base + n * step (step in bytes) and cast to the element type.
-        struct TemplFeatureTable
-        {
-            // Row n of the first-point positions.
-            static __device__ float2* p1_pos(int n)
-            {
-                uchar* const row = c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the first-point gradient directions.
-            static __device__ float* p1_theta(int n)
-            {
-                uchar* const row = c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step;
-                return reinterpret_cast<float*>(row);
-            }
-
-            // Row n of the second-point positions.
-            static __device__ float2* p2_pos(int n)
-            {
-                uchar* const row = c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the point-to-point distances.
-            static __device__ float* d12(int n)
-            {
-                uchar* const row = c_templFeatures.d12_data + n * c_templFeatures.d12_step;
-                return reinterpret_cast<float*>(row);
-            }
-
-            // Row n of the first-point offsets from the center.
-            static __device__ float2* r1(int n)
-            {
-                uchar* const row = c_templFeatures.r1_data + n * c_templFeatures.r1_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the second-point offsets from the center.
-            static __device__ float2* r2(int n)
-            {
-                uchar* const row = c_templFeatures.r2_data + n * c_templFeatures.r2_step;
-                return reinterpret_cast<float2*>(row);
-            }
-        };
-        // Typed, device-side view of the image feature table (c_imageFeatures).
-        // Every accessor returns a pointer to the start of row n of one buffer,
-        // computed as base + n * step (step in bytes) and cast to the element type.
-        struct ImageFeatureTable
-        {
-            // Row n of the first-point positions.
-            static __device__ float2* p1_pos(int n)
-            {
-                uchar* const row = c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the first-point gradient directions.
-            static __device__ float* p1_theta(int n)
-            {
-                uchar* const row = c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step;
-                return reinterpret_cast<float*>(row);
-            }
-
-            // Row n of the second-point positions.
-            static __device__ float2* p2_pos(int n)
-            {
-                uchar* const row = c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the point-to-point distances.
-            static __device__ float* d12(int n)
-            {
-                uchar* const row = c_imageFeatures.d12_data + n * c_imageFeatures.d12_step;
-                return reinterpret_cast<float*>(row);
-            }
-
-            // Row n of the first-point offsets from the center.
-            static __device__ float2* r1(int n)
-            {
-                uchar* const row = c_imageFeatures.r1_data + n * c_imageFeatures.r1_step;
-                return reinterpret_cast<float2*>(row);
-            }
-
-            // Row n of the second-point offsets from the center.
-            static __device__ float2* r2(int n)
-            {
-                uchar* const row = c_imageFeatures.r2_data + n * c_imageFeatures.r2_step;
-                return reinterpret_cast<float2*>(row);
-            }
-        };
-
-        // Wraps an angle given in radians into [0, 2*pi] by repeatedly subtracting or
-        // adding a full turn. A value of exactly 2*pi is left unchanged.
-        __device__ float clampAngle(float a)
-        {
-            const float fullTurn = 2.0f * CV_PI_F;
-
-            for (; a > fullTurn; a -= fullTurn)
-                ;
-            for (; a < 0.0f; a += fullTurn)
-                ;
-
-            return a;
-        }
-
-        // Returns true when the angles a and b (radians) differ by at most eps,
-        // measured on the circle.
-        //
-        // clampAngle maps the difference into [0, 2*pi]. A difference that is
-        // slightly negative therefore shows up just below 2*pi, so it has to be
-        // compared against eps from both ends of the interval; checking only the
-        // lower end would accept a in [b, b + eps] but reject a in [b - eps, b).
-        __device__ bool angleEq(float a, float b, float eps)
-        {
-            const float diff = clampAngle(a - b);
-
-            return diff <= eps || (2.0f * CV_PI_F - diff) <= eps;
-        }
-
-        // Builds the Guil feature table from pairs of edge points.
-        // One block per first point (blockIdx.x); the threads of the block iterate
-        // over all candidate second points. A pair becomes a feature when the
-        // difference of the two gradient directions equals xi within angleEpsilon
-        // and the points are at most maxDist apart. The feature is filed under the
-        // quantized angle between the connecting line and the first gradient direction.
-        //
-        // FT selects the destination table (static accessors). For the template
-        // (isTempl == true) the offsets from center are stored, otherwise the two
-        // point positions. sizes[n] counts every accepted pair, including the ones
-        // that did not fit into maxSize entries.
-        template <class FT, bool isTempl>
-        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
-                                                       int* sizes, const int maxSize,
-                                                       const float xi, const float angleEpsilon, const float alphaScale,
-                                                       const float2 center, const float maxDist)
-        {
-            // First point of the pair: fixed for the whole block.
-            const unsigned int packed1 = coordList[blockIdx.x];
-            const float theta1 = thetaList[blockIdx.x];
-            float2 pos1;
-            pos1.x = (packed1 & 0xFFFF);
-            pos1.y = (packed1 >> 16) & 0xFFFF;
-
-            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
-            {
-                // Second point of the pair.
-                const unsigned int packed2 = coordList[i];
-                const float theta2 = thetaList[i];
-                float2 pos2;
-                pos2.x = (packed2 & 0xFFFF);
-                pos2.y = (packed2 >> 16) & 0xFFFF;
-
-                // The gradient directions must differ by xi.
-                if (!angleEq(theta1 - theta2, xi, angleEpsilon))
-                    continue;
-
-                const float2 delta = pos1 - pos2;
-
-                const float alpha12 = clampAngle(::atan2(delta.y, delta.x) - theta1);
-                const float dist12 = ::sqrtf(delta.x * delta.x + delta.y * delta.y);
-
-                if (dist12 > maxDist)
-                    continue;
-
-                // Reserve a slot in the row selected by the quantized alpha12.
-                const int level = __float2int_rn(alpha12 * alphaScale);
-                const int slot = ::atomicAdd(sizes + level, 1);
-
-                if (slot >= maxSize)
-                    continue;
-
-                FT::p1_theta(level)[slot] = theta1;
-                FT::d12(level)[slot] = dist12;
-
-                if (isTempl)
-                {
-                    FT::r1(level)[slot] = pos1 - center;
-                    FT::r2(level)[slot] = pos2 - center;
-                }
-                else
-                {
-                    FT::p1_pos(level)[slot] = pos1;
-                    FT::p2_pos(level)[slot] = pos2;
-                }
-            }
-        }
-
-        // Host launcher for GHT_Guil_Full_buildFeatureList<FT, isTempl>.
-        // Launches one block of 256 threads per edge point, converts xi and
-        // angleEpsilon from degrees to radians, and afterwards clamps the first
-        // levels + 1 entries of sizes to maxSize, because the kernel counts pairs
-        // that no longer fit into the table.
-        template <class FT, bool isTempl>
-        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                   int* sizes, int maxSize,
-                                                   float xi, float angleEpsilon, int levels,
-                                                   float2 center, float maxDist)
-        {
-            // Maps an angle in radians onto the [0, levels] quantization range.
-            const float alphaScale = levels / (2.0f * CV_PI_F);
-
-            const float xiRad = xi * (CV_PI_F / 180.0f);
-            const float angleEpsilonRad = angleEpsilon * (CV_PI_F / 180.0f);
-
-            const dim3 threads(256);
-            const dim3 blocks(pointsCount);
-
-            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<blocks, threads>>>(coordList, thetaList, pointsCount,
-                                                                             sizes, maxSize,
-                                                                             xiRad, angleEpsilonRad, alphaScale,
-                                                                             center, maxDist);
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // sizes[i] = min(sizes[i], maxSize) for i in [0, levels].
-            thrust::device_ptr<int> sizesBegin(sizes);
-            thrust::transform(sizesBegin, sizesBegin + levels + 1, sizesBegin, cudev::bind2nd(cudev::minimum<int>(), maxSize));
-        }
-
-        // Builds the feature list of the template: forwards all arguments to the
-        // generic caller, targeting TemplFeatureTable with isTempl == true.
-        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                     int* sizes, int maxSize,
-                                                     float xi, float angleEpsilon, int levels,
-                                                     float2 center, float maxDist)
-        {
-            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(
-                coordList, thetaList, pointsCount, sizes, maxSize, xi, angleEpsilon, levels, center, maxDist);
-        }
-        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                     int* sizes, int maxSize,
-                                                     float xi, float angleEpsilon, int levels,
-                                                     float2 center, float maxDist)
-        {
-            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
-                                                                            sizes, maxSize,
-                                                                            xi, angleEpsilon, levels,
-                                                                            center, maxDist);
-        }
-
-        __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
-                                                const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
-        {
-            extern __shared__ int s_OHist[];
-            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
-                s_OHist[i] = 0;
-            __syncthreads();
-
-            const int tIdx = blockIdx.x;
-            const int level = blockIdx.y;
-
-            const int tSize = templSizes[level];
-
-            if (tIdx < tSize)
-            {
-                const int imSize = imageSizes[level];
-
-                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
-
-                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
-                {
-                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
-
-                    const float angle = clampAngle(im_p1_theta - t_p1_theta);
-
-                    if (angle >= minAngle && angle <= maxAngle)
-                    {
-                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
-                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
-                    }
-                }
-            }
-            __syncthreads();
-
-            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
-                ::atomicAdd(OHist + i, s_OHist[i]);
-        }
-
-        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
-                                         float minAngle, float maxAngle, float angleStep, int angleRange,
-                                         int levels, int tMaxSize)
-        {
-            const dim3 block(256);
-            const dim3 grid(tMaxSize, levels + 1);
-
-            minAngle *= (CV_PI_F / 180.0f);
-            maxAngle *= (CV_PI_F / 180.0f);
-            angleStep *= (CV_PI_F / 180.0f);
-
-            const size_t smemSize = (angleRange + 1) * sizeof(float);
-
-            GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
-                                                               minAngle, maxAngle, 1.0f / angleStep, angleRange);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
-                                                const float angle, const float angleEpsilon,
-                                                const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
-        {
-            extern __shared__ int s_SHist[];
-            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
-                s_SHist[i] = 0;
-            __syncthreads();
-
-            const int tIdx = blockIdx.x;
-            const int level = blockIdx.y;
-
-            const int tSize = templSizes[level];
-
-            if (tIdx < tSize)
-            {
-                const int imSize = imageSizes[level];
-
-                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
-                const float t_d12 = TemplFeatureTable::d12(level)[tIdx] + angle;
-
-                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
-                {
-                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
-                    const float im_d12 = ImageFeatureTable::d12(level)[i];
-
-                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
-                    {
-                        const float scale = im_d12 / t_d12;
-
-                        if (scale >= minScale && scale <= maxScale)
-                        {
-                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
-                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
-                        }
-                    }
-                }
-            }
-            __syncthreads();
-
-            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
-                ::atomicAdd(SHist + i, s_SHist[i]);
-        }
-
-        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
-                                         float angle, float angleEpsilon,
-                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
-                                         int levels, int tMaxSize)
-        {
-            const dim3 block(256);
-            const dim3 grid(tMaxSize, levels + 1);
-
-            angle *= (CV_PI_F / 180.0f);
-            angleEpsilon *= (CV_PI_F / 180.0f);
-
-            const size_t smemSize = (scaleRange + 1) * sizeof(float);
-
-            GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
-                                                               angle, angleEpsilon,
-                                                               minScale, maxScale, iScaleStep, scaleRange);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
-                                                const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
-                                                const float idp)
-        {
-            const int tIdx = blockIdx.x;
-            const int level = blockIdx.y;
-
-            const int tSize = templSizes[level];
-
-            if (tIdx < tSize)
-            {
-                const int imSize = imageSizes[level];
-
-                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
-
-                float2 r1 = TemplFeatureTable::r1(level)[tIdx];
-                float2 r2 = TemplFeatureTable::r2(level)[tIdx];
-
-                r1 = r1 * scale;
-                r2 = r2 * scale;
-
-                r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
-                r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
-
-                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
-                {
-                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
-
-                    const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
-                    const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
-
-                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
-                    {
-                        float2 c1, c2;
-
-                        c1 = im_p1_pos - r1;
-                        c1 = c1 * idp;
-
-                        c2 = im_p2_pos - r2;
-                        c2 = c2 * idp;
-
-                        if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
-                            continue;
-
-                        if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
-                            ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
-                    }
-                }
-            }
-        }
-
-        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
-                                         float angle, float angleEpsilon, float scale,
-                                         float dp,
-                                         int levels, int tMaxSize)
-        {
-            const dim3 block(256);
-            const dim3 grid(tMaxSize, levels + 1);
-
-            angle *= (CV_PI_F / 180.0f);
-            angleEpsilon *= (CV_PI_F / 180.0f);
-
-            const float sinVal = ::sinf(angle);
-            const float cosVal = ::cosf(angle);
-
-            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );
-
-            GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
-                                                     angle, sinVal, cosVal, angleEpsilon, scale,
-                                                     1.0f / dp);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
-                                                    const float angle, const int angleVotes, const float scale, const int scaleVotes,
-                                                    const float dp, const int threshold)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= hist.cols - 2 || y >= hist.rows - 2)
-                return;
-
-            const int curVotes = hist(y + 1, x + 1);
-
-            if (curVotes > threshold &&
-                curVotes >  hist(y + 1, x) &&
-                curVotes >= hist(y + 1, x + 2) &&
-                curVotes >  hist(y, x + 1) &&
-                curVotes >= hist(y + 2, x + 1))
-            {
-                const int ind = ::atomicAdd(&g_counter, 1);
-
-                if (ind < maxSize)
-                {
-                    out[ind] = make_float4(x * dp, y * dp, scale, angle);
-                    votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
-                }
-            }
-        }
-
-        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
-                                             float angle, int angleVotes, float scale, int scaleVotes,
-                                             float dp, int threshold)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
-
-            GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
-                                                         angle, angleVotes, scale, scaleVotes,
-                                                         dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            totalCount = ::min(totalCount, maxSize);
-
-            return totalCount;
-        }
-    }
-}}}
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
deleted file mode 100644
index 01cfae4cbd..0000000000
--- a/modules/gpu/src/cuda/imgproc.cu
+++ /dev/null
@@ -1,754 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "internal_shared.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
-
-        texture<uchar4, 2> tex_meanshift;
-
-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
-                                        size_t out_step, int cols, int rows,
-                                        int sp, int sr, int maxIter, float eps)
-        {
-            int isr2 = sr*sr;
-            uchar4 c = tex2D(tex_meanshift, x0, y0 );
-
-            // iterate meanshift procedure
-            for( int iter = 0; iter < maxIter; iter++ )
-            {
-                int count = 0;
-                int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
-                float icount;
-
-                //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-                int minx = x0-sp;
-                int miny = y0-sp;
-                int maxx = x0+sp;
-                int maxy = y0+sp;
-
-                for( int y = miny; y <= maxy; y++)
-                {
-                    int rowCount = 0;
-                    for( int x = minx; x <= maxx; x++ )
-                    {
-                        uchar4 t = tex2D( tex_meanshift, x, y );
-
-                        int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
-                        if( norm2 <= isr2 )
-                        {
-                            s0 += t.x; s1 += t.y; s2 += t.z;
-                            sx += x; rowCount++;
-                        }
-                    }
-                    count += rowCount;
-                    sy += y*rowCount;
-                }
-
-                if( count == 0 )
-                    break;
-
-                icount = 1.f/count;
-                int x1 = __float2int_rz(sx*icount);
-                int y1 = __float2int_rz(sy*icount);
-                s0 = __float2int_rz(s0*icount);
-                s1 = __float2int_rz(s1*icount);
-                s2 = __float2int_rz(s2*icount);
-
-                int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
-
-                bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
-
-                x0 = x1; y0 = y1;
-                c.x = s0; c.y = s1; c.z = s2;
-
-                if( stopFlag )
-                    break;
-            }
-
-            int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
-            *(uchar4*)(out + base) = c;
-
-            return make_short2((short)x0, (short)y0);
-        }
-
-        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
-        {
-            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
-            int y0 = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if( x0 < cols && y0 < rows )
-                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
-        }
-
-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
-                                             unsigned char* outsp, size_t outspstep,
-                                             int cols, int rows,
-                                             int sp, int sr, int maxIter, float eps)
-        {
-            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
-            int y0 = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if( x0 < cols && y0 < rows )
-            {
-                int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
-                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
-            }
-        }
-
-        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
-        {
-            dim3 grid(1, 1, 1);
-            dim3 threads(32, 8, 1);
-            grid.x = divUp(src.cols, threads.x);
-            grid.y = divUp(src.rows, threads.y);
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-
-            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
-        }
-
-        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
-        {
-            dim3 grid(1, 1, 1);
-            dim3 threads(32, 8, 1);
-            grid.x = divUp(src.cols, threads.x);
-            grid.y = divUp(src.rows, threads.y);
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-
-            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
-        }
-
-        /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
-
-        template <typename T>
-        __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
-        {
-            unsigned int H = ((ndisp-d) * 240)/ndisp;
-
-            unsigned int hi = (H/60) % 6;
-            float f = H/60.f - H/60;
-            float p = V * (1 - S);
-            float q = V * (1 - f * S);
-            float t = V * (1 - (1 - f) * S);
-
-            float3 res;
-
-            if (hi == 0) //R = V,	G = t,	B = p
-            {
-                res.x = p;
-                res.y = t;
-                res.z = V;
-            }
-
-            if (hi == 1) // R = q,	G = V,	B = p
-            {
-                res.x = p;
-                res.y = V;
-                res.z = q;
-            }
-
-            if (hi == 2) // R = p,	G = V,	B = t
-            {
-                res.x = t;
-                res.y = V;
-                res.z = p;
-            }
-
-            if (hi == 3) // R = p,	G = q,	B = V
-            {
-                res.x = V;
-                res.y = q;
-                res.z = p;
-            }
-
-            if (hi == 4) // R = t,	G = p,	B = V
-            {
-                res.x = V;
-                res.y = p;
-                res.z = t;
-            }
-
-            if (hi == 5) // R = V,	G = p,	B = q
-            {
-                res.x = q;
-                res.y = p;
-                res.z = V;
-            }
-            const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
-            const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
-            const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
-            const unsigned int a = 255U;
-
-            return (a << 24) + (r << 16) + (g << 8) + b;
-        }
-
-        __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
-        {
-            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if(x < width && y < height)
-            {
-                uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);
-
-                uint4 res;
-                res.x = cvtPixel(d4.x, ndisp);
-                res.y = cvtPixel(d4.y, ndisp);
-                res.z = cvtPixel(d4.z, ndisp);
-                res.w = cvtPixel(d4.w, ndisp);
-
-                uint4* line = (uint4*)(out_image + y * out_step);
-                line[x >> 2] = res;
-            }
-        }
-
-        __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
-        {
-            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if(x < width && y < height)
-            {
-                short2 d2 = *(short2*)(disp + y * disp_step + x);
-
-                uint2 res;
-                res.x = cvtPixel(d2.x, ndisp);
-                res.y = cvtPixel(d2.y, ndisp);
-
-                uint2* line = (uint2*)(out_image + y * out_step);
-                line[x >> 1] = res;
-            }
-        }
-
-
-        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
-        {
-            dim3 threads(16, 16, 1);
-            dim3 grid(1, 1, 1);
-            grid.x = divUp(src.cols, threads.x << 2);
-            grid.y = divUp(src.rows, threads.y);
-
-            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-            grid.x = divUp(src.cols, threads.x << 1);
-            grid.y = divUp(src.rows, threads.y);
-
-            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
-
-        __constant__ float cq[16];
-
-        template <typename T, typename D>
-        __global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y >= disp.rows || x >= disp.cols)
-                return;
-
-            const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3];
-            const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7];
-            const float qz = x * cq[ 8] + y * cq[ 9] + cq[11];
-            const float qw = x * cq[12] + y * cq[13] + cq[15];
-
-            const T d = disp(y, x);
-
-            const float iW = 1.f / (qw + cq[14] * d);
-
-            D v = VecTraits<D>::all(1.0f);
-            v.x = (qx + cq[2] * d) * iW;
-            v.y = (qy + cq[6] * d) * iW;
-            v.z = (qz + cq[10] * d) * iW;
-
-            xyz(y, x) = v;
-        }
-
-        template <typename T, typename D>
-        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
-
-            cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
-
-            reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-        template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-
-        /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
-
-        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                float a = 0.f;
-                float b = 0.f;
-                float c = 0.f;
-
-                const int ibegin = y - (block_size / 2);
-                const int jbegin = x - (block_size / 2);
-                const int iend = ibegin + block_size;
-                const int jend = jbegin + block_size;
-
-                for (int i = ibegin; i < iend; ++i)
-                {
-                    for (int j = jbegin; j < jend; ++j)
-                    {
-                        float dx = tex2D(harrisDxTex, j, i);
-                        float dy = tex2D(harrisDyTex, j, i);
-
-                        a += dx * dx;
-                        b += dx * dy;
-                        c += dy * dy;
-                    }
-                }
-
-                dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
-            }
-        }
-
-        template <typename BR, typename BC>
-        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                float a = 0.f;
-                float b = 0.f;
-                float c = 0.f;
-
-                const int ibegin = y - (block_size / 2);
-                const int jbegin = x - (block_size / 2);
-                const int iend = ibegin + block_size;
-                const int jend = jbegin + block_size;
-
-                for (int i = ibegin; i < iend; ++i)
-                {
-                    const int y = border_col.idx_row(i);
-
-                    for (int j = jbegin; j < jend; ++j)
-                    {
-                        const int x = border_row.idx_col(j);
-
-                        float dx = tex2D(harrisDxTex, x, y);
-                        float dy = tex2D(harrisDyTex, x, y);
-
-                        a += dx * dx;
-                        b += dx * dy;
-                        c += dy * dy;
-                    }
-                }
-
-                dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
-            }
-        }
-
-        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-
-            bindTexture(&harrisDxTex, Dx);
-            bindTexture(&harrisDyTex, Dy);
-
-            switch (border_type)
-            {
-            case BORDER_REFLECT101_GPU:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
-                break;
-
-            case BORDER_REFLECT_GPU:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
-                break;
-
-            case BORDER_REPLICATE_GPU:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
-                break;
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
-
-        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                float a = 0.f;
-                float b = 0.f;
-                float c = 0.f;
-
-                const int ibegin = y - (block_size / 2);
-                const int jbegin = x - (block_size / 2);
-                const int iend = ibegin + block_size;
-                const int jend = jbegin + block_size;
-
-                for (int i = ibegin; i < iend; ++i)
-                {
-                    for (int j = jbegin; j < jend; ++j)
-                    {
-                        float dx = tex2D(minEigenValDxTex, j, i);
-                        float dy = tex2D(minEigenValDyTex, j, i);
-
-                        a += dx * dx;
-                        b += dx * dy;
-                        c += dy * dy;
-                    }
-                }
-
-                a *= 0.5f;
-                c *= 0.5f;
-
-                dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
-            }
-        }
-
-
-        template <typename BR, typename BC>
-        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                float a = 0.f;
-                float b = 0.f;
-                float c = 0.f;
-
-                const int ibegin = y - (block_size / 2);
-                const int jbegin = x - (block_size / 2);
-                const int iend = ibegin + block_size;
-                const int jend = jbegin + block_size;
-
-                for (int i = ibegin; i < iend; ++i)
-                {
-                    int y = border_col.idx_row(i);
-
-                    for (int j = jbegin; j < jend; ++j)
-                    {
-                        int x = border_row.idx_col(j);
-
-                        float dx = tex2D(minEigenValDxTex, x, y);
-                        float dy = tex2D(minEigenValDyTex, x, y);
-
-                        a += dx * dx;
-                        b += dx * dy;
-                        c += dy * dy;
-                    }
-                }
-
-                a *= 0.5f;
-                c *= 0.5f;
-
-                dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
-            }
-        }
-
-        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-
-            bindTexture(&minEigenValDxTex, Dx);
-            bindTexture(&minEigenValDyTex, Dy);
-
-            switch (border_type)
-            {
-            case BORDER_REFLECT101_GPU:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
-                break;
-
-            case BORDER_REFLECT_GPU:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
-                break;
-
-            case BORDER_REPLICATE_GPU:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
-                break;
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-        //////////////////////////////////////////////////////////////////////////
-        // buildWarpMaps
-
-        // TODO use intrinsics like __sinf and so on
-
-        namespace build_warp_maps
-        {
-
-            __constant__ float ck_rinv[9];
-            __constant__ float cr_kinv[9];
-            __constant__ float ct[3];
-            __constant__ float cscale;
-        }
-
-
-        class PlaneMapper
-        {
-        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
-            {
-                using namespace build_warp_maps;
-
-                float x_ = u / cscale - ct[0];
-                float y_ = v / cscale - ct[1];
-
-                float z;
-                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
-                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
-                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
-
-                x /= z;
-                y /= z;
-            }
-        };
-
-
-        class CylindricalMapper
-        {
-        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
-            {
-                using namespace build_warp_maps;
-
-                u /= cscale;
-                float x_ = ::sinf(u);
-                float y_ = v / cscale;
-                float z_ = ::cosf(u);
-
-                float z;
-                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
-                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
-                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
-
-                if (z > 0) { x /= z; y /= z; }
-                else x = y = -1;
-            }
-        };
-
-
-        class SphericalMapper
-        {
-        public:
-            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
-            {
-                using namespace build_warp_maps;
-
-                v /= cscale;
-                u /= cscale;
-
-                float sinv = ::sinf(v);
-                float x_ = sinv * ::sinf(u);
-                float y_ = -::cosf(v);
-                float z_ = sinv * ::cosf(u);
-
-                float z;
-                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
-                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
-                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
-
-                if (z > 0) { x /= z; y /= z; }
-                else x = y = -1;
-            }
-        };
-
-
-        template <typename Mapper>
-        __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
-                                            PtrStepf map_x, PtrStepf map_y)
-        {
-            int du = blockIdx.x * blockDim.x + threadIdx.x;
-            int dv = blockIdx.y * blockDim.y + threadIdx.y;
-            if (du < cols && dv < rows)
-            {
-                float u = tl_u + du;
-                float v = tl_v + dv;
-                float x, y;
-                Mapper::mapBackward(u, v, x, y);
-                map_x.ptr(dv)[du] = x;
-                map_y.ptr(dv)[du] = y;
-            }
-        }
-
-
-        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                const float k_rinv[9], const float r_kinv[9], const float t[3],
-                                float scale, cudaStream_t stream)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
-
-            int cols = map_x.cols;
-            int rows = map_x.rows;
-
-            dim3 threads(32, 8);
-            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
-            cudaSafeCall(cudaGetLastError());
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                      const float k_rinv[9], const float r_kinv[9], float scale,
-                                      cudaStream_t stream)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
-
-            int cols = map_x.cols;
-            int rows = map_x.rows;
-
-            dim3 threads(32, 8);
-            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
-            cudaSafeCall(cudaGetLastError());
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                    const float k_rinv[9], const float r_kinv[9], float scale,
-                                    cudaStream_t stream)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
-            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
-
-            int cols = map_x.cols;
-            int rows = map_x.rows;
-
-            dim3 threads(32, 8);
-            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-            buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
-            cudaSafeCall(cudaGetLastError());
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev {
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
deleted file mode 100644
index 6670639290..0000000000
--- a/modules/gpu/src/cuda/match_template.cu
+++ /dev/null
@@ -1,916 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace match_template
-    {
-        __device__ __forceinline__ float sum(float v) { return v; }
-        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
-        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
-        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
-
-        __device__ __forceinline__ float first(float v) { return v; }
-        __device__ __forceinline__ float first(float2 v) { return v.x; }
-        __device__ __forceinline__ float first(float3 v) { return v.x; }
-        __device__ __forceinline__ float first(float4 v) { return v.x; }
-
-        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
-        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-
-        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
-        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-
-        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
-        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-
-        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
-        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-
-        //////////////////////////////////////////////////////////////////////
-        // Naive_CCORR
-
-        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
-        {
-            typedef typename TypeVec<T, cn>::vec_type Type;
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef res = VecTraits<Typef>::all(0);
-
-                for (int i = 0; i < h; ++i)
-                {
-                    const Type* image_ptr = (const Type*)image.ptr(y + i);
-                    const Type* templ_ptr = (const Type*)templ.ptr(i);
-                    for (int j = 0; j < w; ++j)
-                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
-                }
-
-                result.ptr(y)[x] = sum(res);
-            }
-        }
-
-        template <typename T, int cn>
-        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-
-        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Naive_SQDIFF
-
-        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
-        {
-            typedef typename TypeVec<T, cn>::vec_type Type;
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef res = VecTraits<Typef>::all(0);
-                Typef delta;
-
-                for (int i = 0; i < h; ++i)
-                {
-                    const Type* image_ptr = (const Type*)image.ptr(y + i);
-                    const Type* templ_ptr = (const Type*)templ.ptr(i);
-                    for (int j = 0; j < w; ++j)
-                    {
-                        delta = sub(image_ptr[x + j], templ_ptr[j]);
-                        res = res + delta * delta;
-                    }
-                }
-
-                result.ptr(y)[x] = sum(res);
-            }
-        }
-
-        template <typename T, int cn>
-        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_SQDIFF
-
-        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
-            }
-        }
-
-        template <int cn>
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
-                                             cudaStream_t stream)
-        {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
-            };
-
-            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_SQDIFF_NORMED
-
-        // normAcc* are accurate normalization routines which make GPU matchTemplate
-        // consistent with CPU one
-
-        __device__ float normAcc(float num, float denum)
-        {
-            if (::fabs(num) < denum)
-                return num / denum;
-            if (::fabs(num) < denum * 1.125f)
-                return num > 0 ? 1 : -1;
-            return 0;
-        }
-
-
-        __device__ float normAcc_SQDIFF(float num, float denum)
-        {
-            if (::fabs(num) < denum)
-                return num / denum;
-            if (::fabs(num) < denum * 1.125f)
-                return num > 0 ? 1 : -1;
-            return 1;
-        }
-
-
-        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
-                                                  sqrtf(image_sqsum_ * templ_sqsum));
-            }
-        }
-
-        template <int cn>
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
-            static const caller_t callers[] =
-            {
-                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
-            };
-
-            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_CCOFF
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_ = (float)(
-                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
-                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
-                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC2(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                unsigned int templ_sum_r, unsigned int templ_sum_g,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
-                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
-                    image_sum_r, image_sum_g, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
-                int w, int h,
-                float templ_sum_scale_r,
-                float templ_sum_scale_g,
-                float templ_sum_scale_b,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g
-                                         - image_sum_b_ * templ_sum_scale_b;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC3(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
-                    w, h,
-                    (float)templ_sum_r / (w * h),
-                    (float)templ_sum_g / (w * h),
-                    (float)templ_sum_b / (w * h),
-                    image_sum_r, image_sum_g, image_sum_b, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
-                int w, int h,
-                float templ_sum_scale_r,
-                float templ_sum_scale_g,
-                float templ_sum_scale_b,
-                float templ_sum_scale_a,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
-                const PtrStep<unsigned int> image_sum_a,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sum_a_ = (float)(
-                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
-                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g
-                                         - image_sum_b_ * templ_sum_scale_b
-                                         - image_sum_a_ * templ_sum_scale_a;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC4(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                const PtrStepSz<unsigned int> image_sum_a,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                unsigned int templ_sum_a,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
-                    w, h,
-                    (float)templ_sum_r / (w * h),
-                    (float)templ_sum_g / (w * h),
-                    (float)templ_sum_b / (w * h),
-                    (float)templ_sum_a / (w * h),
-                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_CCOFF_NORMED
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
-                int w, int h, float weight,
-                float templ_sum_scale, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum,
-                const PtrStep<unsigned long long> image_sqsum,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float ccorr = result.ptr(y)[x];
-                float image_sum_ = (float)(
-                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
-                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
-                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
-                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
-                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                    int w, int h, const PtrStepSz<unsigned int> image_sum,
-                    const PtrStepSz<unsigned long long> image_sqsum,
-                    unsigned int templ_sum, unsigned long long templ_sqsum,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale = templ_sum * weight;
-            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
-                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
-                    image_sum, image_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g,
-                float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
-                                             - image_sum_g_ * templ_sum_scale_g;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
-                float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sqsum_b_ = (float)(
-                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
-                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
-                                             - image_sum_g_ * templ_sum_scale_g
-                                             - image_sum_b_ * templ_sum_scale_b;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sum_scale_b = templ_sum_b * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
-                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    image_sum_b, image_sqsum_b,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
-                float templ_sum_scale_a, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sqsum_b_ = (float)(
-                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
-                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
-                float image_sum_a_ = (float)(
-                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
-                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
-                float image_sqsum_a_ = (float)(
-                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
-                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
-                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
-                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sum_scale_b = templ_sum_b * weight;
-            float templ_sum_scale_a = templ_sum_a * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
-                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
-                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    image_sum_b, image_sqsum_b,
-                    image_sum_a, image_sqsum_a,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // normalize
-
-        template <int cn>
-        __global__ void normalizeKernel_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
-            }
-        }
-
-        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            switch (cn)
-            {
-            case 1:
-                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 2:
-                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 3:
-                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 4:
-                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // extractFirstChannel
-
-        template <int cn>
-        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
-        {
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef val = ((const Typef*)image.ptr(y))[x];
-                result.ptr(y)[x] = first(val);
-            }
-        }
-
-        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            switch (cn)
-            {
-            case 1:
-                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 2:
-                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 3:
-                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 4:
-                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            }
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    } //namespace match_template
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/nlm.cu b/modules/gpu/src/cuda/nlm.cu
deleted file mode 100644
index 92bfccf37c..0000000000
--- a/modules/gpu/src/cuda/nlm.cu
+++ /dev/null
@@ -1,569 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denoising
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        __device__ __forceinline__ float norm2(const float& v) { return v*v; }  // norm2: sum of squared components, overloaded for 1- to 4-component float values
-        __device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
-        __device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
-        __device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z  + v.w*v.w; }
-
-        template<typename T, typename B>  // T: pixel type of src/dst; B: border helper used through b.at(row, col, src)
-        __global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)  // one thread per dst pixel: weighted average of the pixels in the search window, weights from patch distance and spatial offset
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;  // float vector with the same channel count as T
-
-            const int i = blockDim.y * blockIdx.y + threadIdx.y;  // row
-            const int j = blockDim.x * blockIdx.x + threadIdx.x;  // column
-
-            if (j >= dst.cols || i >= dst.rows)
-                return;
-
-            int bsize = search_radius + block_radius;  // farthest offset from (i, j) that any read below can reach
-            int search_window = 2 * search_radius + 1;
-            float minus_search_window2_inv = -1.f/(search_window * search_window);
-
-            value_type sum1 = VecTraits<value_type>::all(0);  // weighted sum of pixel values
-            float sum2 = 0.f;  // sum of weights
-
-            if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)  // interior pixel: every read stays inside the image, so src is indexed directly
-            {
-                for(float y = -search_radius; y <= search_radius; ++y)  // loop counters are floats; they are added to the int coordinates when indexing src
-                    for(float x = -search_radius; x <= search_radius; ++x)
-                    {
-                        float dist2 = 0;
-                        for(float ty = -block_radius; ty <= block_radius; ++ty)
-                            for(float tx = -block_radius; tx <= block_radius; ++tx)
-                            {
-                                value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));  // pixel of the candidate patch
-                                value_type av = saturate_cast<value_type>(src(i +     ty, j +     tx));  // pixel of the patch centred on (i, j)
-
-                                dist2 += norm2(av - bv);
-                            }
-
-                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);  // both factors are negative multipliers, so the weight falls with patch distance and with spatial offset
-
-                        /*if (i == 255 && j == 255)
-                            printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/  // NOTE(review): disabled debug print; minus_h2_inv is not declared in this kernel
-
-                        sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
-                        sum2 += w;
-                    }
-            }
-            else  // near the image edge: same computation, but every read goes through the border helper b
-            {
-                for(float y = -search_radius; y <= search_radius; ++y)
-                    for(float x = -search_radius; x <= search_radius; ++x)
-                    {
-                        float dist2 = 0;
-                        for(float ty = -block_radius; ty <= block_radius; ++ty)
-                            for(float tx = -block_radius; tx <= block_radius; ++tx)
-                            {
-                                value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
-                                value_type av = saturate_cast<value_type>(b.at(i +     ty, j +     tx, src));
-                                dist2 += norm2(av - bv);
-                            }
-
-                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
-
-                        sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
-                        sum2 += w;
-                    }
-
-            }
-
-            dst(i, j) = saturate_cast<T>(sum1 / sum2);  // normalise by the total weight and convert back to T
-
-        }
-
-        template<typename T, template <typename> class B>  // T: pixel type; B: border-helper template, instantiated as B<T>
-        void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)  // host launcher for nlm_kernel: derives the weight multiplier from h and the block size, then launches one thread per pixel on `stream`
-        {
-            dim3 block (32, 8);
-            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
-
-            B<T> b(src.rows, src.cols);
-
-            int block_window = 2 * block_radius + 1;
-            float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
-            float noise_mult = minus_h2_inv/(block_window * block_window);  // -1 / (h^2 * channels * pixels per block): turns a summed squared patch distance into the exponent used by the kernel
-
-            cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
-            nlm_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);  // launch on the caller's stream so the work is ordered with that stream, matching the stream == 0 check below
-            cudaSafeCall ( cudaGetLastError () );
-
-            if (stream == 0)  // only the default stream is synchronized; other streams stay asynchronous
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template<typename T>
-        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)  // entry point: dispatches to the nlm_caller instantiation for the requested border mode
-        {
-            typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
-
-            static func_t funcs[] =  // indexed by borderMode; presumably the order matches the numeric values of the caller's border-mode constants - confirm
-            {
-                nlm_caller<T, BrdReflect101>,
-                nlm_caller<T, BrdReplicate>,
-                nlm_caller<T, BrdConstant>,
-                nlm_caller<T, BrdReflect>,
-                nlm_caller<T, BrdWrap>,
-            };
-            funcs[borderMode](src, dst, search_radius, block_radius, h, stream);  // NOTE(review): borderMode is not range-checked here; it must be in 0..4
-        }
-
-        template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);  // explicit instantiations: 1-, 2- and 3-channel 8-bit pixels
-        template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
-        template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
-    }
-}}}
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denoising (fast approximate version)
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-
-        template <int cn> struct Unroll;  // per-channel-count helpers that build the (cn + 1)-element tuples passed to reduce(): one scalar float plus the cn components of a float vector
-        template <> struct Unroll<1>
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)  // splits smem into consecutive BLOCK_SIZE-float slices, one per reduced value
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)  // references to the values being reduced
-            {
-                return thrust::tie(val1, val2);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()  // one addition functor per reduced value
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op);
-            }
-        };
-        template <> struct Unroll<2>  // same helpers for a float2 second value (3 reduced floats)
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op);
-            }
-        };
-        template <> struct Unroll<3>  // same helpers for a float3 second value (4 reduced floats)
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y, val2.z);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op, op);
-            }
-        };
-        template <> struct Unroll<4>  // same helpers for a float4 second value (5 reduced floats)
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op, op, op);
-            }
-        };
-
-        __device__ __forceinline__ int calcDist(const uchar&  a, const uchar&  b) { return (a-b)*(a-b); }  // calcDist: integer sum of squared per-channel differences, overloaded for 1 to 3 uchar channels
-        __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
-        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
-
-        template <class T> struct FastNonLocalMenas  // device functor for the fast variant: each thread block walks one tile pixel by pixel and keeps running sums of patch distances instead of recomputing them
-        {
-            enum
-            {
-                CTA_SIZE = 128,  // threads per block (the launcher uses block(CTA_SIZE, 1))
-
-                TILE_COLS = 128,  // image tile handled by one block
-                TILE_ROWS = 32,
-
-                STRIDE = CTA_SIZE  // step of the per-thread loops over the search_window^2 offsets
-            };
-
-            struct plus  // NOTE(review): not referenced anywhere else inside this struct
-            {
-                __device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
-            };
-
-            int search_radius;
-            int block_radius;
-
-            int search_window;
-            int block_window;
-            float minus_h2_inv;
-
-            FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),  // radii are the window sizes halved with integer division
-                search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
-
-            PtrStep<T> src;  // set by the host launcher after construction
-            mutable PtrStepi buffer;  // scratch storage for col_sums / up_col_sums; see operator() for how it is sliced
-
-            __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const  // full recomputation at (i, j): for every search offset, fills the per-column sums and the total patch distance
-            {
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)  // index enumerates search offsets, split across the block's threads
-                {
-                    dist_sums[index] = 0;
-
-                    for(int tx = 0; tx < block_window; ++tx)
-                        col_sums(tx, index) = 0;
-
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int ay = i;  // (ay, ax): centre of the reference patch
-                    int ax = j;
-
-                    int by = i + y - search_radius;  // (by, bx): centre of the candidate patch for this offset
-                    int bx = j + x - search_radius;
-
-#if 1
-                    for (int tx = -block_radius; tx <= block_radius; ++tx)
-                    {
-                        int col_sum = 0;
-                        for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        {
-                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));  // NOTE(review): no bounds handling here; presumably src is a view with enough padding around it - confirm against the caller
-
-                            dist_sums[index] += dist;
-                            col_sum += dist;
-                        }
-                        col_sums(tx + block_radius, index) = col_sum;
-                    }
-#else
-                    for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        for (int tx = -block_radius; tx <= block_radius; ++tx)
-                        {
-                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
-
-                            dist_sums[index] += dist;
-                            col_sums(tx + block_radius, index) += dist;
-                        }
-#endif
-
-                    up_col_sums(j, index) = col_sums(block_window - 1, index);  // remember the right-most column sum for the row below
-                }
-            }
-
-            __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const  // slide one pixel right on the tile's first row: the entering column is summed from scratch and replaces slot `first`
-            {
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int ay = i;
-                    int ax = j + block_radius;  // right-most column of the patch centred on (i, j)
-
-                    int by = i + y - search_radius;
-                    int bx = j + x - search_radius + block_radius;
-
-                    int col_sum = 0;
-
-                    for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
-
-                    dist_sums[index] += col_sum - col_sums(first, index);  // add the entering column, drop the one stored in slot `first`
-
-                    col_sums(first, index) = col_sum;
-                    up_col_sums(j, index) = col_sum;
-                }
-            }
-
-            __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const  // slide one pixel right on later rows: the entering column sum is derived from the value stored for the row above
-            {
-                int ay = i;
-                int ax = j + block_radius;
-
-                T a_up   = src(ay - block_radius - 1, ax);  // pixel just above the column's current vertical span
-                T a_down = src(ay + block_radius, ax);  // bottom pixel of the column's current vertical span
-
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int by = i + y - search_radius;
-                    int bx = j + x - search_radius + block_radius;
-
-                    T b_up   = src(by - block_radius - 1, bx);
-                    T b_down = src(by + block_radius, bx);
-
-                    int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);  // stored value for this column, plus the bottom pixel pair, minus the pair just above the span
-
-                    dist_sums[index] += col_sum  - col_sums(first, index);
-                    col_sums(first, index) = col_sum;
-                    up_col_sums(j, index) = col_sum;
-                }
-            }
-
-            __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const  // turns the patch distances into weights and writes the weighted average of the search window to dst
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
-
-                float weights_sum = 0;
-                sum_type sum = VecTraits<sum_type>::all(0);
-
-                float bw2_inv = 1.f/(block_window * block_window);
-
-                int sx = j - search_radius;  // top-left corner of the search window
-                int sy = i - search_radius;
-
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)  // each thread accumulates a partial sum over its share of the offsets
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    float avg_dist = dist_sums[index] * bw2_inv;
-                    float weight = __expf(avg_dist * minus_h2_inv);
-                    weights_sum += weight;
-
-                    sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
-                }
-
-                __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];  // one CTA_SIZE-float slice per reduced value: the weight total plus cn channel sums
-
-                reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
-                                 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
-                                 threadIdx.x,
-                                 Unroll<VecTraits<T>::cn>::op());
-
-                if (threadIdx.x == 0)  // thread 0 writes the output pixel from the reduced totals
-                    dst = saturate_cast<T>(sum / weights_sum);
-            }
-
-            __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const  // processes this block's tile pixel by pixel in row-major order
-            {
-                int tbx = blockIdx.x * TILE_COLS;  // tile begin
-                int tby = blockIdx.y * TILE_ROWS;
-
-                int tex = ::min(tbx + TILE_COLS, dst.cols);  // tile end, clipped to the image
-                int tey = ::min(tby + TILE_ROWS, dst.rows);
-
-                PtrStepi col_sums;  // block_window buffer rows owned by this blockIdx.x, starting after the first dst.cols rows; columns offset by blockIdx.y
-                col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
-                col_sums.step = buffer.step;
-
-                PtrStepi up_col_sums;  // buffer rows indexed by image column j, with the same per-blockIdx.y column offset
-                up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
-                up_col_sums.step = buffer.step;
-
-                extern __shared__ int dist_sums[]; //search_window * search_window
-
-                int first = 0;  // ring index of the col_sums slot replaced by the next shift
-
-                for (int i = tby; i < tey; ++i)
-                    for (int j = tbx; j < tex; ++j)
-                    {
-                        __syncthreads();  // barrier before the sums are updated for this pixel
-
-                        if (j == tbx)  // first pixel of a tile row: recompute everything
-                        {
-                            initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
-                            first = 0;
-                        }
-                        else
-                        {
-                            if (i == tby)
-                              shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
-                            else
-                              shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
-
-                            first = (first + 1) % block_window;
-                        }
-
-                        __syncthreads();  // barrier before convolve_window reads dist_sums
-
-                        convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
-                    }
-            }
-
-        };
-
-        template<typename T>
-        __global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }  // kernel entry point: every block runs the functor, which processes the tile selected by blockIdx
-
-        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)  // reports, through the two out-parameters, the size of the int scratch buffer needed for the given image and window sizes
-        {
-            typedef FastNonLocalMenas<uchar> FNLM;  // used only for the tile constants
-            dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));  // same grid shape as the launcher nlm_fast_gpu
-
-            buffer_cols = search_window * search_window * grid.y;  // one search_window^2 slice per grid row
-            buffer_rows = src.cols + block_window * grid.x;  // src.cols rows for up_col_sums plus block_window rows of col_sums per grid column
-        }
-
-        template<typename T>
-        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,  // host launcher for the fast variant: one block of CTA_SIZE threads per TILE_COLS x TILE_ROWS tile, run on `stream`
-                          int search_window, int block_window, float h, cudaStream_t stream)
-        {
-            typedef FastNonLocalMenas<T> FNLM;
-            FNLM fnlm(search_window, block_window, h);
-
-            fnlm.src = (PtrStepSz<T>)src;
-            fnlm.buffer = buffer;  // scratch storage sized as reported by nln_fast_get_buffer_size
-
-            dim3 block(FNLM::CTA_SIZE, 1);
-            dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
-            int smem = search_window * search_window * sizeof(int);  // dynamic shared memory for the kernel's dist_sums array
-
-
-            fast_nlm_kernel<<<grid, block, smem, stream>>>(fnlm, (PtrStepSz<T>)dst);  // launch on the caller's stream so the work is ordered with that stream, matching the stream == 0 check below
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)  // only the default stream is synchronized; other streams stay asynchronous
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float,  cudaStream_t);
-        template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-        template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-
-
-
-        __global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)  // one thread per pixel: splits a 3-channel pixel into its first channel (l) and its last two channels (ab)
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x < lab.cols && y < lab.rows)  // threads outside the image do nothing
-            {
-                uchar3 p = lab(y, x);
-                ab(y,x) = make_uchar2(p.y, p.z);
-                l(y,x) = p.x;
-            }
-        }
-
-        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)  // host launcher for fnlm_split_kernel: one thread per pixel of `lab`, run on `stream`
-        {
-            dim3 b(32, 8);
-            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
-
-            fnlm_split_kernel<<<g, b, 0, stream>>>(lab, l, ab);  // launch on the caller's stream so the work is ordered with that stream, matching the stream == 0 check below
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)  // only the default stream is synchronized; other streams stay asynchronous
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)  // one thread per pixel: recombines the single-channel l and two-channel ab planes into a 3-channel pixel
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x < lab.cols && y < lab.rows)  // threads outside the image do nothing
-            {
-                uchar2 p = ab(y, x);
-                lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
-            }
-        }
-
-        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)  // host launcher for fnlm_merge_kernel: one thread per pixel of `lab`, run on `stream`
-        {
-            dim3 b(32, 8);
-            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
-
-            fnlm_merge_kernel<<<g, b, 0, stream>>>(l, ab, lab);  // launch on the caller's stream so the work is ordered with that stream, matching the stream == 0 check below
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)  // only the default stream is synchronized; other streams stay asynchronous
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-}}}
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/pyr_down.cu b/modules/gpu/src/cuda/pyr_down.cu
deleted file mode 100644
index 904f549bad..0000000000
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ /dev/null
@@ -1,228 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)  // one output row per blockIdx.y: a vertical 5-tap pass into shared memory, then a horizontal 5-tap pass that keeps every second column
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;  // float accumulator with the same channel count as T
-
-            __shared__ work_t smem[256 + 4];  // one vertically filtered value per thread plus 2 extra columns on each side; the code below assumes blockDim.x == 256
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;  // source column
-            const int y = blockIdx.y;  // destination row
-
-            const int src_y = 2 * y;  // source row at the centre of the vertical taps
-
-            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)  // interior: all taps, including the x - 2 / x + 2 columns, are inside the image
-            {
-                {
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, x);  // taps 1/16, 4/16, 6/16, 4/16, 1/16
-                    sum = sum + 0.25f   * src(src_y - 1, x);
-                    sum = sum + 0.375f  * src(src_y    , x);
-                    sum = sum + 0.25f   * src(src_y + 1, x);
-                    sum = sum + 0.0625f * src(src_y + 2, x);
-
-                    smem[2 + threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x < 2)  // the first two threads also fill the two extra columns on the left
-                {
-                    const int left_x = x - 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, left_x);
-                    sum = sum + 0.25f   * src(src_y - 1, left_x);
-                    sum = sum + 0.375f  * src(src_y    , left_x);
-                    sum = sum + 0.25f   * src(src_y + 1, left_x);
-                    sum = sum + 0.0625f * src(src_y + 2, left_x);
-
-                    smem[threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x > 253)  // threads 254 and 255 also fill the two extra columns on the right
-                {
-                    const int right_x = x + 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, right_x);
-                    sum = sum + 0.25f   * src(src_y - 1, right_x);
-                    sum = sum + 0.375f  * src(src_y    , right_x);
-                    sum = sum + 0.25f   * src(src_y + 1, right_x);
-                    sum = sum + 0.0625f * src(src_y + 2, right_x);
-
-                    smem[4 + threadIdx.x] = sum;
-                }
-            }
-            else  // near an edge: the same three loads, with row/column indices remapped by the border helper b
-            {
-                {
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));  // rows above use idx_row_low, rows below use idx_row_high; the centre row is read directly
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
-
-                    smem[2 + threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x < 2)
-                {
-                    const int left_x = x - 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));  // the left columns use idx_col rather than idx_col_high
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
-
-                    smem[threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x > 253)
-                {
-                    const int right_x = x + 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
-
-                    smem[4 + threadIdx.x] = sum;
-                }
-            }
-
-            __syncthreads();  // every smem entry must be written before the horizontal pass reads its neighbours
-
-            if (threadIdx.x < 128)  // the first 128 threads each produce one output pixel, centred on every second smem column
-            {
-                const int tid2 = threadIdx.x * 2;
-
-                work_t sum;
-
-                sum =       0.0625f * smem[2 + tid2 - 2];
-                sum = sum + 0.25f   * smem[2 + tid2 - 1];
-                sum = sum + 0.375f  * smem[2 + tid2    ];
-                sum = sum + 0.25f   * smem[2 + tid2 + 1];
-                sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;  // source column halved
-
-                if (dst_x < dst_cols)
-                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
-            }
-        }
-
-        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)  // host launcher for the pyrDown kernel with border helper B<T>, run on `stream`
-        {
-            const dim3 block(256);  // the kernel's shared array and its threadIdx.x > 253 / < 128 tests are written for this block width
-            const dim3 grid(divUp(src.cols, block.x), dst.rows);  // x covers source columns, y is one block row per destination row
-
-            B<T> b(src.rows, src.cols);
-
-            pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)  // only the default stream is synchronized; other streams stay asynchronous
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)  // type-erased entry point: casts the byte views to PtrStepSz<T> and always uses BrdReflect101 as the border helper
-        {
-            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);  // explicit instantiations; the commented-out lines below list pixel types that are not instantiated
-        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/pyr_up.cu b/modules/gpu/src/cuda/pyr_up.cu
deleted file mode 100644
index 36a72274cf..0000000000
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ /dev/null
@@ -1,196 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            __shared__ sum_t s_srcPatch[10][10];
-            __shared__ sum_t s_dstPatch[20][16];
-
-            if (threadIdx.x < 10 && threadIdx.y < 10)
-            {
-                int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
-                int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
-
-                srcx = ::abs(srcx);
-                srcx = ::min(src.cols - 1, srcx);
-
-                srcy = ::abs(srcy);
-                srcy = ::min(src.rows - 1, srcy);
-
-                s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
-            }
-
-            __syncthreads();
-
-            sum_t sum = VecTraits<sum_t>::all(0);
-
-            const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
-            const int oddFlag  = static_cast<int>((threadIdx.x & 1) != 0);
-            const bool eveny = ((threadIdx.y & 1) == 0);
-            const int tidx = threadIdx.x;
-
-            if (eveny)
-            {
-                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
-                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
-                sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx    ) >> 1)];
-                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
-                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
-            }
-
-            s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
-
-            if (threadIdx.y < 2)
-            {
-                sum = VecTraits<sum_t>::all(0);
-
-                if (eveny)
-                {
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-                }
-
-                s_dstPatch[threadIdx.y][threadIdx.x] = sum;
-            }
-
-            if (threadIdx.y > 13)
-            {
-                sum = VecTraits<sum_t>::all(0);
-
-                if (eveny)
-                {
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
-                }
-
-                s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
-            }
-
-            __syncthreads();
-
-            sum = VecTraits<sum_t>::all(0);
-
-            const int tidy = threadIdx.y;
-
-            sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
-            sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][threadIdx.x];
-            sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][threadIdx.x];
-            sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][threadIdx.x];
-            sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
-
-            if (x < dst.cols && y < dst.rows)
-                dst(y, x) = saturate_cast<T>(4.0f * sum);
-        }
-
-        template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
-        {
-            const dim3 block(16, 16);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            pyrUp<<<grid, block, 0, stream>>>(src, dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
-        {
-            pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/remap.cu b/modules/gpu/src/cuda/remap.cu
deleted file mode 100644
index dd2c669159..0000000000
--- a/modules/gpu/src/cuda/remap.cu
+++ /dev/null
@@ -1,274 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = mapx.ptr(y)[x];
-                const float ycoo = mapy.ptr(y)[x];
-
-                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-            }
-        }
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
-            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_remap_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float* borderValue, bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float*, bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
-                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
-            {
-                if (stream == 0)
-                    RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
-                else
-                    RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
-            }
-        };
-
-        template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-            PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
-
-            static const caller_t callers[3][5] =
-            {
-                {
-                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
-                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
-                    RemapDispatcher<PointFilter, BrdWrap, T>::call
-                },
-                {
-                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
-                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
-                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
-                },
-                {
-                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
-                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
-                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
-                }
-            };
-
-            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
-                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
-        }
-
-        template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
deleted file mode 100644
index 04c1fb2ac4..0000000000
--- a/modules/gpu/src/cuda/resize.cu
+++ /dev/null
@@ -1,302 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <cfloat>
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = x * fx;
-                const float ycoo = y * fy;
-
-                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
-            }
-        }
-
-        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                dst(y, x) = saturate_cast<T>(src(y, x));
-            }
-        }
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
-
-                resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
-
-                resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
-            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_resize_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                __host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_resize_ ## type, srcWhole); \
-                    tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate< type > brd(src.rows, src.cols); \
-                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
-                        Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                if (stream == 0)
-                    ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
-                else
-                    ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
-            }
-        };
-
-        template <typename T> struct ResizeDispatcher<AreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-                int iscale_x = (int)round(fx);
-                int iscale_y = (int)round(fy);
-
-                if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
-                    ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
-                else
-                    ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
-            }
-        };
-
-        template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
-            PtrStepSzb dst, int interpolation, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
-
-            static const caller_t callers[4] =
-            {
-                ResizeDispatcher<PointFilter, T>::call,
-                ResizeDispatcher<LinearFilter, T>::call,
-                ResizeDispatcher<CubicFilter, T>::call,
-                ResizeDispatcher<AreaFilter, T>::call
-            };
-            // chenge to linear if area interpolation upscaling
-            if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
-                interpolation = 1;
-
-            callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
-                static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        //template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        //template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template<typename T> struct scan_traits{};
-
-        template<> struct scan_traits<uchar>
-        {
-            typedef float scan_line_type;
-        };
-
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/warp.cu b/modules/gpu/src/cuda/warp.cu
deleted file mode 100644
index 8c5a067d36..0000000000
--- a/modules/gpu/src/cuda/warp.cu
+++ /dev/null
@@ -1,389 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        __constant__ float c_warpMat[3 * 3];
-
-        struct AffineTransform
-        {
-            static __device__ __forceinline__ float2 calcCoord(int x, int y)
-            {
-                const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
-                const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
-
-                return make_float2(xcoo, ycoo);
-            }
-        };
-
-        struct PerspectiveTransform
-        {
-            static __device__ __forceinline__ float2 calcCoord(int x, int y)
-            {
-                const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
-
-                const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
-                const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
-
-                return make_float2(xcoo, ycoo);
-            }
-        };
-
-        ///////////////////////////////////////////////////////////////////
-        // Build Maps
-
-        template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < xmap.cols && y < xmap.rows)
-            {
-                const float2 coord = Transform::calcCoord(x, y);
-
-                xmap(y, x) = coord.x;
-                ymap(y, x) = coord.y;
-            }
-        }
-
-        template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
-
-            buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
-
-            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
-        }
-
-        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
-
-            buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
-        }
-
-        ///////////////////////////////////////////////////////////////////
-        // Warp
-
-        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float2 coord = Transform::calcCoord(x, y);
-
-                dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
-            }
-        }
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
-            {
-                (void)xoff;
-                (void)yoff;
-                (void)srcWhole;
-
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                warp<Transform><<<grid, block>>>(filter_src, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
-            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_warp_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_WARP_TEX
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
-            {
-                if (stream == 0)
-                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
-                else
-                    WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
-            }
-        };
-
-        template <class Transform, typename T>
-        void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
-                         int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
-
-            static const func_t funcs[3][5] =
-            {
-                {
-                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
-                },
-                {
-                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
-                },
-                {
-                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
-                }
-            };
-
-            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
-                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
-        }
-
-        template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
-
-            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
-        }
-
-        template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
-
-            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
-        }
-
-        template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cvt_color_internal.h b/modules/gpu/src/cvt_color_internal.h
deleted file mode 100644
index 010d832a25..0000000000
--- a/modules/gpu/src/cvt_color_internal.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __cvt_color_internal_h__
-#define __cvt_color_internal_h__
-
-namespace cv { namespace gpu { namespace cudev
-{
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
-    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)        \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
-
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
-}}}
-
-#endif
diff --git a/modules/gpu/src/denoising.cpp b/modules/gpu/src/denoising.cpp
deleted file mode 100644
index 1687f8e3cc..0000000000
--- a/modules/gpu/src/denoising.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
-
-
-#else
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing (brute force)
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template<typename T>
-        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
-
-        template<typename T>
-        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
-{
-    using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
-
-    typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
-
-    static const func_t funcs[6][4] =
-    {
-        {bilateral_filter_gpu<uchar>      , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>      , bilateral_filter_gpu<uchar4>      },
-        {0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
-        {bilateral_filter_gpu<ushort>     , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3>     , bilateral_filter_gpu<ushort4>     },
-        {bilateral_filter_gpu<short>      , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3>      , bilateral_filter_gpu<short4>      },
-        {0 /*bilateral_filter_gpu<int>*/  , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/  , 0 /*bilateral_filter_gpu<int4>*/  },
-        {bilateral_filter_gpu<float>      , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>      , bilateral_filter_gpu<float4>      }
-    };
-
-    sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
-    sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
-
-
-    int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
-    kernel_size = std::max(radius, 1)*2 + 1;
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    const func_t func = funcs[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(src.size(), src.type());
-    func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
-{
-    using cv::gpu::cudev::imgproc::nlm_bruteforce_gpu;
-    typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
-
-    static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
-
-    CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
-
-    const func_t func = funcs[src.channels() - 1];
-    CV_Assert(func != 0);
-
-    int b = borderMode;
-    CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(src.size(), src.type());
-    func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing (fast approxinate)
-
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
-
-        template<typename T>
-        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
-                          int search_window, int block_window, float h, cudaStream_t stream);
-
-        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
-        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
-     }
-}}}
-
-void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
-{
-    CV_Assert(src.depth() == CV_8U && src.channels() < 4);
-
-    int border_size = search_window/2 + block_window/2;
-    Size esize = src.size() + Size(border_size, border_size) * 2;
-
-    cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
-    GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
-
-    cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
-    GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
-
-    int bcols, brows;
-    cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
-    buffer.create(brows, bcols, CV_32S);
-
-    using namespace cv::gpu::cudev::imgproc;
-    typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-    static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
-
-    dst.create(src.size(), src.type());
-    funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC3);
-
-    lab.create(src.size(), src.type());
-    cv::gpu::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
-
-    l.create(src.size(), CV_8U);
-    ab.create(src.size(), CV_8UC2);
-    cudev::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
-
-    simpleMethod(l, l, h_luminance, search_window, block_window, s);
-    simpleMethod(ab, ab, h_color, search_window, block_window, s);
-
-    cudev::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
-    cv::gpu::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
-}
-
-
-#endif
-
-
diff --git a/modules/gpu/src/disparity_bilateral_filter.cpp b/modules/gpu/src/disparity_bilateral_filter.cpp
new file mode 100644
index 0000000000..ef5be018da
--- /dev/null
+++ b/modules/gpu/src/disparity_bilateral_filter.cpp
@@ -0,0 +1,157 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Stubs for the branch selected by the #if above: each entry point ignores its arguments and calls throw_no_cuda().
+cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
+cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
+
+void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+
+#else /* !defined (HAVE_CUDA) */
+// Declarations only: the bodies of these two functions are not part of this file.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace disp_bilateral_filter
+    {
+        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
+        // Called further down in this file with T = unsigned char and T = short.
+        template<typename T>
+        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
+    }
+}}}
+
+using namespace ::cv::gpu::cudev::disp_bilateral_filter;
+
+namespace
+{
+    const float DEFAULT_EDGE_THRESHOLD = 0.1f;      // edge_threshold chosen by the three-argument constructor
+    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;  // max_disc_threshold chosen by the three-argument constructor
+    const float DEFAULT_SIGMA_RANGE = 10.0f;        // sigma_range chosen by the three-argument constructor
+
+    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
+    {
+        // Host-side 1 x len row of CV_32F; entry k holds exp(-k^2 / (2 * sigma_range^2)).
+        Mat host_weights(1, len, CV_32F);
+        float* const weights = host_weights.ptr<float>();
+
+        for (int k = 0; k < len; ++k)
+            weights[k] = static_cast<float>(std::exp(-double(k * k) / (2 * sigma_range * sigma_range)));
+        // Copy the finished row into the caller's GpuMat.
+        table_color.upload(host_weights);
+    }
+
+    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
+    {
+        // The table covers offsets 0..(win_size >> 1) along each axis, hence side x side entries.
+        const int side = (win_size >> 1) + 1;
+        Mat host_weights(side, side, CV_32F);
+
+        for (int dy = 0; dy < side; ++dy)
+        {
+            // Entry (dy, dx) = exp(-sqrt(dy^2 + dx^2) / dist_space).
+            float* const weights = host_weights.ptr<float>(dy);
+            for (int dx = 0; dx < side; ++dx)
+                weights[dx] = exp(-sqrt(float(dy * dy) + float(dx * dx)) / dist_space);
+        }
+        table_space.upload(host_weights);
+    }
+    // Worker for one disparity element type T: converts the two thresholds to shorts, loads the constants, then filters dst.
+    template <typename T>
+    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
+                                   GpuMat& table_color, GpuMat& table_space,
+                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
+    {
+        short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5)); // never below 1
+        short max_disc = short(ndisp * max_disc_threshold + 0.5);
+        // Both thresholds are multiplied by ndisp; adding 0.5 before the conversion to short rounds positive values to nearest.
+        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
+        // Only dst is passed to the filter below, so disp is copied into it first unless both refer to the same object.
+        if (&dst != &disp)
+        {
+            if (stream)
+                stream.enqueueCopy(disp, dst);
+            else
+                disp.copyTo(dst);
+        }
+        // The filter receives dst, img, img's channel count, iters and the stream obtained from StreamAccessor.
+        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
+    }
+    // Signature shared by every instantiation of disp_bilateral_filter_operator.
+    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                                GpuMat& table_color, GpuMat& table_space,
+                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
+    // Indexed by disp.type() in operator(): slot 0 serves unsigned char, slot 3 serves short, the other slots are null.
+    const bilateral_filter_operator_t operators[] =
+        {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
+}
+// Three-argument form: thresholds and sigma_range take the DEFAULT_* values; both weight tables are built immediately.
+cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
+    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
+      sigma_range(DEFAULT_SIGMA_RANGE)
+{
+    calc_color_weighted_table(table_color, sigma_range, 255); // NOTE(review): 255 entries = indices 0..254; confirm index 255 is never read
+    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f); // win_size = 2 * radius + 1, dist_space = radius + 1
+}
+// Six-argument form: stores the caller's thresholds and sigma_range, then builds both weight tables immediately.
+cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
+                                                     float max_disc_threshold_, float sigma_range_)
+    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
+      sigma_range(sigma_range_)
+{
+    calc_color_weighted_table(table_color, sigma_range, 255); // NOTE(review): 255 entries = indices 0..254; confirm index 255 is never read
+    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f); // win_size = 2 * radius + 1, dist_space = radius + 1
+}
+// Validates the inputs, then runs the operators[] entry chosen by disp.type() with the stored settings and tables.
+void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
+{
+    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters); // NOTE(review): presumably active in debug builds only — confirm
+    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3)); // same size; disp is CV_8U or CV_16S; img is CV_8UC1 or CV_8UC3
+    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream); // the CV_Assert above restricts disp.type() to the two non-null slots
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/gftt.cpp b/modules/gpu/src/gftt.cpp
deleted file mode 100644
index 18a729bc17..0000000000
--- a/modules/gpu/src/gftt.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace gfft
-    {
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
-    }
-}}}
-
-void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask)
-{
-    using namespace cv::gpu::cudev::gfft;
-
-    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
-
-    ensureSizeIsEnough(image.size(), CV_32F, eig_);
-
-    if (useHarrisDetector)
-        cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK);
-    else
-        cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3);
-
-    double maxVal = 0;
-    minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_);
-
-    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
-
-    int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
-
-    if (total == 0)
-    {
-        corners.release();
-        return;
-    }
-
-    sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
-
-    if (minDistance < 1)
-        tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners);
-    else
-    {
-        std::vector<Point2f> tmp(total);
-        Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
-        tmpCorners_.colRange(0, total).download(tmpMat);
-
-        std::vector<Point2f> tmp2;
-        tmp2.reserve(total);
-
-        const int cell_size = cvRound(minDistance);
-        const int grid_width = (image.cols + cell_size - 1) / cell_size;
-        const int grid_height = (image.rows + cell_size - 1) / cell_size;
-
-        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
-
-        for (int i = 0; i < total; ++i)
-        {
-            Point2f p = tmp[i];
-
-            bool good = true;
-
-            int x_cell = static_cast<int>(p.x / cell_size);
-            int y_cell = static_cast<int>(p.y / cell_size);
-
-            int x1 = x_cell - 1;
-            int y1 = y_cell - 1;
-            int x2 = x_cell + 1;
-            int y2 = y_cell + 1;
-
-            // boundary check
-            x1 = std::max(0, x1);
-            y1 = std::max(0, y1);
-            x2 = std::min(grid_width - 1, x2);
-            y2 = std::min(grid_height - 1, y2);
-
-            for (int yy = y1; yy <= y2; yy++)
-            {
-                for (int xx = x1; xx <= x2; xx++)
-                {
-                    std::vector<Point2f>& m = grid[yy * grid_width + xx];
-
-                    if (!m.empty())
-                    {
-                        for(size_t j = 0; j < m.size(); j++)
-                        {
-                            float dx = p.x - m[j].x;
-                            float dy = p.y - m[j].y;
-
-                            if (dx * dx + dy * dy < minDistance * minDistance)
-                            {
-                                good = false;
-                                goto break_out;
-                            }
-                        }
-                    }
-                }
-            }
-
-            break_out:
-
-            if(good)
-            {
-                grid[y_cell * grid_width + x_cell].push_back(p);
-
-                tmp2.push_back(p);
-
-                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
-                    break;
-            }
-        }
-
-        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
-    }
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp
deleted file mode 100644
index 40ccd04710..0000000000
--- a/modules/gpu/src/graphcuts.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
-void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace ccl
-    {
-        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);
-
-        template<typename T>
-        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-    }
-}}}
-
-static float4 scalarToCudaType(const cv::Scalar& in)
-{
-  return make_float4((float)in[0], (float)in[1], (float)in[2], (float)in[3]);
-}
-
-void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
-{
-    CV_Assert(!image.empty());
-
-    int ch = image.channels();
-    CV_Assert(ch <= 4);
-
-    int depth = image.depth();
-
-    typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
-
-    static const func_t suppotLookup[8][4] =
-    {   //    1,    2,     3,     4
-        { cudev::ccl::computeEdges<uchar>,  0,  cudev::ccl::computeEdges<uchar3>,  cudev::ccl::computeEdges<uchar4>  },// CV_8U
-        { 0,                                 0,  0,                                  0                                  },// CV_16U
-        { cudev::ccl::computeEdges<ushort>, 0,  cudev::ccl::computeEdges<ushort3>, cudev::ccl::computeEdges<ushort4> },// CV_8S
-        { 0,                                 0,  0,                                  0                                  },// CV_16S
-        { cudev::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
-        { cudev::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
-        { 0,                                 0,  0,                                  0                                  },// CV_64F
-        { 0,                                 0,  0,                                  0                                  } // CV_USRTYPE1
-    };
-
-    func_t f = suppotLookup[depth][ch - 1];
-    CV_Assert(f);
-
-    if (image.size() != mask.size() || mask.type() != CV_8UC1)
-        mask.create(image.size(), CV_8UC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
-    f(image, mask, culo, cuhi, stream);
-}
-
-void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
-{
-    CV_Assert(!mask.empty() && mask.type() == CV_8U);
-
-    if (!deviceSupports(SHARED_ATOMICS))
-        CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
-
-    components.create(mask.size(), CV_32SC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    cudev::ccl::labelComponents(mask, components, flags, stream);
-}
-
-namespace
-{
-    typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
-
-    class NppiGraphcutStateHandler
-    {
-    public:
-        NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
-        {
-            nppSafeCall( func(sznpp, &pState, pDeviceMem) );
-        }
-
-        ~NppiGraphcutStateHandler()
-        {
-            nppSafeCall( nppiGraphcutFree(pState) );
-        }
-
-        operator NppiGraphcutState*()
-        {
-            return pState;
-        }
-
-    private:
-        NppiGraphcutState* pState;
-    };
-}
-
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
-{
-#if (CUDA_VERSION < 5000)
-    CV_Assert(terminals.type() == CV_32S);
-#else
-    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
-#endif
-
-    Size src_size = terminals.size();
-
-    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(leftTransp.type() == terminals.type());
-
-    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(rightTransp.type() == terminals.type());
-
-    CV_Assert(top.size() == src_size);
-    CV_Assert(top.type() == terminals.type());
-
-    CV_Assert(bottom.size() == src_size);
-    CV_Assert(bottom.type() == terminals.type());
-
-    labels.create(src_size, CV_8U);
-
-    NppiSize sznpp;
-    sznpp.width = src_size.width;
-    sznpp.height = src_size.height;
-
-    int bufsz;
-    nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
-
-    ensureSizeIsEnough(1, bufsz, CV_8U, buf);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStreamHandler h(stream);
-
-    NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
-
-#if (CUDA_VERSION < 5000)
-    nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
-        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-#else
-    if (terminals.type() == CV_32S)
-    {
-        nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
-            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-    }
-    else
-    {
-        nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
-            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-    }
-#endif
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
-              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
-{
-#if (CUDA_VERSION < 5000)
-    CV_Assert(terminals.type() == CV_32S);
-#else
-    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
-#endif
-
-    Size src_size = terminals.size();
-
-    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(leftTransp.type() == terminals.type());
-
-    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(rightTransp.type() == terminals.type());
-
-    CV_Assert(top.size() == src_size);
-    CV_Assert(top.type() == terminals.type());
-
-    CV_Assert(topLeft.size() == src_size);
-    CV_Assert(topLeft.type() == terminals.type());
-
-    CV_Assert(topRight.size() == src_size);
-    CV_Assert(topRight.type() == terminals.type());
-
-    CV_Assert(bottom.size() == src_size);
-    CV_Assert(bottom.type() == terminals.type());
-
-    CV_Assert(bottomLeft.size() == src_size);
-    CV_Assert(bottomLeft.type() == terminals.type());
-
-    CV_Assert(bottomRight.size() == src_size);
-    CV_Assert(bottomRight.type() == terminals.type());
-
-    labels.create(src_size, CV_8U);
-
-    NppiSize sznpp;
-    sznpp.width = src_size.width;
-    sznpp.height = src_size.height;
-
-    int bufsz;
-    nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
-
-    ensureSizeIsEnough(1, bufsz, CV_8U, buf);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStreamHandler h(stream);
-
-    NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
-
-#if (CUDA_VERSION < 5000)
-    nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
-        top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
-        bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
-        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-#else
-    if (terminals.type() == CV_32S)
-    {
-        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
-            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
-            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
-            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-    }
-    else
-    {
-        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
-            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
-            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
-            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
-    }
-#endif
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
deleted file mode 100644
index bc0a8a400d..0000000000
--- a/modules/gpu/src/hough.cpp
+++ /dev/null
@@ -1,1432 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::HoughLines(const GpuMat&, GpuMat&, float, float, int, bool, int) { throw_no_cuda(); }
-void cv::gpu::HoughLines(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, bool, int) { throw_no_cuda(); }
-void cv::gpu::HoughLinesDownload(const GpuMat&, OutputArray, OutputArray) { throw_no_cuda(); }
-
-void cv::gpu::HoughLinesP(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, int, int) { throw_no_cuda(); }
-
-void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, int, float, float, int, int, int, int, int) { throw_no_cuda(); }
-void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, HoughCirclesBuf&, int, float, float, int, int, int, int, int) { throw_no_cuda(); }
-void cv::gpu::HoughCirclesDownload(const GpuMat&, OutputArray) { throw_no_cuda(); }
-
-Ptr<GeneralizedHough_GPU> cv::gpu::GeneralizedHough_GPU::create(int) { throw_no_cuda(); return Ptr<GeneralizedHough_GPU>(); }
-cv::gpu::GeneralizedHough_GPU::~GeneralizedHough_GPU() {}
-void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat&, int, Point) { throw_no_cuda(); }
-void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat&, const GpuMat&, const GpuMat&, Point) { throw_no_cuda(); }
-void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat&, GpuMat&, int) { throw_no_cuda(); }
-void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::GeneralizedHough_GPU::download(const GpuMat&, OutputArray, OutputArray) { throw_no_cuda(); }
-void cv::gpu::GeneralizedHough_GPU::release() {}
-
-#else /* !defined (HAVE_CUDA) */
-
-#include "opencv2/core/utility.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
-    }
-}}}
-
-//////////////////////////////////////////////////////////
-// HoughLines
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
-        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
-    }
-}}}
-
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    HoughLinesBuf buf;
-    HoughLines(src, lines, buf, rho, theta, threshold, doSort, maxLines);
-}
-
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    using namespace cv::gpu::cudev::hough;
-
-    CV_Assert(src.type() == CV_8UC1);
-    CV_Assert(src.cols < std::numeric_limits<unsigned short>::max());
-    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
-
-    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.list);
-    unsigned int* srcPoints = buf.list.ptr<unsigned int>();
-
-    const int pointsCount = buildPointList_gpu(src, srcPoints);
-    if (pointsCount == 0)
-    {
-        lines.release();
-        return;
-    }
-
-    const int numangle = cvRound(CV_PI / theta);
-    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
-    CV_Assert(numangle > 0 && numrho > 0);
-
-    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, buf.accum);
-    buf.accum.setTo(Scalar::all(0));
-
-    DeviceInfo devInfo;
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
-
-    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
-
-    int linesCount = linesGetResult_gpu(buf.accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
-    if (linesCount > 0)
-        lines.cols = linesCount;
-    else
-        lines.release();
-}
-
-void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
-{
-    if (d_lines.empty())
-    {
-        h_lines_.release();
-        if (h_votes_.needed())
-            h_votes_.release();
-        return;
-    }
-
-    CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
-
-    h_lines_.create(1, d_lines.cols, CV_32FC2);
-    Mat h_lines = h_lines_.getMat();
-    d_lines.row(0).download(h_lines);
-
-    if (h_votes_.needed())
-    {
-        h_votes_.create(1, d_lines.cols, CV_32SC1);
-        Mat h_votes = h_votes_.getMat();
-        GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
-        d_votes.download(h_votes);
-    }
-}
-
-//////////////////////////////////////////////////////////
-// HoughLinesP
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength);
-    }
-}}}
-
-void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines)
-{
-    using namespace cv::gpu::cudev::hough;
-
-    CV_Assert( src.type() == CV_8UC1 );
-    CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() );
-    CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
-
-    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.list);
-    unsigned int* srcPoints = buf.list.ptr<unsigned int>();
-
-    const int pointsCount = buildPointList_gpu(src, srcPoints);
-    if (pointsCount == 0)
-    {
-        lines.release();
-        return;
-    }
-
-    const int numangle = cvRound(CV_PI / theta);
-    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
-    CV_Assert( numangle > 0 && numrho > 0 );
-
-    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, buf.accum);
-    buf.accum.setTo(Scalar::all(0));
-
-    DeviceInfo devInfo;
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
-
-    ensureSizeIsEnough(1, maxLines, CV_32SC4, lines);
-
-    int linesCount = houghLinesProbabilistic_gpu(src, buf.accum, lines.ptr<int4>(), maxLines, rho, theta, maxLineGap, minLineLength);
-
-    if (linesCount > 0)
-        lines.cols = linesCount;
-    else
-        lines.release();
-}
-
-//////////////////////////////////////////////////////////
-// HoughCircles
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
-        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
-        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
-                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
-    }
-}}}
-
-void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
-{
-    HoughCirclesBuf buf;
-    HoughCircles(src, circles, buf, method, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius, maxCircles);
-}
-
-void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method,
-                           float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
-{
-    using namespace cv::gpu::cudev::hough;
-
-    CV_Assert(src.type() == CV_8UC1);
-    CV_Assert(src.cols < std::numeric_limits<unsigned short>::max());
-    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
-    CV_Assert(method == cv::HOUGH_GRADIENT);
-    CV_Assert(dp > 0);
-    CV_Assert(minRadius > 0 && maxRadius > minRadius);
-    CV_Assert(cannyThreshold > 0);
-    CV_Assert(votesThreshold > 0);
-    CV_Assert(maxCircles > 0);
-
-    const float idp = 1.0f / dp;
-
-    cv::gpu::Canny(src, buf.cannyBuf, buf.edges, std::max(cannyThreshold / 2, 1), cannyThreshold);
-
-    ensureSizeIsEnough(2, src.size().area(), CV_32SC1, buf.list);
-    unsigned int* srcPoints = buf.list.ptr<unsigned int>(0);
-    unsigned int* centers = buf.list.ptr<unsigned int>(1);
-
-    const int pointsCount = buildPointList_gpu(buf.edges, srcPoints);
-    if (pointsCount == 0)
-    {
-        circles.release();
-        return;
-    }
-
-    ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, buf.accum);
-    buf.accum.setTo(Scalar::all(0));
-
-    circlesAccumCenters_gpu(srcPoints, pointsCount, buf.cannyBuf.dx, buf.cannyBuf.dy, buf.accum, minRadius, maxRadius, idp);
-
-    int centersCount = buildCentersList_gpu(buf.accum, centers, votesThreshold);
-    if (centersCount == 0)
-    {
-        circles.release();
-        return;
-    }
-
-    if (minDist > 1)
-    {
-        cv::AutoBuffer<ushort2> oldBuf_(centersCount);
-        cv::AutoBuffer<ushort2> newBuf_(centersCount);
-        int newCount = 0;
-
-        ushort2* oldBuf = oldBuf_;
-        ushort2* newBuf = newBuf_;
-
-        cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
-
-        const int cellSize = cvRound(minDist);
-        const int gridWidth = (src.cols + cellSize - 1) / cellSize;
-        const int gridHeight = (src.rows + cellSize - 1) / cellSize;
-
-        std::vector< std::vector<ushort2> > grid(gridWidth * gridHeight);
-
-        const float minDist2 = minDist * minDist;
-
-        for (int i = 0; i < centersCount; ++i)
-        {
-            ushort2 p = oldBuf[i];
-
-            bool good = true;
-
-            int xCell = static_cast<int>(p.x / cellSize);
-            int yCell = static_cast<int>(p.y / cellSize);
-
-            int x1 = xCell - 1;
-            int y1 = yCell - 1;
-            int x2 = xCell + 1;
-            int y2 = yCell + 1;
-
-            // boundary check
-            x1 = std::max(0, x1);
-            y1 = std::max(0, y1);
-            x2 = std::min(gridWidth - 1, x2);
-            y2 = std::min(gridHeight - 1, y2);
-
-            for (int yy = y1; yy <= y2; ++yy)
-            {
-                for (int xx = x1; xx <= x2; ++xx)
-                {
-                    std::vector<ushort2>& m = grid[yy * gridWidth + xx];
-
-                    for(size_t j = 0; j < m.size(); ++j)
-                    {
-                        float dx = (float)(p.x - m[j].x);
-                        float dy = (float)(p.y - m[j].y);
-
-                        if (dx * dx + dy * dy < minDist2)
-                        {
-                            good = false;
-                            goto break_out;
-                        }
-                    }
-                }
-            }
-
-            break_out:
-
-            if(good)
-            {
-                grid[yCell * gridWidth + xCell].push_back(p);
-
-                newBuf[newCount++] = p;
-            }
-        }
-
-        cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) );
-        centersCount = newCount;
-    }
-
-    ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
-
-    const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
-                                                    dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));
-
-    if (circlesCount > 0)
-        circles.cols = circlesCount;
-    else
-        circles.release();
-}
-
-void cv::gpu::HoughCirclesDownload(const GpuMat& d_circles, cv::OutputArray h_circles_)
-{
-    if (d_circles.empty())
-    {
-        h_circles_.release();
-        return;
-    }
-
-    CV_Assert(d_circles.rows == 1 && d_circles.type() == CV_32FC3);
-
-    h_circles_.create(1, d_circles.cols, CV_32FC3);
-    Mat h_circles = h_circles_.getMat();
-    d_circles.download(h_circles);
-}
-
-//////////////////////////////////////////////////////////
-// GeneralizedHough
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace hough
-    {
-        template <typename T>
-        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
-        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                             PtrStepSz<short2> r_table, int* r_sizes,
-                             short2 templCenter, int levels);
-
-        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                          PtrStepSz<short2> r_table, const int* r_sizes,
-                                          PtrStepSzi hist,
-                                          float dp, int levels);
-        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold);
-
-        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                               PtrStepSz<short2> r_table, const int* r_sizes,
-                                               PtrStepi hist, int rows, int cols,
-                                               float minScale, float scaleStep, int scaleRange,
-                                               float dp, int levels);
-        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
-                                                   float minScale, float scaleStep, float dp, int threshold);
-
-        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                  PtrStepSz<short2> r_table, const int* r_sizes,
-                                                  PtrStepi hist, int rows, int cols,
-                                                  float minAngle, float angleStep, int angleRange,
-                                                  float dp, int levels);
-        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
-                                                      float minAngle, float angleStep, float dp, int threshold);
-
-        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
-        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
-        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                     int* sizes, int maxSize,
-                                                     float xi, float angleEpsilon, int levels,
-                                                     float2 center, float maxDist);
-        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                                     int* sizes, int maxSize,
-                                                     float xi, float angleEpsilon, int levels,
-                                                     float2 center, float maxDist);
-        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
-                                         float minAngle, float maxAngle, float angleStep, int angleRange,
-                                         int levels, int tMaxSize);
-        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
-                                         float angle, float angleEpsilon,
-                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
-                                         int levels, int tMaxSize);
-        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
-                                         float angle, float angleEpsilon, float scale,
-                                         float dp,
-                                         int levels, int tMaxSize);
-        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
-                                             float angle, int angleVotes, float scale, int scaleVotes,
-                                             float dp, int threshold);
-    }
-}}}
-
-namespace
-{
-    /////////////////////////////////////
-    // Common
-
-    template <typename T, class A> void releaseVector(std::vector<T, A>& v)
-    {
-        std::vector<T, A> empty;
-        empty.swap(v);
-    }
-
-    class GHT_Pos : public GeneralizedHough_GPU
-    {
-    public:
-        GHT_Pos();
-
-    protected:
-        void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter);
-        void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions);
-        void releaseImpl();
-
-        virtual void processTempl() = 0;
-        virtual void processImage() = 0;
-
-        void buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy);
-        void filterMinDist();
-        void convertTo(GpuMat& positions);
-
-        int maxSize;
-        double minDist;
-
-        Size templSize;
-        Point templCenter;
-        GpuMat templEdges;
-        GpuMat templDx;
-        GpuMat templDy;
-
-        Size imageSize;
-        GpuMat imageEdges;
-        GpuMat imageDx;
-        GpuMat imageDy;
-
-        GpuMat edgePointList;
-
-        GpuMat outBuf;
-        int posCount;
-
-        std::vector<float4> oldPosBuf;
-        std::vector<int3> oldVoteBuf;
-        std::vector<float4> newPosBuf;
-        std::vector<int3> newVoteBuf;
-        std::vector<int> indexies;
-    };
-
-    GHT_Pos::GHT_Pos()
-    {
-        maxSize = 10000;
-        minDist = 1.0;
-    }
-
-    void GHT_Pos::setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter_)
-    {
-        templSize = edges.size();
-        templCenter = templCenter_;
-
-        ensureSizeIsEnough(templSize, edges.type(), templEdges);
-        ensureSizeIsEnough(templSize, dx.type(), templDx);
-        ensureSizeIsEnough(templSize, dy.type(), templDy);
-
-        edges.copyTo(templEdges);
-        dx.copyTo(templDx);
-        dy.copyTo(templDy);
-
-        processTempl();
-    }
-
-    void GHT_Pos::detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions)
-    {
-        imageSize = edges.size();
-
-        ensureSizeIsEnough(imageSize, edges.type(), imageEdges);
-        ensureSizeIsEnough(imageSize, dx.type(), imageDx);
-        ensureSizeIsEnough(imageSize, dy.type(), imageDy);
-
-        edges.copyTo(imageEdges);
-        dx.copyTo(imageDx);
-        dy.copyTo(imageDy);
-
-        posCount = 0;
-
-        processImage();
-
-        if (posCount == 0)
-            positions.release();
-        else
-        {
-            if (minDist > 1)
-                filterMinDist();
-            convertTo(positions);
-        }
-    }
-
-    void GHT_Pos::releaseImpl()
-    {
-        templSize = Size();
-        templCenter = Point(-1, -1);
-        templEdges.release();
-        templDx.release();
-        templDy.release();
-
-        imageSize = Size();
-        imageEdges.release();
-        imageDx.release();
-        imageDy.release();
-
-        edgePointList.release();
-
-        outBuf.release();
-        posCount = 0;
-
-        releaseVector(oldPosBuf);
-        releaseVector(oldVoteBuf);
-        releaseVector(newPosBuf);
-        releaseVector(newVoteBuf);
-        releaseVector(indexies);
-    }
-
-    void GHT_Pos::buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy)
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        typedef int (*func_t)(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
-        static const func_t funcs[] =
-        {
-            0,
-            0,
-            0,
-            buildEdgePointList_gpu<short>,
-            buildEdgePointList_gpu<int>,
-            buildEdgePointList_gpu<float>,
-            0
-        };
-
-        CV_Assert(edges.type() == CV_8UC1);
-        CV_Assert(dx.size() == edges.size());
-        CV_Assert(dy.type() == dx.type() && dy.size() == edges.size());
-
-        const func_t func = funcs[dx.depth()];
-        CV_Assert(func != 0);
-
-        edgePointList.cols = (int) (edgePointList.step / sizeof(int));
-        ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
-
-        edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));
-    }
-
-    struct IndexCmp
-    {
-        const int3* aux;
-
-        explicit IndexCmp(const int3* _aux) : aux(_aux) {}
-
-        bool operator ()(int l1, int l2) const
-        {
-            return aux[l1].x > aux[l2].x;
-        }
-    };
-
-    void GHT_Pos::filterMinDist()
-    {
-        oldPosBuf.resize(posCount);
-        oldVoteBuf.resize(posCount);
-
-        cudaSafeCall( cudaMemcpy(&oldPosBuf[0], outBuf.ptr(0), posCount * sizeof(float4), cudaMemcpyDeviceToHost) );
-        cudaSafeCall( cudaMemcpy(&oldVoteBuf[0], outBuf.ptr(1), posCount * sizeof(int3), cudaMemcpyDeviceToHost) );
-
-        indexies.resize(posCount);
-        for (int i = 0; i < posCount; ++i)
-            indexies[i] = i;
-        std::sort(indexies.begin(), indexies.end(), IndexCmp(&oldVoteBuf[0]));
-
-        newPosBuf.clear();
-        newVoteBuf.clear();
-        newPosBuf.reserve(posCount);
-        newVoteBuf.reserve(posCount);
-
-        const int cellSize = cvRound(minDist);
-        const int gridWidth = (imageSize.width + cellSize - 1) / cellSize;
-        const int gridHeight = (imageSize.height + cellSize - 1) / cellSize;
-
-        std::vector< std::vector<Point2f> > grid(gridWidth * gridHeight);
-
-        const double minDist2 = minDist * minDist;
-
-        for (int i = 0; i < posCount; ++i)
-        {
-            const int ind = indexies[i];
-
-            Point2f p(oldPosBuf[ind].x, oldPosBuf[ind].y);
-
-            bool good = true;
-
-            const int xCell = static_cast<int>(p.x / cellSize);
-            const int yCell = static_cast<int>(p.y / cellSize);
-
-            int x1 = xCell - 1;
-            int y1 = yCell - 1;
-            int x2 = xCell + 1;
-            int y2 = yCell + 1;
-
-            // boundary check
-            x1 = std::max(0, x1);
-            y1 = std::max(0, y1);
-            x2 = std::min(gridWidth - 1, x2);
-            y2 = std::min(gridHeight - 1, y2);
-
-            for (int yy = y1; yy <= y2; ++yy)
-            {
-                for (int xx = x1; xx <= x2; ++xx)
-                {
-                    const std::vector<Point2f>& m = grid[yy * gridWidth + xx];
-
-                    for(size_t j = 0; j < m.size(); ++j)
-                    {
-                        const Point2f d = p - m[j];
-
-                        if (d.ddot(d) < minDist2)
-                        {
-                            good = false;
-                            goto break_out;
-                        }
-                    }
-                }
-            }
-
-            break_out:
-
-            if(good)
-            {
-                grid[yCell * gridWidth + xCell].push_back(p);
-
-                newPosBuf.push_back(oldPosBuf[ind]);
-                newVoteBuf.push_back(oldVoteBuf[ind]);
-            }
-        }
-
-        posCount = static_cast<int>(newPosBuf.size());
-        cudaSafeCall( cudaMemcpy(outBuf.ptr(0), &newPosBuf[0], posCount * sizeof(float4), cudaMemcpyHostToDevice) );
-        cudaSafeCall( cudaMemcpy(outBuf.ptr(1), &newVoteBuf[0], posCount * sizeof(int3), cudaMemcpyHostToDevice) );
-    }
-
-    void GHT_Pos::convertTo(GpuMat& positions)
-    {
-        ensureSizeIsEnough(2, posCount, CV_32FC4, positions);
-        GpuMat(2, posCount, CV_32FC4, outBuf.data, outBuf.step).copyTo(positions);
-    }
-
-    /////////////////////////////////////
-    // POSITION Ballard
-
-    class GHT_Ballard_Pos : public GHT_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Ballard_Pos();
-
-    protected:
-        void releaseImpl();
-
-        void processTempl();
-        void processImage();
-
-        virtual void calcHist();
-        virtual void findPosInHist();
-
-        int levels;
-        int votesThreshold;
-        double dp;
-
-        GpuMat r_table;
-        GpuMat r_sizes;
-
-        GpuMat hist;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Ballard_Pos, "GeneralizedHough_GPU.POSITION",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution."));
-
-    GHT_Ballard_Pos::GHT_Ballard_Pos()
-    {
-        levels = 360;
-        votesThreshold = 100;
-        dp = 1.0;
-    }
-
-    void GHT_Ballard_Pos::releaseImpl()
-    {
-        GHT_Pos::releaseImpl();
-
-        r_table.release();
-        r_sizes.release();
-
-        hist.release();
-    }
-
-    void GHT_Ballard_Pos::processTempl()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(levels > 0);
-
-        buildEdgePointList(templEdges, templDx, templDy);
-
-        ensureSizeIsEnough(levels + 1, maxSize, CV_16SC2, r_table);
-        ensureSizeIsEnough(1, levels + 1, CV_32SC1, r_sizes);
-        r_sizes.setTo(Scalar::all(0));
-
-        if (edgePointList.cols > 0)
-        {
-            buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                            r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            min(r_sizes, maxSize, r_sizes);
-        }
-    }
-
-    void GHT_Ballard_Pos::processImage()
-    {
-        calcHist();
-        findPosInHist();
-    }
-
-    void GHT_Ballard_Pos::calcHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
-
-        const double idp = 1.0 / dp;
-
-        buildEdgePointList(imageEdges, imageDx, imageDy);
-
-        ensureSizeIsEnough(cvCeil(imageSize.height * idp) + 2, cvCeil(imageSize.width * idp) + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
-
-        if (edgePointList.cols > 0)
-        {
-            GHT_Ballard_Pos_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                         r_table, r_sizes.ptr<int>(),
-                                         hist,
-                                         (float)dp, levels);
-        }
-    }
-
-    void GHT_Ballard_Pos::findPosInHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(votesThreshold > 0);
-
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
-
-        posCount = GHT_Ballard_Pos_findPosInHist_gpu(hist, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)dp, votesThreshold);
-    }
-
-    /////////////////////////////////////
-    // POSITION & SCALE
-
-    class GHT_Ballard_PosScale : public GHT_Ballard_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Ballard_PosScale();
-
-    protected:
-        void calcHist();
-        void findPosInHist();
-
-        double minScale;
-        double maxScale;
-        double scaleStep;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Ballard_PosScale, "GeneralizedHough_GPU.POSITION_SCALE",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step."));
-
-    GHT_Ballard_PosScale::GHT_Ballard_PosScale()
-    {
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-    }
-
-    void GHT_Ballard_PosScale::calcHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
-
-        const double idp = 1.0 / dp;
-        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        buildEdgePointList(imageEdges, imageDx, imageDy);
-
-        ensureSizeIsEnough((scaleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
-
-        if (edgePointList.cols > 0)
-        {
-            GHT_Ballard_PosScale_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                              r_table, r_sizes.ptr<int>(),
-                                              hist, rows, cols,
-                                              (float)minScale, (float)scaleStep, scaleRange, (float)dp, levels);
-        }
-    }
-
-    void GHT_Ballard_PosScale::findPosInHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(votesThreshold > 0);
-
-        const double idp = 1.0 / dp;
-        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
-
-        posCount =  GHT_Ballard_PosScale_findPosInHist_gpu(hist, rows, cols, scaleRange, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)minScale, (float)scaleStep, (float)dp, votesThreshold);
-    }
-
-    /////////////////////////////////////
-    // POSITION & Rotation
-
-    class GHT_Ballard_PosRotation : public GHT_Ballard_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Ballard_PosRotation();
-
-    protected:
-        void calcHist();
-        void findPosInHist();
-
-        double minAngle;
-        double maxAngle;
-        double angleStep;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Ballard_PosRotation, "GeneralizedHough_GPU.POSITION_ROTATION",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees."));
-
-    GHT_Ballard_PosRotation::GHT_Ballard_PosRotation()
-    {
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-    }
-
-    void GHT_Ballard_PosRotation::calcHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-
-        const double idp = 1.0 / dp;
-        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        buildEdgePointList(imageEdges, imageDx, imageDy);
-
-        ensureSizeIsEnough((angleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
-
-        if (edgePointList.cols > 0)
-        {
-            GHT_Ballard_PosRotation_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                                 r_table, r_sizes.ptr<int>(),
-                                                 hist, rows, cols,
-                                                 (float)minAngle, (float)angleStep, angleRange, (float)dp, levels);
-        }
-    }
-
-    void GHT_Ballard_PosRotation::findPosInHist()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(votesThreshold > 0);
-
-        const double idp = 1.0 / dp;
-        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
-
-        posCount = GHT_Ballard_PosRotation_findPosInHist_gpu(hist, rows, cols, angleRange, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)minAngle, (float)angleStep, (float)dp, votesThreshold);
-    }
-
-    /////////////////////////////////////////
-    // POSITION & SCALE & ROTATION
-
-    double toRad(double a)
-    {
-        return a * CV_PI / 180.0;
-    }
-
-    double clampAngle(double a)
-    {
-        double res = a;
-
-        while (res > 360.0)
-            res -= 360.0;
-        while (res < 0)
-            res += 360.0;
-
-        return res;
-    }
-
-    bool angleEq(double a, double b, double eps = 1.0)
-    {
-        return (fabs(clampAngle(a - b)) <= eps);
-    }
-
-    class GHT_Guil_Full : public GHT_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Guil_Full();
-
-    protected:
-        void releaseImpl();
-
-        void processTempl();
-        void processImage();
-
-        struct Feature
-        {
-            GpuMat p1_pos;
-            GpuMat p1_theta;
-            GpuMat p2_pos;
-
-            GpuMat d12;
-
-            GpuMat r1;
-            GpuMat r2;
-
-            GpuMat sizes;
-            int maxSize;
-
-            void create(int levels, int maxCapacity, bool isTempl);
-            void release();
-        };
-
-        typedef void (*set_func_t)(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
-        typedef void (*build_func_t)(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                     int* sizes, int maxSize,
-                                     float xi, float angleEpsilon, int levels,
-                                     float2 center, float maxDist);
-
-        void buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
-                              set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center = Point2d());
-
-        void calcOrientation();
-        void calcScale(double angle);
-        void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
-
-        double xi;
-        int levels;
-        double angleEpsilon;
-
-        double minAngle;
-        double maxAngle;
-        double angleStep;
-        int angleThresh;
-
-        double minScale;
-        double maxScale;
-        double scaleStep;
-        int scaleThresh;
-
-        double dp;
-        int posThresh;
-
-        Feature templFeatures;
-        Feature imageFeatures;
-
-        std::vector< std::pair<double, int> > angles;
-        std::vector< std::pair<double, int> > scales;
-
-        GpuMat hist;
-        std::vector<int> h_buf;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Guil_Full, "GeneralizedHough_GPU.POSITION_SCALE_ROTATION",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "xi", obj.xi, false, 0, 0,
-                                           "Angle difference in degrees between two points in feature.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "Feature table levels.");
-                      obj.info()->addParam(obj, "angleEpsilon", obj.angleEpsilon, false, 0, 0,
-                                           "Maximal difference between angles that treated as equal.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees.");
-                      obj.info()->addParam(obj, "angleThresh", obj.angleThresh, false, 0, 0,
-                                           "Angle threshold.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step.");
-                      obj.info()->addParam(obj, "scaleThresh", obj.scaleThresh, false, 0, 0,
-                                           "Scale threshold.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "posThresh", obj.posThresh, false, 0, 0,
-                                           "Position threshold."));
-
-    GHT_Guil_Full::GHT_Guil_Full()
-    {
-        maxSize = 1000;
-        xi = 90.0;
-        levels = 360;
-        angleEpsilon = 1.0;
-
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-        angleThresh = 15000;
-
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-        scaleThresh = 1000;
-
-        dp = 1.0;
-        posThresh = 100;
-    }
-
-    void GHT_Guil_Full::releaseImpl()
-    {
-        GHT_Pos::releaseImpl();
-
-        templFeatures.release();
-        imageFeatures.release();
-
-        releaseVector(angles);
-        releaseVector(scales);
-
-        hist.release();
-        releaseVector(h_buf);
-    }
-
-    void GHT_Guil_Full::processTempl()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        buildFeatureList(templEdges, templDx, templDy, templFeatures,
-            GHT_Guil_Full_setTemplFeatures, GHT_Guil_Full_buildTemplFeatureList_gpu,
-            true, templCenter);
-
-        h_buf.resize(templFeatures.sizes.cols);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], templFeatures.sizes.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
-        templFeatures.maxSize = *max_element(h_buf.begin(), h_buf.end());
-    }
-
-    void GHT_Guil_Full::processImage()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        CV_Assert(levels > 0);
-        CV_Assert(templFeatures.sizes.cols == levels + 1);
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-        CV_Assert(angleThresh > 0);
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
-        CV_Assert(scaleThresh > 0);
-        CV_Assert(dp > 0.0);
-        CV_Assert(posThresh > 0);
-
-        const double iAngleStep = 1.0 / angleStep;
-        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
-
-        const double iScaleStep = 1.0 / scaleStep;
-        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
-
-        const double idp = 1.0 / dp;
-        const int histRows = cvCeil(imageSize.height * idp);
-        const int histCols = cvCeil(imageSize.width * idp);
-
-        ensureSizeIsEnough(histRows + 2, std::max(angleRange + 1, std::max(scaleRange + 1, histCols + 2)), CV_32SC1, hist);
-        h_buf.resize(std::max(angleRange + 1, scaleRange + 1));
-
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
-
-        buildFeatureList(imageEdges, imageDx, imageDy, imageFeatures,
-            GHT_Guil_Full_setImageFeatures, GHT_Guil_Full_buildImageFeatureList_gpu,
-            false);
-
-        calcOrientation();
-
-        for (size_t i = 0; i < angles.size(); ++i)
-        {
-            const double angle = angles[i].first;
-            const int angleVotes = angles[i].second;
-
-            calcScale(angle);
-
-            for (size_t j = 0; j < scales.size(); ++j)
-            {
-                const double scale = scales[j].first;
-                const int scaleVotes = scales[j].second;
-
-                calcPosition(angle, angleVotes, scale, scaleVotes);
-            }
-        }
-    }
-
-    void GHT_Guil_Full::Feature::create(int levels, int maxCapacity, bool isTempl)
-    {
-        if (!isTempl)
-        {
-            ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, p1_pos);
-            ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, p2_pos);
-        }
-
-        ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC1, p1_theta);
-
-        ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC1, d12);
-
-        if (isTempl)
-        {
-            ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, r1);
-            ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, r2);
-        }
-
-        ensureSizeIsEnough(1, levels + 1, CV_32SC1, sizes);
-        sizes.setTo(Scalar::all(0));
-
-        maxSize = 0;
-    }
-
-    void GHT_Guil_Full::Feature::release()
-    {
-        p1_pos.release();
-        p1_theta.release();
-        p2_pos.release();
-
-        d12.release();
-
-        r1.release();
-        r2.release();
-
-        sizes.release();
-
-        maxSize = 0;
-    }
-
-    void GHT_Guil_Full::buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
-                                         set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center)
-    {
-        CV_Assert(levels > 0);
-
-        const double maxDist = sqrt((double) templSize.width * templSize.width + templSize.height * templSize.height) * maxScale;
-
-        features.create(levels, maxSize, isTempl);
-        set_func(features.p1_pos, features.p1_theta, features.p2_pos, features.d12, features.r1, features.r2);
-
-        buildEdgePointList(edges, dx, dy);
-
-        if (edgePointList.cols > 0)
-        {
-            build_func(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                features.sizes.ptr<int>(), maxSize, (float)xi, (float)angleEpsilon, levels, make_float2((float)center.x, (float)center.y), (float)maxDist);
-        }
-    }
-
-    void GHT_Guil_Full::calcOrientation()
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        const double iAngleStep = 1.0 / angleStep;
-        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
-
-        hist.setTo(Scalar::all(0));
-        GHT_Guil_Full_calcOHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
-            hist.ptr<int>(), (float)minAngle, (float)maxAngle, (float)angleStep, angleRange, levels, templFeatures.maxSize);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
-
-        angles.clear();
-
-        for (int n = 0; n < angleRange; ++n)
-        {
-            if (h_buf[n] >= angleThresh)
-            {
-                const double angle = minAngle + n * angleStep;
-                angles.push_back(std::make_pair(angle, h_buf[n]));
-            }
-        }
-    }
-
-    void GHT_Guil_Full::calcScale(double angle)
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        const double iScaleStep = 1.0 / scaleStep;
-        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
-
-        hist.setTo(Scalar::all(0));
-        GHT_Guil_Full_calcSHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
-            hist.ptr<int>(), (float)angle, (float)angleEpsilon, (float)minScale, (float)maxScale, (float)iScaleStep, scaleRange, levels, templFeatures.maxSize);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
-
-        scales.clear();
-
-        for (int s = 0; s < scaleRange; ++s)
-        {
-            if (h_buf[s] >= scaleThresh)
-            {
-                const double scale = minScale + s * scaleStep;
-                scales.push_back(std::make_pair(scale, h_buf[s]));
-            }
-        }
-    }
-
-    void GHT_Guil_Full::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
-    {
-        using namespace cv::gpu::cudev::hough;
-
-        hist.setTo(Scalar::all(0));
-        GHT_Guil_Full_calcPHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
-            hist,(float) (float)angle, (float)angleEpsilon, (float)scale, (float)dp, levels, templFeatures.maxSize);
-
-        posCount = GHT_Guil_Full_findPosInHist_gpu(hist, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1),
-            posCount, maxSize, (float)angle, angleVotes, (float)scale, scaleVotes, (float)dp, posThresh);
-    }
-}
-
-Ptr<GeneralizedHough_GPU> cv::gpu::GeneralizedHough_GPU::create(int method)
-{
-    switch (method)
-    {
-    case cv::GeneralizedHough::GHT_POSITION:
-        CV_Assert( !GHT_Ballard_Pos_info_auto.name().empty() );
-        return new GHT_Ballard_Pos();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE):
-        CV_Assert( !GHT_Ballard_PosScale_info_auto.name().empty() );
-        return new GHT_Ballard_PosScale();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_ROTATION):
-        CV_Assert( !GHT_Ballard_PosRotation_info_auto.name().empty() );
-        return new GHT_Ballard_PosRotation();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE | cv::GeneralizedHough::GHT_ROTATION):
-        CV_Assert( !GHT_Guil_Full_info_auto.name().empty() );
-        return new GHT_Guil_Full();
-    }
-
-    CV_Error(cv::Error::StsBadArg, "Unsupported method");
-    return Ptr<GeneralizedHough_GPU>();
-}
-
-cv::gpu::GeneralizedHough_GPU::~GeneralizedHough_GPU()
-{
-}
-
-void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat& templ, int cannyThreshold, Point templCenter)
-{
-    CV_Assert(templ.type() == CV_8UC1);
-    CV_Assert(cannyThreshold > 0);
-
-    ensureSizeIsEnough(templ.size(), CV_8UC1, edges_);
-    Canny(templ, cannyBuf_, edges_, cannyThreshold / 2, cannyThreshold);
-
-    if (templCenter == Point(-1, -1))
-        templCenter = Point(templ.cols / 2, templ.rows / 2);
-
-    setTemplateImpl(edges_, cannyBuf_.dx, cannyBuf_.dy, templCenter);
-}
-
-void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter)
-{
-    if (templCenter == Point(-1, -1))
-        templCenter = Point(edges.cols / 2, edges.rows / 2);
-
-    setTemplateImpl(edges, dx, dy, templCenter);
-}
-
-void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat& image, GpuMat& positions, int cannyThreshold)
-{
-    CV_Assert(image.type() == CV_8UC1);
-    CV_Assert(cannyThreshold > 0);
-
-    ensureSizeIsEnough(image.size(), CV_8UC1, edges_);
-    Canny(image, cannyBuf_, edges_, cannyThreshold / 2, cannyThreshold);
-
-    detectImpl(edges_, cannyBuf_.dx, cannyBuf_.dy, positions);
-}
-
-void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions)
-{
-    detectImpl(edges, dx, dy, positions);
-}
-
-void cv::gpu::GeneralizedHough_GPU::download(const GpuMat& d_positions, OutputArray h_positions_, OutputArray h_votes_)
-{
-    if (d_positions.empty())
-    {
-        h_positions_.release();
-        if (h_votes_.needed())
-            h_votes_.release();
-        return;
-    }
-
-    CV_Assert(d_positions.rows == 2 && d_positions.type() == CV_32FC4);
-
-    h_positions_.create(1, d_positions.cols, CV_32FC4);
-    Mat h_positions = h_positions_.getMat();
-    d_positions.row(0).download(h_positions);
-
-    if (h_votes_.needed())
-    {
-        h_votes_.create(1, d_positions.cols, CV_32SC3);
-        Mat h_votes = h_votes_.getMat();
-        GpuMat d_votes(1, d_positions.cols, CV_32SC3, const_cast<int3*>(d_positions.ptr<int3>(1)));
-        d_votes.download(h_votes);
-    }
-}
-
-void cv::gpu::GeneralizedHough_GPU::release()
-{
-    edges_.release();
-    cannyBuf_.release();
-    releaseImpl();
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
deleted file mode 100644
index c21a7b837d..0000000000
--- a/modules/gpu/src/imgproc.cpp
+++ /dev/null
@@ -1,1181 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
-void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
-void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_no_cuda(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_no_cuda(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat*, GpuMat&, int*, int*, int*, Stream&) { throw_no_cuda(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_no_cuda(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
-void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
-void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
-void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
-void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
-void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
-void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_no_cuda(); }
-void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_no_cuda(); }
-void cv::gpu::CannyBuf::create(const Size&, int) { throw_no_cuda(); }
-void cv::gpu::CannyBuf::release() { throw_no_cuda(); }
-cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::gpu::CLAHE>(); }
-void cv::gpu::alphaComp(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-////////////////////////////////////////////////////////////////////////
-// meanShiftFiltering_GPU
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
-{
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    if( src.empty() )
-        CV_Error( cv::Error::StsBadArg, "The input image is empty" );
-
-    if( src.depth() != CV_8U || src.channels() != 4 )
-        CV_Error( cv::Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
-
-    dst.create( src.size(), CV_8UC4 );
-
-    if( !(criteria.type & TermCriteria::MAX_ITER) )
-        criteria.maxCount = 5;
-
-    int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
-
-    float eps;
-    if( !(criteria.type & TermCriteria::EPS) )
-        eps = 1.f;
-    eps = (float)std::max(criteria.epsilon, 0.0);
-
-    meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// meanShiftProc_GPU
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
-{
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    if( src.empty() )
-        CV_Error( cv::Error::StsBadArg, "The input image is empty" );
-
-    if( src.depth() != CV_8U || src.channels() != 4 )
-        CV_Error( cv::Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
-
-    dstr.create( src.size(), CV_8UC4 );
-    dstsp.create( src.size(), CV_16SC2 );
-
-    if( !(criteria.type & TermCriteria::MAX_ITER) )
-        criteria.maxCount = 5;
-
-    int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
-
-    float eps;
-    if( !(criteria.type & TermCriteria::EPS) )
-        eps = 1.f;
-    eps = (float)std::max(criteria.epsilon, 0.0);
-
-    meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// drawColorDisp
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
-        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
-    }
-}}}
-
-namespace
-{
-    template <typename T>
-    void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
-    {
-        using namespace ::cv::gpu::cudev::imgproc;
-
-        dst.create(src.size(), CV_8UC4);
-
-        drawColorDisp_gpu((PtrStepSz<T>)src, dst, ndisp, stream);
-    }
-
-    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
-
-    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
-}
-
-void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
-{
-    CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
-
-    drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// reprojectImageTo3D
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] =
-    {
-        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
-        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
-    };
-
-    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S);
-    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous());
-    CV_Assert(dst_cn == 3 || dst_cn == 4);
-
-    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
-
-    funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpPlaneMaps
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
-                                cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
-                                 float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
-{
-    (void)src_size;
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
-    CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert(K_Rinv.isContinuous());
-    CV_Assert(R_Kinv.isContinuous());
-
-    map_x.create(dst_roi.size(), CV_32F);
-    map_y.create(dst_roi.size(), CV_32F);
-    cudev::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
-                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpCylyndricalMaps
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                      const float k_rinv[9], const float r_kinv[9], float scale,
-                                      cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
-                                       GpuMat& map_x, GpuMat& map_y, Stream& stream)
-{
-    (void)src_size;
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert(K_Rinv.isContinuous());
-    CV_Assert(R_Kinv.isContinuous());
-
-    map_x.create(dst_roi.size(), CV_32F);
-    map_y.create(dst_roi.size(), CV_32F);
-    cudev::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpSphericalMaps
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                    const float k_rinv[9], const float r_kinv[9], float scale,
-                                    cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
-                                     GpuMat& map_x, GpuMat& map_y, Stream& stream)
-{
-    (void)src_size;
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert(K_Rinv.isContinuous());
-    CV_Assert(R_Kinv.isContinuous());
-
-    map_x.create(dst_roi.size(), CV_32F);
-    map_y.create(dst_roi.size(), CV_32F);
-    cudev::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// rotate
-
-namespace
-{
-    template<int DEPTH> struct NppTypeTraits;
-    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
-    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
-    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
-    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
-    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
-    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
-    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
-
-    template <int DEPTH> struct NppRotateFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
-                                    npp_t* pDst, int nDstStep, NppiRect oDstROI,
-                                    double nAngle, double nShiftX, double nShiftY, int eInterpolation);
-    };
-
-    template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
-    {
-        typedef typename NppRotateFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
-        {
-            (void)dsize;
-            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
-
-            NppStreamHandler h(stream);
-
-            NppiSize srcsz;
-            srcsz.height = src.rows;
-            srcsz.width = src.cols;
-            NppiRect srcroi;
-            srcroi.x = srcroi.y = 0;
-            srcroi.height = src.rows;
-            srcroi.width = src.cols;
-            NppiRect dstroi;
-            dstroi.x = dstroi.y = 0;
-            dstroi.height = dst.rows;
-            dstroi.width = dst.cols;
-
-            nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
-                dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
-{
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
-
-    static const func_t funcs[6][4] =
-    {
-        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
-        {0,0,0,0},
-        {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
-        {0,0,0,0},
-        {0,0,0,0},
-        {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
-    };
-
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-
-    dst.create(dsize, src.type());
-    dst.setTo(Scalar::all(0));
-
-    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
-}
-
-
-////////////////////////////////////////////////////////////////////////
-// Histogram
-
-namespace
-{
-    typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
-    typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
-
-    template<int SDEPTH> struct NppHistogramEvenFuncC1
-    {
-        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
-
-    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
-            int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
-    };
-    template<int SDEPTH> struct NppHistogramEvenFuncC4
-    {
-        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
-            Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
-    };
-
-    template<int SDEPTH, typename NppHistogramEvenFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
-    struct NppHistogramEvenC1
-    {
-        typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
-
-        static void hist(const GpuMat& src, GpuMat& hist, GpuMat& buffer, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
-        {
-            int levels = histSize + 1;
-            hist.create(1, histSize, CV_32S);
-
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            int buf_size;
-            get_buf_size(sz, levels, &buf_size);
-
-            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
-                lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppHistogramEvenFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
-    struct NppHistogramEvenC4
-    {
-        typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
-
-        static void hist(const GpuMat& src, GpuMat hist[4], GpuMat& buffer, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
-        {
-            int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
-            hist[0].create(1, histSize[0], CV_32S);
-            hist[1].create(1, histSize[1], CV_32S);
-            hist[2].create(1, histSize[2], CV_32S);
-            hist[3].create(1, histSize[3], CV_32S);
-
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
-
-            int buf_size;
-            get_buf_size(sz, levels, &buf_size);
-
-            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    template<int SDEPTH> struct NppHistogramRangeFuncC1
-    {
-        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
-        typedef Npp32s level_t;
-        enum {LEVEL_TYPE_CODE=CV_32SC1};
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
-            const Npp32s* pLevels, int nLevels, Npp8u* pBuffer);
-    };
-    template<> struct NppHistogramRangeFuncC1<CV_32F>
-    {
-        typedef Npp32f src_t;
-        typedef Npp32f level_t;
-        enum {LEVEL_TYPE_CODE=CV_32FC1};
-
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
-            const Npp32f* pLevels, int nLevels, Npp8u* pBuffer);
-    };
-    template<int SDEPTH> struct NppHistogramRangeFuncC4
-    {
-        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
-        typedef Npp32s level_t;
-        enum {LEVEL_TYPE_CODE=CV_32SC1};
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
-            const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer);
-    };
-    template<> struct NppHistogramRangeFuncC4<CV_32F>
-    {
-        typedef Npp32f src_t;
-        typedef Npp32f level_t;
-        enum {LEVEL_TYPE_CODE=CV_32FC1};
-
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
-            const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer);
-    };
-
-    template<int SDEPTH, typename NppHistogramRangeFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
-    struct NppHistogramRangeC1
-    {
-        typedef typename NppHistogramRangeFuncC1<SDEPTH>::src_t src_t;
-        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
-        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
-
-        static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buffer, cudaStream_t stream)
-        {
-            CV_Assert(levels.type() == LEVEL_TYPE_CODE && levels.rows == 1);
-
-            hist.create(1, levels.cols - 1, CV_32S);
-
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            int buf_size;
-            get_buf_size(sz, levels.cols, &buf_size);
-
-            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppHistogramRangeFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
-    struct NppHistogramRangeC4
-    {
-        typedef typename NppHistogramRangeFuncC4<SDEPTH>::src_t src_t;
-        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
-        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
-
-        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buffer, cudaStream_t stream)
-        {
-            CV_Assert(levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1);
-            CV_Assert(levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1);
-            CV_Assert(levels[2].type() == LEVEL_TYPE_CODE && levels[2].rows == 1);
-            CV_Assert(levels[3].type() == LEVEL_TYPE_CODE && levels[3].rows == 1);
-
-            hist[0].create(1, levels[0].cols - 1, CV_32S);
-            hist[1].create(1, levels[1].cols - 1, CV_32S);
-            hist[2].create(1, levels[2].cols - 1, CV_32S);
-            hist[3].create(1, levels[3].cols - 1, CV_32S);
-
-            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
-            int nLevels[] = {levels[0].cols, levels[1].cols, levels[2].cols, levels[3].cols};
-            const level_t* pLevels[] = {levels[0].ptr<level_t>(), levels[1].ptr<level_t>(), levels[2].ptr<level_t>(), levels[3].ptr<level_t>()};
-
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            int buf_size;
-            get_buf_size(sz, nLevels, &buf_size);
-
-            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
-{
-    Mat host_levels(1, nLevels, CV_32SC1);
-    nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
-    levels.upload(host_levels);
-}
-
-void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
-{
-    GpuMat buf;
-    histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
-
-void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
-{
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
-
-    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
-    static const hist_t hist_callers[] =
-    {
-        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
-        0,
-        NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
-        NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
-    };
-
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
-{
-    GpuMat buf;
-    histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
-
-void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
-{
-    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
-
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
-    static const hist_t hist_callers[] =
-    {
-        NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
-        0,
-        NppHistogramEvenC4<CV_16U, nppiHistogramEven_16u_C4R, nppiHistogramEvenGetBufferSize_16u_C4R>::hist,
-        NppHistogramEvenC4<CV_16S, nppiHistogramEven_16s_C4R, nppiHistogramEvenGetBufferSize_16s_C4R>::hist
-    };
-
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream)
-{
-    GpuMat buf;
-    histRange(src, hist, levels, buf, stream);
-}
-
-void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)
-{
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);
-
-    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, cudaStream_t stream);
-    static const hist_t hist_callers[] =
-    {
-        NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
-        0,
-        NppHistogramRangeC1<CV_16U, nppiHistogramRange_16u_C1R, nppiHistogramRangeGetBufferSize_16u_C1R>::hist,
-        NppHistogramRangeC1<CV_16S, nppiHistogramRange_16s_C1R, nppiHistogramRangeGetBufferSize_16s_C1R>::hist,
-        0,
-        NppHistogramRangeC1<CV_32F, nppiHistogramRange_32f_C1R, nppiHistogramRangeGetBufferSize_32f_C1R>::hist
-    };
-
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
-{
-    GpuMat buf;
-    histRange(src, hist, levels, buf, stream);
-}
-
-void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream)
-{
-    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4);
-
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, cudaStream_t stream);
-    static const hist_t hist_callers[] =
-    {
-        NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
-        0,
-        NppHistogramRangeC4<CV_16U, nppiHistogramRange_16u_C4R, nppiHistogramRangeGetBufferSize_16u_C4R>::hist,
-        NppHistogramRangeC4<CV_16S, nppiHistogramRange_16s_C4R, nppiHistogramRangeGetBufferSize_16s_C4R>::hist,
-        0,
-        NppHistogramRangeC4<CV_32F, nppiHistogramRange_32f_C4R, nppiHistogramRangeGetBufferSize_32f_C4R>::hist
-    };
-
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
-}
-
-namespace hist
-{
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
-}
-
-void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    hist.create(1, 256, CV_32SC1);
-    hist.setTo(Scalar::all(0));
-
-    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    GpuMat hist;
-    GpuMat buf;
-    equalizeHist(src, dst, hist, buf, stream);
-}
-
-void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    dst.create(src.size(), src.type());
-
-    int intBufSize;
-    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
-
-    ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf);
-
-    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
-    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);
-
-    calcHist(src, hist, s);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStreamHandler h(stream);
-
-    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
-
-    hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
-}
-
-////////////////////////////////////////////////////////////////////////
-// cornerHarris & minEgenVal
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
-        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
-    {
-        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
-
-        if (ksize < 0)
-            scale *= 2.;
-
-        if (src.depth() == CV_8U)
-            scale *= 255.;
-
-        scale = 1./scale;
-
-        Dx.create(src.size(), CV_32F);
-        Dy.create(src.size(), CV_32F);
-
-        if (ksize > 0)
-        {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
-        }
-        else
-        {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
-        }
-    }
-}
-
-void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
-{
-    GpuMat Dx, Dy;
-    cornerHarris(src, dst, Dx, Dy, blockSize, ksize, k, borderType);
-}
-
-void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)
-{
-    GpuMat buf;
-    cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);
-}
-
-void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
-
-    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
-
-    dst.create(src.size(), CV_32F);
-
-    cornerHarris_gpu(blockSize, static_cast<float>(k), Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
-{
-    GpuMat Dx, Dy;
-    cornerMinEigenVal(src, dst, Dx, Dy, blockSize, ksize, borderType);
-}
-
-void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)
-{
-    GpuMat buf;
-    cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);
-}
-
-void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
-{
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
-
-    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
-
-    dst.create(src.size(), CV_32F);
-
-    cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// Canny
-
-void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
-{
-    if (apperture_size > 0)
-    {
-        ensureSizeIsEnough(image_size, CV_32SC1, dx);
-        ensureSizeIsEnough(image_size, CV_32SC1, dy);
-
-        if (apperture_size != 3)
-        {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
-        }
-    }
-
-    ensureSizeIsEnough(image_size, CV_32FC1, mag);
-    ensureSizeIsEnough(image_size, CV_32SC1, map);
-
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
-}
-
-void cv::gpu::CannyBuf::release()
-{
-    dx.release();
-    dy.release();
-    mag.release();
-    map.release();
-    st1.release();
-    st2.release();
-}
-
-namespace canny
-{
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
-
-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
-
-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
-
-    void getEdges(PtrStepSzi map, PtrStepSzb dst);
-}
-
-namespace
-{
-    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
-    {
-        using namespace canny;
-
-        buf.map.setTo(Scalar::all(0));
-        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
-
-        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
-
-        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
-
-        getEdges(buf.map, dst);
-    }
-}
-
-void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
-{
-    CannyBuf buf;
-    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
-}
-
-void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
-{
-    using namespace canny;
-
-    CV_Assert(src.type() == CV_8UC1);
-
-    if (!deviceSupports(SHARED_ATOMICS))
-        CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics");
-
-    if( low_thresh > high_thresh )
-        std::swap( low_thresh, high_thresh);
-
-    dst.create(src.size(), CV_8U);
-    buf.create(src.size(), apperture_size);
-
-    if (apperture_size == 3)
-    {
-        Size wholeSize;
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
-
-        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
-    }
-    else
-    {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
-
-        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
-    }
-
-    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
-}
-
-void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
-{
-    CannyBuf buf;
-    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
-}
-
-void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
-{
-    using namespace canny;
-
-    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
-    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
-
-    if( low_thresh > high_thresh )
-        std::swap( low_thresh, high_thresh);
-
-    dst.create(dx.size(), CV_8U);
-    buf.create(dx.size(), -1);
-
-    calcMagnitude(dx, dy, buf.mag, L2gradient);
-
-    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
-}
-
-////////////////////////////////////////////////////////////////////////
-// CLAHE
-
-namespace clahe
-{
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
-}
-
-namespace
-{
-    class CLAHE_Impl : public cv::gpu::CLAHE
-    {
-    public:
-        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
-
-        cv::AlgorithmInfo* info() const;
-
-        void apply(cv::InputArray src, cv::OutputArray dst);
-        void apply(InputArray src, OutputArray dst, Stream& stream);
-
-        void setClipLimit(double clipLimit);
-        double getClipLimit() const;
-
-        void setTilesGridSize(cv::Size tileGridSize);
-        cv::Size getTilesGridSize() const;
-
-        void collectGarbage();
-
-    private:
-        double clipLimit_;
-        int tilesX_;
-        int tilesY_;
-
-        GpuMat srcExt_;
-        GpuMat lut_;
-    };
-
-    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
-        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
-    {
-    }
-
-    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_GPU",
-        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
-        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
-        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
-
-    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
-    {
-        apply(_src, _dst, Stream::Null());
-    }
-
-    void CLAHE_Impl::apply(InputArray _src, OutputArray _dst, Stream& s)
-    {
-        GpuMat src = _src.getGpuMat();
-
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        GpuMat dst = _dst.getGpuMat();
-
-        const int histSize = 256;
-
-        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
-
-        cudaStream_t stream = StreamAccessor::getStream(s);
-
-        cv::Size tileSize;
-        GpuMat srcForLut;
-
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
-        {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
-        }
-        else
-        {
-            cv::gpu::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar(), s);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
-        }
-
-        const int tileSizeTotal = tileSize.area();
-        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
-
-        int clipLimit = 0;
-        if (clipLimit_ > 0.0)
-        {
-            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
-            clipLimit = std::max(clipLimit, 1);
-        }
-
-        clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
-
-        clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
-    }
-
-    void CLAHE_Impl::setClipLimit(double clipLimit)
-    {
-        clipLimit_ = clipLimit;
-    }
-
-    double CLAHE_Impl::getClipLimit() const
-    {
-        return clipLimit_;
-    }
-
-    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
-    {
-        tilesX_ = tileGridSize.width;
-        tilesY_ = tileGridSize.height;
-    }
-
-    cv::Size CLAHE_Impl::getTilesGridSize() const
-    {
-        return cv::Size(tilesX_, tilesY_);
-    }
-
-    void CLAHE_Impl::collectGarbage()
-    {
-        srcExt_.release();
-        lut_.release();
-    }
-}
-
-cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double clipLimit, cv::Size tileGridSize)
-{
-    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
-}
-
-////////////////////////////////////////////////////////////////////////
-// alphaComp
-
-namespace
-{
-    template <int DEPTH> struct NppAlphaCompFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
-    };
-
-    template <int DEPTH, typename NppAlphaCompFunc<DEPTH>::func_t func> struct NppAlphaComp
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize oSizeROI;
-            oSizeROI.width = img1.cols;
-            oSizeROI.height = img2.rows;
-
-            nppSafeCall( func(img1.ptr<npp_t>(), static_cast<int>(img1.step), img2.ptr<npp_t>(), static_cast<int>(img2.step),
-                              dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, eAlphaOp) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream)
-{
-    static const NppiAlphaOp npp_alpha_ops[] = {
-        NPPI_OP_ALPHA_OVER,
-        NPPI_OP_ALPHA_IN,
-        NPPI_OP_ALPHA_OUT,
-        NPPI_OP_ALPHA_ATOP,
-        NPPI_OP_ALPHA_XOR,
-        NPPI_OP_ALPHA_PLUS,
-        NPPI_OP_ALPHA_OVER_PREMUL,
-        NPPI_OP_ALPHA_IN_PREMUL,
-        NPPI_OP_ALPHA_OUT_PREMUL,
-        NPPI_OP_ALPHA_ATOP_PREMUL,
-        NPPI_OP_ALPHA_XOR_PREMUL,
-        NPPI_OP_ALPHA_PLUS_PREMUL,
-        NPPI_OP_ALPHA_PREMUL
-    };
-
-    typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream);
-
-    static const func_t funcs[] =
-    {
-        NppAlphaComp<CV_8U, nppiAlphaComp_8u_AC4R>::call,
-        0,
-        NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
-        0,
-        NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
-        NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
-    };
-
-    CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
-    CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
-
-    dst.create(img1.size(), img1.type());
-
-    const func_t func = funcs[img1.depth()];
-
-    func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
-}
-
-#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp
deleted file mode 100644
index d78828bf17..0000000000
--- a/modules/gpu/src/match_template.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-
-#else
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace match_template
-    {
-        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
-        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
-
-        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
-        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
-
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
-            int cn, cudaStream_t stream);
-
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
-            int cn, cudaStream_t stream);
-
-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_8UC2(
-            int w, int h,
-            const PtrStepSz<unsigned int> image_sum_r,
-            const PtrStepSz<unsigned int> image_sum_g,
-            unsigned int templ_sum_r,
-            unsigned int templ_sum_g,
-            PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_8UC3(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_8UC4(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                const PtrStepSz<unsigned int> image_sum_a,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                unsigned int templ_sum_a,
-                PtrStepSzf result, cudaStream_t stream);
-
-
-        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                int w, int h, const PtrStepSz<unsigned int> image_sum,
-                const PtrStepSz<unsigned long long> image_sqsum,
-                unsigned int templ_sum, unsigned long long templ_sqsum,
-                PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                PtrStepSzf result, cudaStream_t stream);
-        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
-                PtrStepSzf result, cudaStream_t stream);
-
-        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
-
-        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::match_template;
-
-namespace
-{
-
-    // Evaluates optimal template's area threshold. If
-    // template's area is less  than the threshold, we use naive match
-    // template version, otherwise FFT-based (if available)
-    int getTemplateThreshold(int method, int depth)
-    {
-        switch (method)
-        {
-        case cv::TM_CCORR:
-            if (depth == CV_32F) return 250;
-            if (depth == CV_8U) return 300;
-            break;
-        case cv::TM_SQDIFF:
-            if (depth == CV_8U) return 300;
-            break;
-        }
-        CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
-        return 0;
-    }
-
-
-    void matchTemplate_CCORR_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
-        {
-            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
-            return;
-        }
-
-        ConvolveBuf convolve_buf;
-        convolve_buf.user_block_size = buf.user_block_size;
-
-        if (image.channels() == 1)
-            convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
-        else
-        {
-            GpuMat result_;
-            convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
-            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
-        }
-    }
-
-
-    void matchTemplate_CCORR_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
-        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
-            return;
-        }
-
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
-    }
-
-
-    void matchTemplate_CCORR_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-
-        buf.image_sqsums.resize(1);
-        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
-
-        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
-    }
-
-
-    void matchTemplate_SQDIFF_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        (void)buf;
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
-    }
-
-
-    void matchTemplate_SQDIFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
-        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
-            return;
-        }
-
-        buf.image_sqsums.resize(1);
-        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
-
-        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
-    }
-
-
-    void matchTemplate_SQDIFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        buf.image_sqsums.resize(1);
-        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
-
-        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
-    }
-
-
-    void matchTemplate_CCOFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-
-        if (image.channels() == 1)
-        {
-            buf.image_sums.resize(1);
-            integral(image, buf.image_sums[0], stream);
-
-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
-            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
-        }
-        else
-        {
-            split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
-            for (int i = 0; i < image.channels(); ++i)
-                integral(buf.images[i], buf.image_sums[i], stream);
-
-            Scalar templ_sum = sum(templ);
-
-            switch (image.channels())
-            {
-            case 2:
-                matchTemplatePrepared_CCOFF_8UC2(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
-                        result, StreamAccessor::getStream(stream));
-                break;
-            case 3:
-                matchTemplatePrepared_CCOFF_8UC3(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
-                        result, StreamAccessor::getStream(stream));
-                break;
-            case 4:
-                matchTemplatePrepared_CCOFF_8UC4(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
-                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
-                break;
-            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
-            }
-        }
-    }
-
-
-    void matchTemplate_CCOFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
-
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
-
-        if (image.channels() == 1)
-        {
-            buf.image_sums.resize(1);
-            integral(image, buf.image_sums[0], stream);
-            buf.image_sqsums.resize(1);
-            sqrIntegral(image, buf.image_sqsums[0], stream);
-
-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
-            unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ)[0];
-
-            matchTemplatePrepared_CCOFF_NORMED_8U(
-                    templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
-                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
-        }
-        else
-        {
-            split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
-            buf.image_sqsums.resize(buf.images.size());
-            for (int i = 0; i < image.channels(); ++i)
-            {
-                integral(buf.images[i], buf.image_sums[i], stream);
-                sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
-            }
-
-            Scalar templ_sum = sum(templ);
-            Scalar templ_sqsum = sqrSum(templ);
-
-            switch (image.channels())
-            {
-            case 2:
-                matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
-                        result, StreamAccessor::getStream(stream));
-                break;
-            case 3:
-                matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
-                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
-                        result, StreamAccessor::getStream(stream));
-                break;
-            case 4:
-                matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
-                        buf.image_sums[3], buf.image_sqsums[3],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
-                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
-                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
-                        result, StreamAccessor::getStream(stream));
-                break;
-            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
-            }
-        }
-    }
-}
-
-
-void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
-{
-    MatchTemplateBuf buf;
-    matchTemplate(image, templ, result, method, buf, stream);
-}
-
-
-void cv::gpu::matchTemplate(
-        const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
-        MatchTemplateBuf &buf, Stream& stream)
-{
-    CV_Assert(image.type() == templ.type());
-    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
-
-    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
-
-    static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
-                                        ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
-                                        ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
-    static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
-                                         ::matchTemplate_CCORR_32F, 0, 0, 0 };
-
-    const Caller* callers = 0;
-    switch (image.depth())
-    {
-        case CV_8U: callers = callers8U; break;
-        case CV_32F: callers = callers32F; break;
-        default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
-    }
-
-    Caller caller = callers[method];
-    CV_Assert(caller);
-    caller(image, templ, result, buf, stream);
-}
-
-#endif
diff --git a/modules/gpu/src/mssegmentation.cpp b/modules/gpu/src/mssegmentation.cpp
deleted file mode 100644
index 7f02168e1a..0000000000
--- a/modules/gpu/src/mssegmentation.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-void cv::gpu::meanShiftSegmentation(const GpuMat&, Mat&, int, int, int, TermCriteria) { throw_no_cuda(); }
-
-#else
-
-// Auxiliray stuff
-namespace
-{
-
-//
-// Declarations
-//
-
-class DjSets
-{
-public:
-    DjSets(int n);
-    int find(int elem);
-    int merge(int set1, int set2);
-
-    std::vector<int> parent;
-    std::vector<int> rank;
-    std::vector<int> size;
-private:
-    DjSets(const DjSets&);
-    void operator =(const DjSets&);
-};
-
-
-template <typename T>
-struct GraphEdge
-{
-    GraphEdge() {}
-    GraphEdge(int to_, int next_, const T& val_) : to(to_), next(next_), val(val_) {}
-    int to;
-    int next;
-    T val;
-};
-
-
-template <typename T>
-class Graph
-{
-public:
-    typedef GraphEdge<T> Edge;
-
-    Graph(int numv, int nume_max);
-
-    void addEdge(int from, int to, const T& val=T());
-
-    std::vector<int> start;
-    std::vector<Edge> edges;
-
-    int numv;
-    int nume_max;
-    int nume;
-private:
-    Graph(const Graph&);
-    void operator =(const Graph&);
-};
-
-
-struct SegmLinkVal
-{
-    SegmLinkVal() {}
-    SegmLinkVal(int dr_, int dsp_) : dr(dr_), dsp(dsp_) {}
-    bool operator <(const SegmLinkVal& other) const
-    {
-        return dr + dsp < other.dr + other.dsp;
-    }
-    int dr;
-    int dsp;
-};
-
-
-struct SegmLink
-{
-    SegmLink() {}
-    SegmLink(int from_, int to_, const SegmLinkVal& val_)
-        : from(from_), to(to_), val(val_) {}
-    bool operator <(const SegmLink& other) const
-    {
-        return val < other.val;
-    }
-    int from;
-    int to;
-    SegmLinkVal val;
-};
-
-//
-// Implementation
-//
-
-DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
-{
-    for (int i = 0; i < n; ++i)
-        parent[i] = i;
-}
-
-
-inline int DjSets::find(int elem)
-{
-    int set = elem;
-    while (set != parent[set])
-        set = parent[set];
-    while (elem != parent[elem])
-    {
-        int next = parent[elem];
-        parent[elem] = set;
-        elem = next;
-    }
-    return set;
-}
-
-
-inline int DjSets::merge(int set1, int set2)
-{
-    if (rank[set1] < rank[set2])
-    {
-        parent[set1] = set2;
-        size[set2] += size[set1];
-        return set2;
-    }
-    if (rank[set2] < rank[set1])
-    {
-        parent[set2] = set1;
-        size[set1] += size[set2];
-        return set1;
-    }
-    parent[set1] = set2;
-    rank[set2]++;
-    size[set2] += size[set1];
-    return set2;
-}
-
-
-template <typename T>
-Graph<T>::Graph(int numv_, int nume_max_) : start(numv_, -1), edges(nume_max_)
-{
-    this->numv = numv_;
-    this->nume_max = nume_max_;
-    nume = 0;
-}
-
-
-template <typename T>
-inline void Graph<T>::addEdge(int from, int to, const T& val)
-{
-    edges[nume] = Edge(to, start[from], val);
-    start[from] = nume;
-    nume++;
-}
-
-
-inline int pix(int y, int x, int ncols)
-{
-    return y * ncols + x;
-}
-
-
-inline int sqr(int x)
-{
-    return x * x;
-}
-
-
-inline int dist2(const cv::Vec4b& lhs, const cv::Vec4b& rhs)
-{
-    return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
-}
-
-
-inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
-{
-    return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
-}
-
-} // anonymous namespace
-
-
-void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
-{
-    CV_Assert(src.type() == CV_8UC4);
-    const int nrows = src.rows;
-    const int ncols = src.cols;
-    const int hr = sr;
-    const int hsp = sp;
-
-    // Perform mean shift procedure and obtain region and spatial maps
-    GpuMat d_rmap, d_spmap;
-    meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
-    Mat rmap(d_rmap);
-    Mat spmap(d_spmap);
-
-    Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
-                                        + (nrows - 1) + (ncols - 1));
-
-    // Make region adjacent graph from image
-    Vec4b r1;
-    Vec4b r2[4];
-    Vec2s sp1;
-    Vec2s sp2[4];
-    int dr[4];
-    int dsp[4];
-    for (int y = 0; y < nrows - 1; ++y)
-    {
-        Vec4b* ry = rmap.ptr<Vec4b>(y);
-        Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
-        Vec2s* spy = spmap.ptr<Vec2s>(y);
-        Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
-        for (int x = 0; x < ncols - 1; ++x)
-        {
-            r1 = ry[x];
-            sp1 = spy[x];
-
-            r2[0] = ry[x + 1];
-            r2[1] = ryp[x];
-            r2[2] = ryp[x + 1];
-            r2[3] = ryp[x];
-
-            sp2[0] = spy[x + 1];
-            sp2[1] = spyp[x];
-            sp2[2] = spyp[x + 1];
-            sp2[3] = spyp[x];
-
-            dr[0] = dist2(r1, r2[0]);
-            dr[1] = dist2(r1, r2[1]);
-            dr[2] = dist2(r1, r2[2]);
-            dsp[0] = dist2(sp1, sp2[0]);
-            dsp[1] = dist2(sp1, sp2[1]);
-            dsp[2] = dist2(sp1, sp2[2]);
-
-            r1 = ry[x + 1];
-            sp1 = spy[x + 1];
-
-            dr[3] = dist2(r1, r2[3]);
-            dsp[3] = dist2(sp1, sp2[3]);
-
-            g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-            g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
-            g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
-            g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
-        }
-    }
-    for (int y = 0; y < nrows - 1; ++y)
-    {
-        r1 = rmap.at<Vec4b>(y, ncols - 1);
-        r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
-        sp1 = spmap.at<Vec2s>(y, ncols - 1);
-        sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
-        dr[0] = dist2(r1, r2[0]);
-        dsp[0] = dist2(sp1, sp2[0]);
-        g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-    }
-    for (int x = 0; x < ncols - 1; ++x)
-    {
-        r1 = rmap.at<Vec4b>(nrows - 1, x);
-        r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
-        sp1 = spmap.at<Vec2s>(nrows - 1, x);
-        sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
-        dr[0] = dist2(r1, r2[0]);
-        dsp[0] = dist2(sp1, sp2[0]);
-        g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-    }
-
-    DjSets comps(g.numv);
-
-    // Find adjacent components
-    for (int v = 0; v < g.numv; ++v)
-    {
-        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
-        {
-            int c1 = comps.find(v);
-            int c2 = comps.find(g.edges[e_it].to);
-            if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
-                comps.merge(c1, c2);
-        }
-    }
-
-    std::vector<SegmLink> edges;
-    edges.reserve(g.numv);
-
-    // Prepare edges connecting differnet components
-    for (int v = 0; v < g.numv; ++v)
-    {
-        int c1 = comps.find(v);
-        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
-        {
-            int c2 = comps.find(g.edges[e_it].to);
-            if (c1 != c2)
-                edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
-        }
-    }
-
-    // Sort all graph's edges connecting differnet components (in asceding order)
-    sort(edges.begin(), edges.end());
-
-    // Exclude small components (starting from the nearest couple)
-    for (size_t i = 0; i < edges.size(); ++i)
-    {
-        int c1 = comps.find(edges[i].from);
-        int c2 = comps.find(edges[i].to);
-        if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
-            comps.merge(c1, c2);
-    }
-
-    // Compute sum of the pixel's colors which are in the same segment
-    Mat h_src(src);
-    std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
-    for (int y = 0; y < nrows; ++y)
-    {
-        Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
-        for (int x = 0; x < ncols; ++x)
-        {
-            int parent = comps.find(pix(y, x, ncols));
-            Vec4b col = h_srcy[x];
-            Vec4i& sumcol = sumcols[parent];
-            sumcol[0] += col[0];
-            sumcol[1] += col[1];
-            sumcol[2] += col[2];
-        }
-    }
-
-    // Create final image, color of each segment is the average color of its pixels
-    dst.create(src.size(), src.type());
-
-    for (int y = 0; y < nrows; ++y)
-    {
-        Vec4b* dsty = dst.ptr<Vec4b>(y);
-        for (int x = 0; x < ncols; ++x)
-        {
-            int parent = comps.find(pix(y, x, ncols));
-            const Vec4i& sumcol = sumcols[parent];
-            Vec4b& dstcol = dsty[x];
-            dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
-            dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
-            dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
-            dstcol[3] = 255;
-        }
-    }
-}
-
-#endif // #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
diff --git a/modules/gpu/src/pyramids.cpp b/modules/gpu/src/pyramids.cpp
deleted file mode 100644
index 9e9fbe3437..0000000000
--- a/modules/gpu/src/pyramids.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_no_cuda(); }
-
-#else // HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////////////
-// pyrDown
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-    static const func_t funcs[6][4] =
-    {
-        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
-        {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
-        {pyrDown_gpu<ushort>     , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3>     , pyrDown_gpu<ushort4>     },
-        {pyrDown_gpu<short>      , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3>      , pyrDown_gpu<short4>      },
-        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
-        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-
-    const func_t func = funcs[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-
-    func(src, dst, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// pyrUp
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-    static const func_t funcs[6][4] =
-    {
-        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
-        {0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
-        {pyrUp_gpu<ushort>     , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3>     , pyrUp_gpu<ushort4>     },
-        {pyrUp_gpu<short>      , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3>      , pyrUp_gpu<short4>      },
-        {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
-        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-
-    const func_t func = funcs[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    dst.create(src.rows * 2, src.cols * 2, src.type());
-
-    func(src, dst, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// ImagePyramid
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace pyramid
-    {
-        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
-{
-    using namespace cv::gpu::cudev::pyramid;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-    static const func_t funcs[6][4] =
-    {
-        {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
-        {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
-        {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
-        {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
-        {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
-        {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
-    };
-
-    CV_Assert(img.depth() <= CV_32F && img.channels() <= 4);
-
-    const func_t func = funcs[img.depth()][img.channels() - 1];
-    CV_Assert(func != 0);
-
-    layer0_ = img;
-    Size szLastLayer = img.size();
-    nLayers_ = 1;
-
-    if (numLayers <= 0)
-        numLayers = 255; //it will cut-off when any of the dimensions goes 1
-
-    pyramid_.resize(numLayers);
-
-    for (int i = 0; i < numLayers - 1; ++i)
-    {
-        Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
-
-        if (szCurLayer.width == 0 || szCurLayer.height == 0)
-            break;
-
-        ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
-        nLayers_++;
-
-        const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
-
-        func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
-
-        szLastLayer = szCurLayer;
-    }
-}
-
-void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
-{
-    using namespace cv::gpu::cudev::pyramid;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-    static const func_t funcs[6][4] =
-    {
-        {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
-        {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
-        {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
-        {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
-        {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
-        {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
-    };
-
-    CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
-
-    ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
-
-    const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
-    CV_Assert(func != 0);
-
-    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
-    {
-        if (stream)
-            stream.enqueueCopy(layer0_, outImg);
-        else
-            layer0_.copyTo(outImg);
-    }
-
-    float lastScale = 1.0f;
-    float curScale;
-    GpuMat lastLayer = layer0_;
-    GpuMat curLayer;
-
-    for (int i = 0; i < nLayers_ - 1; ++i)
-    {
-        curScale = lastScale * 0.5f;
-        curLayer = pyramid_[i];
-
-        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
-        {
-            if (stream)
-                stream.enqueueCopy(curLayer, outImg);
-            else
-                curLayer.copyTo(outImg);
-        }
-
-        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
-            break;
-
-        lastScale = curScale;
-        lastLayer = curLayer;
-    }
-
-    func(lastLayer, outImg, StreamAccessor::getStream(stream));
-}
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/src/remap.cpp b/modules/gpu/src/remap.cpp
deleted file mode 100644
index 315766546b..0000000000
--- a/modules/gpu/src/remap.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, Scalar, Stream&){ throw_no_cuda(); }
-
-#else // HAVE_CUDA
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T>
-        void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
-                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    }
-}}}
-
-void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
-        int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-    static const func_t funcs[6][4] =
-    {
-        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
-        {0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/  , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
-        {remap_gpu<ushort>     , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3>    , remap_gpu<ushort4>    },
-        {remap_gpu<short>      , 0 /*remap_gpu<short2>*/ , remap_gpu<short3>     , remap_gpu<short4>     },
-        {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
-        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-
-    const func_t func = funcs[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(xmap.size(), src.type());
-
-    Scalar_<float> borderValueFloat;
-    borderValueFloat = borderValue;
-
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
-        dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
-}
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp
deleted file mode 100644
index 32afa54de9..0000000000
--- a/modules/gpu/src/resize.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
-{
-    (void)src;
-    (void)dst;
-    (void)dsize;
-    (void)fx;
-    (void)fy;
-    (void)interpolation;
-    (void)s;
-
-    throw_no_cuda();
-}
-
-#else // HAVE_CUDA
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T>
-        void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
-                        PtrStepSzb dst, int interpolation, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
-{
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
-            || interpolation == INTER_CUBIC || interpolation == INTER_AREA);
-    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
-
-    if (dsize == Size())
-        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-    else
-    {
-        fx = static_cast<double>(dsize.width) / src.cols;
-        fy = static_cast<double>(dsize.height) / src.rows;
-    }
-    if (dsize != dst.size())
-        dst.create(dsize, src.type());
-
-    if (dsize == src.size())
-    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
-        return;
-    }
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
-    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
-
-    if (useNpp)
-    {
-        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
-                                    double xFactor, double yFactor, int eInterpolation);
-
-        const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
-
-        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
-
-        NppiSize srcsz;
-        srcsz.width  = wholeSize.width;
-        srcsz.height = wholeSize.height;
-
-        NppiRect srcrect;
-        srcrect.x = ofs.x;
-        srcrect.y = ofs.y;
-        srcrect.width  = src.cols;
-        srcrect.height = src.rows;
-
-        NppiSize dstsz;
-        dstsz.width  = dst.cols;
-        dstsz.height = dst.rows;
-
-        NppStreamHandler h(stream);
-
-        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        using namespace ::cv::gpu::cudev::imgproc;
-
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        static const func_t funcs[6][4] =
-        {
-            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
-            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
-            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
-            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
-            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
-            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
-        };
-
-        const func_t func = funcs[src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
-            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
-    }
-}
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/src/warp.cpp b/modules/gpu/src/warp.cpp
deleted file mode 100644
index 007091e6a3..0000000000
--- a/modules/gpu/src/warp.cpp
+++ /dev/null
@@ -1,454 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-
-void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
-void cv::gpu::buildWarpAffineMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
-void cv::gpu::buildWarpPerspectiveMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else // HAVE_CUDA
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
-
-        template <typename T>
-        void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
-
-        template <typename T>
-        void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    }
-}}}
-
-void cv::gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    CV_Assert(M.rows == 2 && M.cols == 3);
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[2 * 3];
-    Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invertAffineTransform(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    CV_Assert(M.rows == 3 && M.cols == 3);
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[3 * 3];
-    Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invert(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
-}
-
-namespace
-{
-    template<int DEPTH> struct NppTypeTraits;
-    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
-    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
-    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
-    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
-    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
-
-    template <int DEPTH> struct NppWarpFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_t* pDst,
-                                    int dstStep, NppiRect dstRoi, const double coeffs[][3],
-                                    int interpolation);
-    };
-
-    template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
-    {
-        typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
-        {
-            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
-
-            NppiSize srcsz;
-            srcsz.height = src.rows;
-            srcsz.width = src.cols;
-
-            NppiRect srcroi;
-            srcroi.x = 0;
-            srcroi.y = 0;
-            srcroi.height = src.rows;
-            srcroi.width = src.cols;
-
-            NppiRect dstroi;
-            dstroi.x = 0;
-            dstroi.y = 0;
-            dstroi.height = dst.rows;
-            dstroi.width = dst.cols;
-
-            cv::gpu::NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
-                              dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
-                              coeffs, npp_inter[interpolation]) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
-{
-    CV_Assert(M.rows == 2 && M.cols == 3);
-
-    int interpolation = flags & INTER_MAX;
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-
-    dst.create(dsize, src.type());
-
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    static const bool useNppTab[6][4][3] =
-    {
-        {
-            {false, false, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, false}
-        },
-        {
-            {false, false, false},
-            {false, false, false},
-            {false, false, false},
-            {false, false, false}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, false}
-        },
-        {
-            {false, false, false},
-            {false, false, false},
-            {false, false, false},
-            {false, false, false}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, true}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, true}
-        }
-    };
-
-    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
-    // NPP bug on float data
-    useNpp = useNpp && src.depth() != CV_32F;
-
-    if (useNpp)
-    {
-        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
-
-        static const func_t funcs[2][6][4] =
-        {
-            {
-                {NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
-            },
-            {
-                {NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
-            }
-        };
-
-        dst.setTo(borderValue);
-
-        double coeffs[2][3];
-        Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
-        M.convertTo(coeffsMat, coeffsMat.type());
-
-        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
-    }
-    else
-    {
-        using namespace cv::gpu::cudev::imgproc;
-
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        static const func_t funcs[6][4] =
-        {
-            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
-            {0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/  , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
-            {warpAffine_gpu<ushort>     , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3>    , warpAffine_gpu<ushort4>    },
-            {warpAffine_gpu<short>      , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3>     , warpAffine_gpu<short4>     },
-            {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
-            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
-        };
-
-        const func_t func = funcs[src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        int gpuBorderType;
-        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-        float coeffs[2 * 3];
-        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
-
-        if (flags & WARP_INVERSE_MAP)
-            M.convertTo(coeffsMat, coeffsMat.type());
-        else
-        {
-            cv::Mat iM;
-            invertAffineTransform(M, iM);
-            iM.convertTo(coeffsMat, coeffsMat.type());
-        }
-
-        Scalar_<float> borderValueFloat;
-        borderValueFloat = borderValue;
-
-        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
-    }
-}
-
-void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
-{
-    CV_Assert(M.rows == 3 && M.cols == 3);
-
-    int interpolation = flags & INTER_MAX;
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-
-    dst.create(dsize, src.type());
-
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    static const bool useNppTab[6][4][3] =
-    {
-        {
-            {false, false, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, false}
-        },
-        {
-            {false, false, false},
-            {false, false, false},
-            {false, false, false},
-            {false, false, false}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, false}
-        },
-        {
-            {false, false, false},
-            {false, false, false},
-            {false, false, false},
-            {false, false, false}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, true}
-        },
-        {
-            {false, true, true},
-            {false, false, false},
-            {false, true, true},
-            {false, false, true}
-        }
-    };
-
-    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
-    // NPP bug on float data
-    useNpp = useNpp && src.depth() != CV_32F;
-
-    if (useNpp)
-    {
-        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
-
-        static const func_t funcs[2][6][4] =
-        {
-            {
-                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
-            },
-            {
-                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
-                {0, 0, 0, 0},
-                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
-                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
-            }
-        };
-
-        dst.setTo(borderValue);
-
-        double coeffs[3][3];
-        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
-        M.convertTo(coeffsMat, coeffsMat.type());
-
-        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
-    }
-    else
-    {
-        using namespace cv::gpu::cudev::imgproc;
-
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        static const func_t funcs[6][4] =
-        {
-            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
-            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
-            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
-            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
-            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
-            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
-        };
-
-        const func_t func = funcs[src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        int gpuBorderType;
-        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-        float coeffs[3 * 3];
-        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
-
-        if (flags & WARP_INVERSE_MAP)
-            M.convertTo(coeffsMat, coeffsMat.type());
-        else
-        {
-            cv::Mat iM;
-            invert(M, iM);
-            iM.convertTo(coeffsMat, coeffsMat.type());
-        }
-
-        Scalar_<float> borderValueFloat;
-        borderValueFloat = borderValue;
-
-        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
-    }
-}
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/interpolation.hpp b/modules/gpu/test/interpolation.hpp
deleted file mode 100644
index 7a00143e1d..0000000000
--- a/modules/gpu/test/interpolation.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-#include "opencv2/core.hpp"
-#include "opencv2/imgproc.hpp"
-
-template <typename T> T readVal(const cv::Mat& src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-{
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
-
-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        return readVal<T>(src, int(y), int(x), c, border_type, borderVal);
-    }
-};
-
-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
-
-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-template <typename T> struct CubicInterpolator
-{
-    static float bicubicCoeff(float x_)
-    {
-        float x = fabsf(x_);
-        if (x <= 1.0f)
-        {
-            return x * x * (1.5f * x - 2.5f) + 1.0f;
-        }
-        else if (x < 2.0f)
-        {
-            return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
-        }
-        else
-        {
-            return 0.0f;
-        }
-    }
-
-    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        const float xmin = ceilf(x - 2.0f);
-        const float xmax = floorf(x + 2.0f);
-
-        const float ymin = ceilf(y - 2.0f);
-        const float ymax = floorf(y + 2.0f);
-
-        float sum  = 0.0f;
-        float wsum = 0.0f;
-
-        for (float cy = ymin; cy <= ymax; cy += 1.0f)
-        {
-            for (float cx = xmin; cx <= xmax; cx += 1.0f)
-            {
-                const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy);
-                sum += w * readVal<T>(src, (int) floorf(cy), (int) floorf(cx), c, border_type, borderVal);
-                wsum += w;
-            }
-        }
-
-        float res = (!wsum)? 0 : sum / wsum;
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/gpu/test/test_color.cpp b/modules/gpu/test/test_color.cpp
deleted file mode 100644
index 4bd53c9194..0000000000
--- a/modules/gpu/test/test_color.cpp
+++ /dev/null
@@ -1,2503 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// cvtColor
-
-PARAM_TEST_CASE(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int depth;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        depth = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-
-        img = randomMat(size, CV_MAKE_TYPE(depth, 3), 0.0, depth == CV_32F ? 1.0 : 255.0);
-    }
-};
-
-GPU_TEST_P(CvtColor, BGR2RGB)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR2RGBA)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2RGBA);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2RGBA);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR2BGRA)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGRA);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGRA);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2RGB)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2RGBA)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2RGBA);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2RGBA);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR2GRAY)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGB2GRAY)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, GRAY2BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, GRAY2BGRA)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGRA, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGRA, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2GRAY)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGBA2GRAY)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2BGR565)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGR565);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGR565);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, RGB2BGR565)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2BGR565);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2BGR565);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5652BGR)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5652RGB)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2BGR565)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR565);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR565);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, RGBA2BGR565)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2BGR565);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2BGR565);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5652BGRA)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652BGRA, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652BGRA, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5652RGBA)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652RGBA, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652RGBA, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, GRAY2BGR565)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR565);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR565);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5652GRAY)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR2BGR555)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGR555);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGR555);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, RGB2BGR555)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2BGR555);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2BGR555);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5552BGR)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5552RGB)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGRA2BGR555)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR555);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR555);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, RGBA2BGR555)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2BGR555);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2BGR555);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5552BGRA)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552BGRA, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552BGRA, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5552RGBA)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552RGBA, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552RGBA, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, GRAY2BGR555)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR555);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR555);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR5552GRAY)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-GPU_TEST_P(CvtColor, BGR2XYZ)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGB2XYZ)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2XYZ);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2XYZ);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2XYZ4)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGRA2XYZ4)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, XYZ2BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, XYZ2RGB)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, XYZ42BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, XYZ42BGRA)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2YCrCb)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YCrCb);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGB2YCrCb)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2YCrCb);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YCrCb);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2YCrCb4)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YCrCb, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGBA2YCrCb4)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YCrCb, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YCrCb2BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YCrCb2RGB)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YCrCb42RGB)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YCrCb42RGBA)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2HSV)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HSV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HSV);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HSV)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HSV4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGBA2HSV4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, BGR2HLS)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HLS);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HLS);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HLS)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HLS4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGBA2HLS4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV2BGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV2RGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV42BGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV42BGRA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS2BGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS2RGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS42RGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS42RGBA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, BGR2HSV_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HSV_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HSV_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HSV_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HSV4_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV_FULL, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV_FULL);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGBA2HSV4_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV_FULL, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV_FULL);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, BGR2HLS_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HLS_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HLS_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HLS_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGB2HLS4_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS_FULL, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS_FULL);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, RGBA2HLS4_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS_FULL, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS_FULL);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV2BGR_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV_FULL);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV2RGB_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV_FULL);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2RGB_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2RGB_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV42RGB_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2RGB_FULL);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2RGB_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HSV42RGBA_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HSV_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2RGB_FULL, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2RGB_FULL, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS2BGR_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS_FULL);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2BGR_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2BGR_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS2RGB_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS_FULL);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS42RGB_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB_FULL);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB_FULL);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, HLS42RGBA_FULL)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2HLS_FULL);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB_FULL, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB_FULL, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
-}
-
-GPU_TEST_P(CvtColor, BGR2YUV)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YUV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YUV);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGB2YUV)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2YUV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YUV);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YUV2BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YUV);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YUV2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YUV2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YUV42BGR)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YUV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YUV2BGR);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YUV2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YUV42BGRA)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2YUV);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YUV2BGR, 4);
-
-    cv::Mat channels[4];
-    cv::split(src, channels);
-    channels[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
-    cv::merge(channels, 4, src);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YUV2BGR, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, YUV2RGB)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_RGB2YUV);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YUV2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_YUV2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2YUV4)
-{
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YUV, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YUV);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, RGBA2YUV4)
-{
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2YUV, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YUV);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2Lab)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2Lab);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2Lab);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, RGB2Lab)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2Lab);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2Lab);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, BGRA2Lab4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2Lab, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2Lab);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LBGR2Lab)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LBGR2Lab);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LBGR2Lab);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LRGB2Lab)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LRGB2Lab);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LRGB2Lab);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LBGRA2Lab4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LBGR2Lab, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LBGR2Lab);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, Lab2BGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, Lab2RGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, Lab2BGRA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2BGR, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, Lab2LBGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2LBGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2LBGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, Lab2LRGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2LRGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2LRGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, Lab2LRGBA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Lab);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Lab2LRGB, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Lab2LRGB, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-5);
-}
-
-GPU_TEST_P(CvtColor, BGR2Luv)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2Luv);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2Luv);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, RGB2Luv)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2Luv);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2Luv);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, BGRA2Luv4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2Luv, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2Luv);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LBGR2Luv)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LBGR2Luv);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LBGR2Luv);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LRGB2Luv)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src = img;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LRGB2Luv);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LRGB2Luv);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, LBGRA2Luv4)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_LBGR2Luv, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_LBGR2Luv);
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat channels[4];
-    cv::split(h_dst, channels);
-    cv::merge(channels, 3, h_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_8U ? 1 : 1e-3);
-}
-
-GPU_TEST_P(CvtColor, Luv2BGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-GPU_TEST_P(CvtColor, Luv2RGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2RGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2RGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-GPU_TEST_P(CvtColor, Luv2BGRA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2BGR, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-GPU_TEST_P(CvtColor, Luv2LBGR)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2LBGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2LBGR);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-GPU_TEST_P(CvtColor, Luv2LRGB)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2LRGB);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2LRGB);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-GPU_TEST_P(CvtColor, Luv2LRGBA)
-{
-    if (depth == CV_16U)
-        return;
-
-    cv::Mat src;
-    cv::cvtColor(img, src, cv::COLOR_BGR2Luv);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_Luv2LRGB, 4);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_Luv2LRGB, 4);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_8U ? 1 : 1e-4);
-}
-
-#if defined (CUDA_VERSION) && (CUDA_VERSION >= 5000)
-
-GPU_TEST_P(CvtColor, RGBA2mRGBA)
-{
-    if (depth != CV_8U)
-        return;
-
-    cv::Mat src = randomMat(size, CV_MAKE_TYPE(depth, 4));
-
-    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2mRGBA);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2mRGBA);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1);
-}
-
-#endif // defined (CUDA_VERSION) && (CUDA_VERSION >= 5000)
-
-GPU_TEST_P(CvtColor, BayerBG2BGR)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerBG2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerBG2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerBG2BGR4)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerBG2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerBG2BGR);
-
-    cv::Mat dst4(dst);
-    cv::Mat dst3;
-    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
-
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerGB2BGR)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGB2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGB2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerGB2BGR4)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGB2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGB2BGR);
-
-    cv::Mat dst4(dst);
-    cv::Mat dst3;
-    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerRG2BGR)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerRG2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerRG2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerRG2BGR4)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerRG2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerRG2BGR);
-
-    cv::Mat dst4(dst);
-    cv::Mat dst3;
-    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerGR2BGR)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGR2BGR);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGR2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerGR2BGR4)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGR2BGR, 4);
-
-    ASSERT_EQ(4, dst.channels());
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGR2BGR);
-
-    cv::Mat dst4(dst);
-    cv::Mat dst3;
-    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
-}
-
-GPU_TEST_P(CvtColor, BayerBG2Gray)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerBG2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerBG2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
-}
-
-GPU_TEST_P(CvtColor, BayerGB2Gray)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGB2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGB2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
-}
-
-GPU_TEST_P(CvtColor, BayerRG2Gray)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerRG2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerRG2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
-}
-
-GPU_TEST_P(CvtColor, BayerGR2Gray)
-{
-    if ((depth != CV_8U && depth != CV_16U) || useRoi)
-        return;
-
-    cv::Mat src = randomMat(size, depth);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGR2GRAY);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGR2GRAY);
-
-    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Demosaicing
-
-struct Demosaicing : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-
-    static void mosaic(const cv::Mat_<cv::Vec3b>& src, cv::Mat_<uchar>& dst, cv::Point firstRed)
-    {
-        dst.create(src.size());
-
-        for (int y = 0; y < src.rows; ++y)
-        {
-            for (int x = 0; x < src.cols; ++x)
-            {
-                cv::Vec3b pix = src(y, x);
-
-                cv::Point alternate;
-                alternate.x = (x + firstRed.x) % 2;
-                alternate.y = (y + firstRed.y) % 2;
-
-                if (alternate.y == 0)
-                {
-                    if (alternate.x == 0)
-                    {
-                        // RG
-                        // GB
-                        dst(y, x) = pix[2];
-                    }
-                    else
-                    {
-                        // GR
-                        // BG
-                        dst(y, x) = pix[1];
-                    }
-                }
-                else
-                {
-                    if (alternate.x == 0)
-                    {
-                        // GB
-                        // RG
-                        dst(y, x) = pix[1];
-                    }
-                    else
-                    {
-                        // BG
-                        // GR
-                        dst(y, x) = pix[0];
-                    }
-                }
-            }
-        }
-    }
-};
-
-GPU_TEST_P(Demosaicing, BayerBG2BGR)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(1, 1));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerBG2BGR);
-
-    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
-}
-
-GPU_TEST_P(Demosaicing, BayerGB2BGR)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(0, 1));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGB2BGR);
-
-    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
-}
-
-GPU_TEST_P(Demosaicing, BayerRG2BGR)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(0, 0));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerRG2BGR);
-
-    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
-}
-
-GPU_TEST_P(Demosaicing, BayerGR2BGR)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(1, 0));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGR2BGR);
-
-    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
-}
-
-GPU_TEST_P(Demosaicing, BayerBG2BGR_MHT)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(1, 1));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerBG2BGR_MHT);
-
-    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
-}
-
-GPU_TEST_P(Demosaicing, BayerGB2BGR_MHT)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(0, 1));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGB2BGR_MHT);
-
-    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
-}
-
-GPU_TEST_P(Demosaicing, BayerRG2BGR_MHT)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(0, 0));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerRG2BGR_MHT);
-
-    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
-}
-
-GPU_TEST_P(Demosaicing, BayerGR2BGR_MHT)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-
-    cv::Mat_<uchar> src;
-    mosaic(img, src, cv::Point(1, 0));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGR2BGR_MHT);
-
-    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Demosaicing, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// swapChannels
-
-PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, cv::Size, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(SwapChannels, Accuracy)
-{
-    cv::Mat src = readImageType("stereobm/aloe-L.png", CV_8UC4);
-    ASSERT_FALSE(src.empty());
-
-    cv::gpu::GpuMat d_src = loadMat(src, useRoi);
-
-    const int dstOrder[] = {2, 1, 0, 3};
-    cv::gpu::swapChannels(d_src, dstOrder);
-
-    cv::Mat dst_gold;
-    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2RGBA);
-
-    EXPECT_MAT_NEAR(dst_gold, d_src, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, SwapChannels, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_denoising.cpp b/modules/gpu/test/test_denoising.cpp
deleted file mode 100644
index 2f1a93be1c..0000000000
--- a/modules/gpu/test/test_denoising.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-////////////////////////////////////////////////////////
-// BilateralFilter
-
-PARAM_TEST_CASE(BilateralFilter, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int kernel_size;
-    float sigma_color;
-    float sigma_spatial;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-
-        kernel_size = 5;
-        sigma_color = 10.f;
-        sigma_spatial = 3.5f;
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(BilateralFilter, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    src.convertTo(src, type);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::bilateralFilter(loadMat(src), dst, kernel_size, sigma_color, sigma_spatial);
-
-    cv::Mat dst_gold;
-    cv::bilateralFilter(src, dst_gold, kernel_size, sigma_color, sigma_spatial);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Denoising, BilateralFilter, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(639, 481)),
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3))
-    ));
-
-
-////////////////////////////////////////////////////////
-// Brute Force Non local means
-
-struct BruteForceNonLocalMeans: testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(BruteForceNonLocalMeans, Regression)
-{
-    using cv::gpu::GpuMat;
-
-    cv::Mat bgr  = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR);
-    ASSERT_FALSE(bgr.empty());
-
-    cv::Mat gray;
-    cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);
-
-    GpuMat dbgr, dgray;
-    cv::gpu::nonLocalMeans(GpuMat(bgr),  dbgr, 20);
-    cv::gpu::nonLocalMeans(GpuMat(gray), dgray, 20);
-
-#if 0
-    dumpImage("denoising/nlm_denoised_lena_bgr.png", cv::Mat(dbgr));
-    dumpImage("denoising/nlm_denoised_lena_gray.png", cv::Mat(dgray));
-#endif
-
-    cv::Mat bgr_gold  = readImage("denoising/nlm_denoised_lena_bgr.png", cv::IMREAD_COLOR);
-    cv::Mat gray_gold  = readImage("denoising/nlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty());
-
-    EXPECT_MAT_NEAR(bgr_gold, dbgr, 1e-4);
-    EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Denoising, BruteForceNonLocalMeans, ALL_DEVICES);
-
-////////////////////////////////////////////////////////
-// Fast Force Non local means
-
-struct FastNonLocalMeans: testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(FastNonLocalMeans, Regression)
-{
-    using cv::gpu::GpuMat;
-
-    cv::Mat bgr  = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR);
-    ASSERT_FALSE(bgr.empty());
-
-    cv::Mat gray;
-    cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);
-
-    GpuMat dbgr, dgray;
-    cv::gpu::FastNonLocalMeansDenoising fnlmd;
-
-    fnlmd.simpleMethod(GpuMat(gray),  dgray, 20);
-    fnlmd.labMethod(GpuMat(bgr),  dbgr, 20, 10);
-
-#if 0
-    dumpImage("denoising/fnlm_denoised_lena_bgr.png", cv::Mat(dbgr));
-    dumpImage("denoising/fnlm_denoised_lena_gray.png", cv::Mat(dgray));
-#endif
-
-    cv::Mat bgr_gold  = readImage("denoising/fnlm_denoised_lena_bgr.png", cv::IMREAD_COLOR);
-    cv::Mat gray_gold  = readImage("denoising/fnlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty());
-
-    EXPECT_MAT_NEAR(bgr_gold, dbgr, 1);
-    EXPECT_MAT_NEAR(gray_gold, dgray, 1);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Denoising, FastNonLocalMeans, ALL_DEVICES);
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_hough.cpp b/modules/gpu/test/test_hough.cpp
deleted file mode 100644
index a044901041..0000000000
--- a/modules/gpu/test/test_hough.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// HoughLines
-
-PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
-{
-    static void generateLines(cv::Mat& img)
-    {
-        img.setTo(cv::Scalar::all(0));
-
-        cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), cv::Scalar::all(255));
-        cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), cv::Scalar::all(255));
-        cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), cv::Scalar::all(255));
-        cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), cv::Scalar::all(255));
-    }
-
-    static void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
-    {
-        dst.setTo(cv::Scalar::all(0));
-
-        for (size_t i = 0; i < lines.size(); ++i)
-        {
-            float rho = lines[i][0], theta = lines[i][1];
-            cv::Point pt1, pt2;
-            double a = std::cos(theta), b = std::sin(theta);
-            double x0 = a*rho, y0 = b*rho;
-            pt1.x = cvRound(x0 + 1000*(-b));
-            pt1.y = cvRound(y0 + 1000*(a));
-            pt2.x = cvRound(x0 - 1000*(-b));
-            pt2.y = cvRound(y0 - 1000*(a));
-            cv::line(dst, pt1, pt2, cv::Scalar::all(255));
-        }
-    }
-};
-
-GPU_TEST_P(HoughLines, Accuracy)
-{
-    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
-
-    const float rho = 1.0f;
-    const float theta = (float) (1.5 * CV_PI / 180.0);
-    const int threshold = 100;
-
-    cv::Mat src(size, CV_8UC1);
-    generateLines(src);
-
-    cv::gpu::GpuMat d_lines;
-    cv::gpu::HoughLines(loadMat(src, useRoi), d_lines, rho, theta, threshold);
-
-    std::vector<cv::Vec2f> lines;
-    cv::gpu::HoughLinesDownload(d_lines, lines);
-
-    cv::Mat dst(size, CV_8UC1);
-    drawLines(dst, lines);
-
-    ASSERT_MAT_NEAR(src, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// HoughCircles
-
-PARAM_TEST_CASE(HoughCircles, cv::gpu::DeviceInfo, cv::Size, UseRoi)
-{
-    static void drawCircles(cv::Mat& dst, const std::vector<cv::Vec3f>& circles, bool fill)
-    {
-        dst.setTo(cv::Scalar::all(0));
-
-        for (size_t i = 0; i < circles.size(); ++i)
-            cv::circle(dst, cv::Point2f(circles[i][0], circles[i][1]), (int)circles[i][2], cv::Scalar::all(255), fill ? -1 : 1);
-    }
-};
-
-GPU_TEST_P(HoughCircles, Accuracy)
-{
-    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
-
-    const float dp = 2.0f;
-    const float minDist = 0.0f;
-    const int minRadius = 10;
-    const int maxRadius = 20;
-    const int cannyThreshold = 100;
-    const int votesThreshold = 20;
-
-    std::vector<cv::Vec3f> circles_gold(4);
-    circles_gold[0] = cv::Vec3i(20, 20, minRadius);
-    circles_gold[1] = cv::Vec3i(90, 87, minRadius + 3);
-    circles_gold[2] = cv::Vec3i(30, 70, minRadius + 8);
-    circles_gold[3] = cv::Vec3i(80, 10, maxRadius);
-
-    cv::Mat src(size, CV_8UC1);
-    drawCircles(src, circles_gold, true);
-
-    cv::gpu::GpuMat d_circles;
-    cv::gpu::HoughCircles(loadMat(src, useRoi), d_circles, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
-
-    std::vector<cv::Vec3f> circles;
-    cv::gpu::HoughCirclesDownload(d_circles, circles);
-
-    ASSERT_FALSE(circles.empty());
-
-    for (size_t i = 0; i < circles.size(); ++i)
-    {
-        cv::Vec3f cur = circles[i];
-
-        bool found = false;
-
-        for (size_t j = 0; j < circles_gold.size(); ++j)
-        {
-            cv::Vec3f gold = circles_gold[j];
-
-            if (std::fabs(cur[0] - gold[0]) < 5 && std::fabs(cur[1] - gold[1]) < 5 && std::fabs(cur[2] - gold[2]) < 5)
-            {
-                found = true;
-                break;
-            }
-        }
-
-        ASSERT_TRUE(found);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughCircles, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// GeneralizedHough
-
-PARAM_TEST_CASE(GeneralizedHough, cv::gpu::DeviceInfo, UseRoi)
-{
-};
-
-GPU_TEST_P(GeneralizedHough, POSITION)
-{
-    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-    const bool useRoi = GET_PARAM(1);
-
-    cv::Mat templ = readImage("../cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(templ.empty());
-
-    cv::Point templCenter(templ.cols / 2, templ.rows / 2);
-
-    const size_t gold_count = 3;
-    cv::Point pos_gold[gold_count];
-    pos_gold[0] = cv::Point(templCenter.x + 10, templCenter.y + 10);
-    pos_gold[1] = cv::Point(2 * templCenter.x + 40, templCenter.y + 10);
-    pos_gold[2] = cv::Point(2 * templCenter.x + 40, 2 * templCenter.y + 40);
-
-    cv::Mat image(templ.rows * 3, templ.cols * 3, CV_8UC1, cv::Scalar::all(0));
-    for (size_t i = 0; i < gold_count; ++i)
-    {
-        cv::Rect rec(pos_gold[i].x - templCenter.x, pos_gold[i].y - templCenter.y, templ.cols, templ.rows);
-        cv::Mat imageROI = image(rec);
-        templ.copyTo(imageROI);
-    }
-
-    cv::Ptr<cv::gpu::GeneralizedHough_GPU> hough = cv::gpu::GeneralizedHough_GPU::create(cv::GeneralizedHough::GHT_POSITION);
-    hough->set("votesThreshold", 200);
-
-    hough->setTemplate(loadMat(templ, useRoi));
-
-    cv::gpu::GpuMat d_pos;
-    hough->detect(loadMat(image, useRoi), d_pos);
-
-    std::vector<cv::Vec4f> pos;
-    hough->download(d_pos, pos);
-
-    ASSERT_EQ(gold_count, pos.size());
-
-    for (size_t i = 0; i < gold_count; ++i)
-    {
-        cv::Point gold = pos_gold[i];
-
-        bool found = false;
-
-        for (size_t j = 0; j < pos.size(); ++j)
-        {
-            cv::Point2f p(pos[j][0], pos[j][1]);
-
-            if (::fabs(p.x - gold.x) < 2 && ::fabs(p.y - gold.y) < 2)
-            {
-                found = true;
-                break;
-            }
-        }
-
-        ASSERT_TRUE(found);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, GeneralizedHough, testing::Combine(
-    ALL_DEVICES,
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
deleted file mode 100644
index 6957f54375..0000000000
--- a/modules/gpu/test/test_imgproc.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// HistEven
-
-struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(HistEven, Accuracy)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat hsv;
-    cv::cvtColor(img, hsv, cv::COLOR_BGR2HSV);
-
-    int hbins = 30;
-    float hranges[] = {0.0f, 180.0f};
-
-    std::vector<cv::gpu::GpuMat> srcs;
-    cv::gpu::split(loadMat(hsv), srcs);
-
-    cv::gpu::GpuMat hist;
-    cv::gpu::histEven(srcs[0], hist, hbins, (int)hranges[0], (int)hranges[1]);
-
-    cv::MatND histnd;
-    int histSize[] = {hbins};
-    const float* ranges[] = {hranges};
-    int channels[] = {0};
-    cv::calcHist(&hsv, 1, channels, cv::Mat(), histnd, 1, histSize, ranges);
-
-    cv::Mat hist_gold = histnd;
-    hist_gold = hist_gold.t();
-    hist_gold.convertTo(hist_gold, CV_32S);
-
-    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HistEven, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// CalcHist
-
-namespace
-{
-    void calcHistGold(const cv::Mat& src, cv::Mat& hist)
-    {
-        hist.create(1, 256, CV_32SC1);
-        hist.setTo(cv::Scalar::all(0));
-
-        int* hist_row = hist.ptr<int>();
-        for (int y = 0; y < src.rows; ++y)
-        {
-            const uchar* src_row = src.ptr(y);
-
-            for (int x = 0; x < src.cols; ++x)
-                ++hist_row[src_row[x]];
-        }
-    }
-}
-
-PARAM_TEST_CASE(CalcHist, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    cv::Size size;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(CalcHist, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_8UC1);
-
-    cv::gpu::GpuMat hist;
-    cv::gpu::calcHist(loadMat(src), hist);
-
-    cv::Mat hist_gold;
-    calcHistGold(src, hist_gold);
-
-    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CalcHist, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// EqualizeHist
-
-PARAM_TEST_CASE(EqualizeHist, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(EqualizeHist, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_8UC1);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::equalizeHist(loadMat(src), dst);
-
-    cv::Mat dst_gold;
-    cv::equalizeHist(src, dst_gold);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 3.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, EqualizeHist, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// CLAHE
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(ClipLimit, double)
-}
-
-PARAM_TEST_CASE(CLAHE, cv::gpu::DeviceInfo, cv::Size, ClipLimit)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    double clipLimit;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        clipLimit = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(CLAHE, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_8UC1);
-
-    cv::Ptr<cv::gpu::CLAHE> clahe = cv::gpu::createCLAHE(clipLimit);
-    cv::gpu::GpuMat dst;
-    clahe->apply(loadMat(src), dst);
-
-    cv::Ptr<cv::CLAHE> clahe_gold = cv::createCLAHE(clipLimit);
-    cv::Mat dst_gold;
-    clahe_gold->apply(src, dst_gold);
-
-    ASSERT_MAT_NEAR(dst_gold, dst, 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CLAHE, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(0.0, 40.0)));
-
-////////////////////////////////////////////////////////
-// Canny
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-    IMPLEMENT_PARAM_CLASS(L2gradient, bool);
-}
-
-PARAM_TEST_CASE(Canny, cv::gpu::DeviceInfo, AppertureSize, L2gradient, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int apperture_size;
-    bool useL2gradient;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        apperture_size = GET_PARAM(1);
-        useL2gradient = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Canny, Accuracy)
-{
-    cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    double low_thresh = 50.0;
-    double high_thresh = 100.0;
-
-    if (!supportFeature(devInfo, cv::gpu::SHARED_ATOMICS))
-    {
-        try
-        {
-        cv::gpu::GpuMat edges;
-        cv::gpu::Canny(loadMat(img), edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat edges;
-        cv::gpu::Canny(loadMat(img, useRoi), edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-        cv::Mat edges_gold;
-        cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-        EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
-    testing::Values(L2gradient(false), L2gradient(true)),
-    WHOLE_SUBMAT));
-
-////////////////////////////////////////////////////////////////////////////////
-// MeanShift
-
-struct MeanShift : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    cv::Mat img;
-
-    int spatialRad;
-    int colorRad;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-
-        img = readImageType("meanshift/cones.png", CV_8UC4);
-        ASSERT_FALSE(img.empty());
-
-        spatialRad = 30;
-        colorRad = 30;
-    }
-};
-
-GPU_TEST_P(MeanShift, Filtering)
-{
-    cv::Mat img_template;
-    if (supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-        img_template = readImage("meanshift/con_result.png");
-    else
-        img_template = readImage("meanshift/con_result_CC1X.png");
-    ASSERT_FALSE(img_template.empty());
-
-    cv::gpu::GpuMat d_dst;
-    cv::gpu::meanShiftFiltering(loadMat(img), d_dst, spatialRad, colorRad);
-
-    ASSERT_EQ(CV_8UC4, d_dst.type());
-
-    cv::Mat dst(d_dst);
-
-    cv::Mat result;
-    cv::cvtColor(dst, result, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_NEAR(img_template, result, 0.0);
-}
-
-GPU_TEST_P(MeanShift, Proc)
-{
-    cv::FileStorage fs;
-    if (supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-        fs.open(std::string(cvtest::TS::ptr()->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ);
-    else
-        fs.open(std::string(cvtest::TS::ptr()->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ);
-    ASSERT_TRUE(fs.isOpened());
-
-    cv::Mat spmap_template;
-    fs["spmap"] >> spmap_template;
-    ASSERT_FALSE(spmap_template.empty());
-
-    cv::gpu::GpuMat rmap_filtered;
-    cv::gpu::meanShiftFiltering(loadMat(img), rmap_filtered, spatialRad, colorRad);
-
-    cv::gpu::GpuMat rmap;
-    cv::gpu::GpuMat spmap;
-    cv::gpu::meanShiftProc(loadMat(img), rmap, spmap, spatialRad, colorRad);
-
-    ASSERT_EQ(CV_8UC4, rmap.type());
-
-    EXPECT_MAT_NEAR(rmap_filtered, rmap, 0.0);
-    EXPECT_MAT_NEAR(spmap_template, spmap, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MeanShift, ALL_DEVICES);
-
-////////////////////////////////////////////////////////////////////////////////
-// MeanShiftSegmentation
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(MinSize, int);
-}
-
-PARAM_TEST_CASE(MeanShiftSegmentation, cv::gpu::DeviceInfo, MinSize)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int minsize;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        minsize = GET_PARAM(1);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MeanShiftSegmentation, Regression)
-{
-    cv::Mat img = readImageType("meanshift/cones.png", CV_8UC4);
-    ASSERT_FALSE(img.empty());
-
-    std::ostringstream path;
-    path << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize;
-    if (supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-        path << ".png";
-    else
-        path << "_CC1X.png";
-    cv::Mat dst_gold = readImage(path.str());
-    ASSERT_FALSE(dst_gold.empty());
-
-    cv::Mat dst;
-    cv::gpu::meanShiftSegmentation(loadMat(img), dst, 10, 10, minsize);
-
-    cv::Mat dst_rgb;
-    cv::cvtColor(dst, dst_rgb, cv::COLOR_BGRA2BGR);
-
-    EXPECT_MAT_SIMILAR(dst_gold, dst_rgb, 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MeanShiftSegmentation, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MinSize(0), MinSize(4), MinSize(20), MinSize(84), MinSize(340), MinSize(1364))));
-
-////////////////////////////////////////////////////////////////////////////
-// Blend
-
-namespace
-{
-    template <typename T>
-    void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
-    {
-        result_gold.create(img1.size(), img1.type());
-
-        int cn = img1.channels();
-
-        for (int y = 0; y < img1.rows; ++y)
-        {
-            const float* weights1_row = weights1.ptr<float>(y);
-            const float* weights2_row = weights2.ptr<float>(y);
-            const T* img1_row = img1.ptr<T>(y);
-            const T* img2_row = img2.ptr<T>(y);
-            T* result_gold_row = result_gold.ptr<T>(y);
-
-            for (int x = 0; x < img1.cols * cn; ++x)
-            {
-                float w1 = weights1_row[x / cn];
-                float w2 = weights2_row[x / cn];
-                result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
-            }
-        }
-    }
-}
-
-PARAM_TEST_CASE(Blend, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Blend, Accuracy)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
-    cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
-    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
-    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
-
-    cv::gpu::GpuMat result;
-    cv::gpu::blendLinear(loadMat(img1, useRoi), loadMat(img2, useRoi), loadMat(weights1, useRoi), loadMat(weights2, useRoi), result);
-
-    cv::Mat result_gold;
-    if (depth == CV_8U)
-        blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
-    else
-        blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
-
-    EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.0 : 1e-5);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    WHOLE_SUBMAT));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate8U
-
-CV_ENUM(TemplateMethod, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-}
-
-PARAM_TEST_CASE(MatchTemplate8U, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        templ_size = GET_PARAM(2);
-        cn = GET_PARAM(3);
-        method = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MatchTemplate8U, Accuracy)
-{
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
-
-    cv::Mat dst_gold;
-    cv::matchTemplate(image, templ, dst_gold, method);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, templ_size.area() * 1e-1);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    TemplateMethod::all()));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate32F
-
-PARAM_TEST_CASE(MatchTemplate32F, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-
-    int n, m, h, w;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        templ_size = GET_PARAM(2);
-        cn = GET_PARAM(3);
-        method = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MatchTemplate32F, Regression)
-{
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
-
-    cv::Mat dst_gold;
-    cv::matchTemplate(image, templ, dst_gold, method);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, templ_size.area() * 1e-1);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplateBlackSource
-
-PARAM_TEST_CASE(MatchTemplateBlackSource, cv::gpu::DeviceInfo, TemplateMethod)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int method;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        method = GET_PARAM(1);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MatchTemplateBlackSource, Accuracy)
-{
-    cv::Mat image = readImage("matchtemplate/black.png");
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat pattern = readImage("matchtemplate/cat.png");
-    ASSERT_FALSE(pattern.empty());
-
-    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, method);
-
-    cv::Mat dst(d_dst);
-
-    double maxValue;
-    cv::Point maxLoc;
-    cv::minMaxLoc(dst, NULL, &maxValue, NULL, &maxLoc);
-
-    cv::Point maxLocGold = cv::Point(284, 12);
-
-    ASSERT_EQ(maxLocGold, maxLoc);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplateBlackSource, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(TemplateMethod(cv::TM_CCOEFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED))));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_CCOEF_NORMED
-
-PARAM_TEST_CASE(MatchTemplate_CCOEF_NORMED, cv::gpu::DeviceInfo, std::pair<std::string, std::string>)
-{
-    cv::gpu::DeviceInfo devInfo;
-    std::string imageName;
-    std::string patternName;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        imageName = GET_PARAM(1).first;
-        patternName = GET_PARAM(1).second;
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
-{
-    cv::Mat image = readImage(imageName);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat pattern = readImage(patternName);
-    ASSERT_FALSE(pattern.empty());
-
-    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, cv::TM_CCOEFF_NORMED);
-
-    cv::Mat dst(d_dst);
-
-    cv::Point minLoc, maxLoc;
-    double minVal, maxVal;
-    cv::minMaxLoc(dst, &minVal, &maxVal, &minLoc, &maxLoc);
-
-    cv::Mat dstGold;
-    cv::matchTemplate(image, pattern, dstGold, cv::TM_CCOEFF_NORMED);
-
-    double minValGold, maxValGold;
-    cv::Point minLocGold, maxLocGold;
-    cv::minMaxLoc(dstGold, &minValGold, &maxValGold, &minLocGold, &maxLocGold);
-
-    ASSERT_EQ(minLocGold, minLoc);
-    ASSERT_EQ(maxLocGold, maxLoc);
-    ASSERT_LE(maxVal, 1.0);
-    ASSERT_GE(minVal, -1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CCOEF_NORMED, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::make_pair(std::string("matchtemplate/source-0.png"), std::string("matchtemplate/target-0.png")))));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_CanFindBigTemplate
-
-struct MatchTemplate_CanFindBigTemplate : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF_NORMED)
-{
-    cv::Mat scene = readImage("matchtemplate/scene.png");
-    ASSERT_FALSE(scene.empty());
-
-    cv::Mat templ = readImage("matchtemplate/template.png");
-    ASSERT_FALSE(templ.empty());
-
-    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF_NORMED);
-
-    cv::Mat result(d_result);
-
-    double minVal;
-    cv::Point minLoc;
-    cv::minMaxLoc(result, &minVal, 0, &minLoc, 0);
-
-    ASSERT_GE(minVal, 0);
-    ASSERT_LT(minVal, 1e-3);
-    ASSERT_EQ(344, minLoc.x);
-    ASSERT_EQ(0, minLoc.y);
-}
-
-GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
-{
-    cv::Mat scene = readImage("matchtemplate/scene.png");
-    ASSERT_FALSE(scene.empty());
-
-    cv::Mat templ = readImage("matchtemplate/template.png");
-    ASSERT_FALSE(templ.empty());
-
-    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF);
-
-    cv::Mat result(d_result);
-
-    double minVal;
-    cv::Point minLoc;
-    cv::minMaxLoc(result, &minVal, 0, &minLoc, 0);
-
-    ASSERT_GE(minVal, 0);
-    ASSERT_EQ(344, minLoc.x);
-    ASSERT_EQ(0, minLoc.y);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CanFindBigTemplate, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// CornerHarris
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(BlockSize, int);
-    IMPLEMENT_PARAM_CLASS(ApertureSize, int);
-}
-
-PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, BorderType, BlockSize, ApertureSize)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    int borderType;
-    int blockSize;
-    int apertureSize;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        borderType = GET_PARAM(2);
-        blockSize = GET_PARAM(3);
-        apertureSize = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(CornerHarris, Accuracy)
-{
-    cv::Mat src = readImageType("stereobm/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    double k = randomDouble(0.1, 0.9);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cornerHarris(loadMat(src), dst, blockSize, apertureSize, k, borderType);
-
-    cv::Mat dst_gold;
-    cv::cornerHarris(src, dst_gold, blockSize, apertureSize, k, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.02);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerHarris, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// cornerMinEigen
-
-PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, BorderType, BlockSize, ApertureSize)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    int borderType;
-    int blockSize;
-    int apertureSize;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        borderType = GET_PARAM(2);
-        blockSize = GET_PARAM(3);
-        apertureSize = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(CornerMinEigen, Accuracy)
-{
-    cv::Mat src = readImageType("stereobm/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::cornerMinEigenVal(loadMat(src), dst, blockSize, apertureSize, borderType);
-
-    cv::Mat dst_gold;
-    cv::cornerMinEigenVal(src, dst_gold, blockSize, apertureSize, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.02);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_labeling.cpp b/modules/gpu/test/test_labeling.cpp
deleted file mode 100644
index 4a1927c392..0000000000
--- a/modules/gpu/test/test_labeling.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-namespace
-{
-    struct GreedyLabeling
-    {
-        struct dot
-        {
-            int x;
-            int y;
-
-            static dot make(int i, int j)
-            {
-                dot d; d.x = i; d.y = j;
-                return d;
-            }
-        };
-
-        struct InInterval
-        {
-            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
-            const int lo, hi;
-
-            bool operator() (const unsigned char a, const unsigned char b) const
-            {
-                int d = a - b;
-                return lo <= d && d <= hi;
-            }
-        };
-
-        GreedyLabeling(cv::Mat img)
-        : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {}
-
-        void operator() (cv::Mat labels) const
-        {
-            InInterval inInt(0, 2);
-            dot* stack = new dot[image.cols * image.rows];
-
-            int cc = -1;
-
-            int* dist_labels = (int*)labels.data;
-            int pitch = (int) labels.step1();
-
-            unsigned char* source = (unsigned char*)image.data;
-            int width = image.cols;
-            int height = image.rows;
-            int step1 = (int)image.step1();
-
-            for (int j = 0; j < image.rows; ++j)
-                for (int i = 0; i < image.cols; ++i)
-                {
-                    if (dist_labels[j * pitch + i] != -1) continue;
-
-                    dot* top = stack;
-                    dot p = dot::make(i, j);
-                    cc++;
-
-                    dist_labels[j * pitch + i] = cc;
-
-                    while (top >= stack)
-                    {
-                        int*  dl = &dist_labels[p.y * pitch + p.x];
-                        unsigned char* sp = &source[p.y * step1 + p.x];
-
-                        dl[0] = cc;
-
-                        //right
-                        if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
-                            *top++ = dot::make(p.x + 1, p.y);
-
-                        //left
-                        if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
-                            *top++ = dot::make(p.x - 1, p.y);
-
-                        //bottom
-                        if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+step1]))
-                            *top++ = dot::make(p.x, p.y + 1);
-
-                        //top
-                        if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-step1]))
-                            *top++ = dot::make(p.x, p.y - 1);
-
-                        p = *--top;
-                    }
-                }
-            delete[] stack;
-        }
-
-        void checkCorrectness(cv::Mat gpu)
-        {
-            cv::Mat diff = gpu - _labels;
-
-            int outliers = 0;
-            for (int j = 0; j < image.rows; ++j)
-                for (int i = 0; i < image.cols - 1; ++i)
-                {
-                    if ( (_labels.at<int>(j,i) == gpu.at<int>(j,i + 1)) && (diff.at<int>(j, i) != diff.at<int>(j,i + 1)))
-                    {
-                        outliers++;
-                    }
-                }
-            ASSERT_TRUE(outliers < gpu.cols + gpu.rows);
-        }
-
-        cv::Mat image;
-        cv::Mat _labels;
-    };
-}
-
-struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-
-    cv::Mat loat_image()
-    {
-        return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/label.png");
-    }
-};
-
-GPU_TEST_P(Labeling, DISABLED_ConnectedComponents)
-{
-    cv::Mat image;
-    cvtColor(loat_image(), image, cv::COLOR_BGR2GRAY);
-
-    cv::threshold(image, image, 150, 255, cv::THRESH_BINARY);
-
-    ASSERT_TRUE(image.type() == CV_8UC1);
-
-    GreedyLabeling host(image);
-    host(host._labels);
-
-    cv::gpu::GpuMat mask;
-    mask.create(image.rows, image.cols, CV_8UC1);
-
-    cv::gpu::GpuMat components;
-    components.create(image.rows, image.cols, CV_32SC1);
-
-    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
-
-    cv::gpu::labelComponents(mask, components);
-
-    host.checkCorrectness(cv::Mat(components));
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ConnectedComponents, Labeling, ALL_DEVICES);
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_precomp.hpp b/modules/gpu/test/test_precomp.hpp
index 08807d51e0..f98f364b9b 100644
--- a/modules/gpu/test/test_precomp.hpp
+++ b/modules/gpu/test/test_precomp.hpp
@@ -74,8 +74,6 @@
 #include "opencv2/ts/gpu_test.hpp"
 #include "opencv2/gpu.hpp"
 
-#include "interpolation.hpp"
-
 #include "opencv2/core/gpu_private.hpp"
 
 #endif
diff --git a/modules/gpu/test/test_pyramids.cpp b/modules/gpu/test/test_pyramids.cpp
deleted file mode 100644
index 6b0540fc10..0000000000
--- a/modules/gpu/test/test_pyramids.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-////////////////////////////////////////////////////////
-// pyrDown
-
-PARAM_TEST_CASE(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(PyrDown, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size((size.width + 1) / 2, (size.height + 1) / 2), type, useRoi);
-    cv::gpu::pyrDown(loadMat(src, useRoi), dst);
-
-    cv::Mat dst_gold;
-    cv::pyrDown(src, dst_gold);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    WHOLE_SUBMAT));
-
-////////////////////////////////////////////////////////
-// pyrUp
-
-PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(PyrUp, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(size.width * 2, size.height * 2), type, useRoi);
-    cv::gpu::pyrUp(loadMat(src, useRoi), dst);
-
-    cv::Mat dst_gold;
-    cv::pyrUp(src, dst_gold);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_remap.cpp b/modules/gpu/test/test_remap.cpp
deleted file mode 100644
index eb4b9ece85..0000000000
--- a/modules/gpu/test/test_remap.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-///////////////////////////////////////////////////////////////////
-// Gold implementation
-
-namespace
-{
-    template <typename T, template <typename> class Interpolator> void remapImpl(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal)
-    {
-        const int cn = src.channels();
-
-        cv::Size dsize = xmap.size();
-
-        dst.create(dsize, src.type());
-
-        for (int y = 0; y < dsize.height; ++y)
-        {
-            for (int x = 0; x < dsize.width; ++x)
-            {
-                for (int c = 0; c < cn; ++c)
-                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ymap.at<float>(y, x), xmap.at<float>(y, x), c, borderType, borderVal);
-            }
-        }
-    }
-
-    void remapGold(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
-    {
-        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal);
-
-        static const func_t nearest_funcs[] =
-        {
-            remapImpl<unsigned char, NearestInterpolator>,
-            remapImpl<signed char, NearestInterpolator>,
-            remapImpl<unsigned short, NearestInterpolator>,
-            remapImpl<short, NearestInterpolator>,
-            remapImpl<int, NearestInterpolator>,
-            remapImpl<float, NearestInterpolator>
-        };
-
-        static const func_t linear_funcs[] =
-        {
-            remapImpl<unsigned char, LinearInterpolator>,
-            remapImpl<signed char, LinearInterpolator>,
-            remapImpl<unsigned short, LinearInterpolator>,
-            remapImpl<short, LinearInterpolator>,
-            remapImpl<int, LinearInterpolator>,
-            remapImpl<float, LinearInterpolator>
-        };
-
-        static const func_t cubic_funcs[] =
-        {
-            remapImpl<unsigned char, CubicInterpolator>,
-            remapImpl<signed char, CubicInterpolator>,
-            remapImpl<unsigned short, CubicInterpolator>,
-            remapImpl<short, CubicInterpolator>,
-            remapImpl<int, CubicInterpolator>,
-            remapImpl<float, CubicInterpolator>
-        };
-
-        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
-
-        funcs[interpolation][src.depth()](src, xmap, ymap, dst, borderType, borderVal);
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test
-
-PARAM_TEST_CASE(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int interpolation;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat xmap;
-    cv::Mat ymap;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-        borderType = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-
-        // rotation matrix
-
-        const double aplha = CV_PI / 4;
-        static double M[2][3] = { {std::cos(aplha), -std::sin(aplha), size.width / 2.0},
-                                  {std::sin(aplha),  std::cos(aplha), 0.0}};
-
-        xmap.create(size, CV_32FC1);
-        ymap.create(size, CV_32FC1);
-
-        for (int y = 0; y < size.height; ++y)
-        {
-            for (int x = 0; x < size.width; ++x)
-            {
-                xmap.at<float>(y, x) = static_cast<float>(M[0][0] * x + M[0][1] * y + M[0][2]);
-                ymap.at<float>(y, x) = static_cast<float>(M[1][0] * x + M[1][1] * y + M[1][2]);
-            }
-        }
-    }
-};
-
-GPU_TEST_P(Remap, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Scalar val = randomScalar(0.0, 255.0);
-
-    cv::gpu::GpuMat dst = createMat(xmap.size(), type, useRoi);
-    cv::gpu::remap(loadMat(src, useRoi), dst, loadMat(xmap, useRoi), loadMat(ymap, useRoi), interpolation, borderType, val);
-
-    cv::Mat dst_gold;
-    remapGold(src, xmap, ymap, dst_gold, interpolation, borderType, val);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp
deleted file mode 100644
index 593c891e6a..0000000000
--- a/modules/gpu/test/test_resize.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-///////////////////////////////////////////////////////////////////
-// Gold implementation
-
-namespace
-{
-    template <typename T, template <typename> class Interpolator>
-    void resizeImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
-    {
-        const int cn = src.channels();
-
-        cv::Size dsize(cv::saturate_cast<int>(src.cols * fx), cv::saturate_cast<int>(src.rows * fy));
-
-        dst.create(dsize, src.type());
-
-        float ifx = static_cast<float>(1.0 / fx);
-        float ify = static_cast<float>(1.0 / fy);
-
-        for (int y = 0; y < dsize.height; ++y)
-        {
-            for (int x = 0; x < dsize.width; ++x)
-            {
-                for (int c = 0; c < cn; ++c)
-                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);
-            }
-        }
-    }
-
-    void resizeGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy, int interpolation)
-    {
-        typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy);
-
-        static const func_t nearest_funcs[] =
-        {
-            resizeImpl<unsigned char, NearestInterpolator>,
-            resizeImpl<signed char, NearestInterpolator>,
-            resizeImpl<unsigned short, NearestInterpolator>,
-            resizeImpl<short, NearestInterpolator>,
-            resizeImpl<int, NearestInterpolator>,
-            resizeImpl<float, NearestInterpolator>
-        };
-
-
-        static const func_t linear_funcs[] =
-        {
-            resizeImpl<unsigned char, LinearInterpolator>,
-            resizeImpl<signed char, LinearInterpolator>,
-            resizeImpl<unsigned short, LinearInterpolator>,
-            resizeImpl<short, LinearInterpolator>,
-            resizeImpl<int, LinearInterpolator>,
-            resizeImpl<float, LinearInterpolator>
-        };
-
-        static const func_t cubic_funcs[] =
-        {
-            resizeImpl<unsigned char, CubicInterpolator>,
-            resizeImpl<signed char, CubicInterpolator>,
-            resizeImpl<unsigned short, CubicInterpolator>,
-            resizeImpl<short, CubicInterpolator>,
-            resizeImpl<int, CubicInterpolator>,
-            resizeImpl<float, CubicInterpolator>
-        };
-
-        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
-
-        funcs[interpolation][src.depth()](src, dst, fx, fy);
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test
-
-PARAM_TEST_CASE(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    double coeff;
-    int interpolation;
-    int type;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        coeff = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Resize, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
-    cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_gold;
-    resizeGold(src, dst_gold, coeff, coeff, interpolation);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(0.3, 0.5, 1.5, 2.0),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    WHOLE_SUBMAT));
-
-/////////////////
-
-PARAM_TEST_CASE(ResizeSameAsHost, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    double coeff;
-    int interpolation;
-    int type;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        coeff = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-// downscaling only: used for classifiers
-GPU_TEST_P(ResizeSameAsHost, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
-    cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_gold;
-    cv::resize(src, dst_gold, cv::Size(), coeff, coeff, interpolation);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(0.3, 0.5),
-    testing::Values(Interpolation(cv::INTER_AREA), Interpolation(cv::INTER_NEAREST)),  //, Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////
-// Test NPP
-
-PARAM_TEST_CASE(ResizeNPP, cv::gpu::DeviceInfo, MatType, double, Interpolation)
-{
-    cv::gpu::DeviceInfo devInfo;
-    double coeff;
-    int interpolation;
-    int type;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        coeff = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(ResizeNPP, Accuracy)
-{
-    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::resize(loadMat(src), dst, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_gold;
-    resizeGold(src, dst_gold, coeff, coeff, interpolation);
-
-    EXPECT_MAT_SIMILAR(dst_gold, dst, 1e-1);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeNPP, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(0.3, 0.5, 1.5, 2.0),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_threshold.cpp b/modules/gpu/test/test_threshold.cpp
deleted file mode 100644
index 52ebd7f592..0000000000
--- a/modules/gpu/test/test_threshold.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV)
-
-PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int threshOp;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        threshOp = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Threshold, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    double maxVal = randomDouble(20.0, 127.0);
-    double thresh = randomDouble(0.0, maxVal);
-
-    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
-    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
-
-    cv::Mat dst_gold;
-    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
-    ThreshOp::all(),
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_warp_affine.cpp b/modules/gpu/test/test_warp_affine.cpp
deleted file mode 100644
index 43bf0f6d9e..0000000000
--- a/modules/gpu/test/test_warp_affine.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-namespace
-{
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
-    {
-        cv::Mat M(2, 3, CV_64FC1);
-
-        M.at<double>(0, 0) = std::cos(angle); M.at<double>(0, 1) = -std::sin(angle); M.at<double>(0, 2) = srcSize.width / 2;
-        M.at<double>(1, 0) = std::sin(angle); M.at<double>(1, 1) =  std::cos(angle); M.at<double>(1, 2) = 0.0;
-
-        return M;
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test buildWarpAffineMaps
-
-PARAM_TEST_CASE(BuildWarpAffineMaps, cv::gpu::DeviceInfo, cv::Size, Inverse)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    bool inverse;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        inverse = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(BuildWarpAffineMaps, Accuracy)
-{
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
-    cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);
-
-    cv::gpu::GpuMat xmap, ymap;
-    cv::gpu::buildWarpAffineMaps(M, inverse, size, xmap, ymap);
-
-    int interpolation = cv::INTER_NEAREST;
-    int borderMode = cv::BORDER_CONSTANT;
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-
-    cv::Mat dst;
-    cv::remap(src, dst, cv::Mat(xmap), cv::Mat(ymap), interpolation, borderMode);
-
-    cv::Mat dst_gold;
-    cv::warpAffine(src, dst_gold, M, size, flags, borderMode);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpAffineMaps, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    DIRECT_INVERSE));
-
-///////////////////////////////////////////////////////////////////
-// Gold implementation
-
-namespace
-{
-    template <typename T, template <typename> class Interpolator> void warpAffineImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
-    {
-        const int cn = src.channels();
-
-        dst.create(dsize, src.type());
-
-        for (int y = 0; y < dsize.height; ++y)
-        {
-            for (int x = 0; x < dsize.width; ++x)
-            {
-                float xcoo = static_cast<float>(M.at<double>(0, 0) * x + M.at<double>(0, 1) * y + M.at<double>(0, 2));
-                float ycoo = static_cast<float>(M.at<double>(1, 0) * x + M.at<double>(1, 1) * y + M.at<double>(1, 2));
-
-                for (int c = 0; c < cn; ++c)
-                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ycoo, xcoo, c, borderType, borderVal);
-            }
-        }
-    }
-
-    void warpAffineGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
-    {
-        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
-
-        static const func_t nearest_funcs[] =
-        {
-            warpAffineImpl<unsigned char, NearestInterpolator>,
-            warpAffineImpl<signed char, NearestInterpolator>,
-            warpAffineImpl<unsigned short, NearestInterpolator>,
-            warpAffineImpl<short, NearestInterpolator>,
-            warpAffineImpl<int, NearestInterpolator>,
-            warpAffineImpl<float, NearestInterpolator>
-        };
-
-        static const func_t linear_funcs[] =
-        {
-            warpAffineImpl<unsigned char, LinearInterpolator>,
-            warpAffineImpl<signed char, LinearInterpolator>,
-            warpAffineImpl<unsigned short, LinearInterpolator>,
-            warpAffineImpl<short, LinearInterpolator>,
-            warpAffineImpl<int, LinearInterpolator>,
-            warpAffineImpl<float, LinearInterpolator>
-        };
-
-        static const func_t cubic_funcs[] =
-        {
-            warpAffineImpl<unsigned char, CubicInterpolator>,
-            warpAffineImpl<signed char, CubicInterpolator>,
-            warpAffineImpl<unsigned short, CubicInterpolator>,
-            warpAffineImpl<short, CubicInterpolator>,
-            warpAffineImpl<int, CubicInterpolator>,
-            warpAffineImpl<float, CubicInterpolator>
-        };
-
-        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
-
-        if (inverse)
-            funcs[interpolation][src.depth()](src, M, dsize, dst, borderType, borderVal);
-        else
-        {
-            cv::Mat iM;
-            cv::invertAffineTransform(M, iM);
-            funcs[interpolation][src.depth()](src, iM, dsize, dst, borderType, borderVal);
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test
-
-PARAM_TEST_CASE(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    bool inverse;
-    int interpolation;
-    int borderType;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        inverse = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(WarpAffine, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-    cv::Scalar val = randomScalar(0.0, 255.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::warpAffine(loadMat(src, useRoi), dst, M, size, flags, borderType, val);
-
-    cv::Mat dst_gold;
-    warpAffineGold(src, M, inverse, size, dst_gold, interpolation, borderType, val);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    DIRECT_INVERSE,
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////
-// Test NPP
-
-PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    bool inverse;
-    int interpolation;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        inverse = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(WarpAffineNPP, Accuracy)
-{
-    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::warpAffine(loadMat(src), dst, M, src.size(), flags);
-
-    cv::Mat dst_gold;
-    warpAffineGold(src, M, inverse, src.size(), dst_gold, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));
-
-    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    DIRECT_INVERSE,
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_warp_perspective.cpp b/modules/gpu/test/test_warp_perspective.cpp
deleted file mode 100644
index d225e58b66..0000000000
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-namespace
-{
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
-    {
-        cv::Mat M(3, 3, CV_64FC1);
-
-        M.at<double>(0, 0) = std::cos(angle); M.at<double>(0, 1) = -std::sin(angle); M.at<double>(0, 2) = srcSize.width / 2;
-        M.at<double>(1, 0) = std::sin(angle); M.at<double>(1, 1) =  std::cos(angle); M.at<double>(1, 2) = 0.0;
-        M.at<double>(2, 0) = 0.0            ; M.at<double>(2, 1) =  0.0            ; M.at<double>(2, 2) = 1.0;
-
-        return M;
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test buildWarpPerspectiveMaps
-
-PARAM_TEST_CASE(BuildWarpPerspectiveMaps, cv::gpu::DeviceInfo, cv::Size, Inverse)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    bool inverse;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        inverse = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(BuildWarpPerspectiveMaps, Accuracy)
-{
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
-
-    cv::gpu::GpuMat xmap, ymap;
-    cv::gpu::buildWarpPerspectiveMaps(M, inverse, size, xmap, ymap);
-
-    cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);
-    int interpolation = cv::INTER_NEAREST;
-    int borderMode = cv::BORDER_CONSTANT;
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-
-    cv::Mat dst;
-    cv::remap(src, dst, cv::Mat(xmap), cv::Mat(ymap), interpolation, borderMode);
-
-    cv::Mat dst_gold;
-    cv::warpPerspective(src, dst_gold, M, size, flags, borderMode);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpPerspectiveMaps, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    DIRECT_INVERSE));
-
-///////////////////////////////////////////////////////////////////
-// Gold implementation
-
-namespace
-{
-    template <typename T, template <typename> class Interpolator> void warpPerspectiveImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
-    {
-        const int cn = src.channels();
-
-        dst.create(dsize, src.type());
-
-        for (int y = 0; y < dsize.height; ++y)
-        {
-            for (int x = 0; x < dsize.width; ++x)
-            {
-                float coeff = static_cast<float>(M.at<double>(2, 0) * x + M.at<double>(2, 1) * y + M.at<double>(2, 2));
-
-                float xcoo = static_cast<float>((M.at<double>(0, 0) * x + M.at<double>(0, 1) * y + M.at<double>(0, 2)) / coeff);
-                float ycoo = static_cast<float>((M.at<double>(1, 0) * x + M.at<double>(1, 1) * y + M.at<double>(1, 2)) / coeff);
-
-                for (int c = 0; c < cn; ++c)
-                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ycoo, xcoo, c, borderType, borderVal);
-            }
-        }
-    }
-
-    void warpPerspectiveGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
-    {
-        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
-
-        static const func_t nearest_funcs[] =
-        {
-            warpPerspectiveImpl<unsigned char, NearestInterpolator>,
-            warpPerspectiveImpl<signed char, NearestInterpolator>,
-            warpPerspectiveImpl<unsigned short, NearestInterpolator>,
-            warpPerspectiveImpl<short, NearestInterpolator>,
-            warpPerspectiveImpl<int, NearestInterpolator>,
-            warpPerspectiveImpl<float, NearestInterpolator>
-        };
-
-        static const func_t linear_funcs[] =
-        {
-            warpPerspectiveImpl<unsigned char, LinearInterpolator>,
-            warpPerspectiveImpl<signed char, LinearInterpolator>,
-            warpPerspectiveImpl<unsigned short, LinearInterpolator>,
-            warpPerspectiveImpl<short, LinearInterpolator>,
-            warpPerspectiveImpl<int, LinearInterpolator>,
-            warpPerspectiveImpl<float, LinearInterpolator>
-        };
-
-        static const func_t cubic_funcs[] =
-        {
-            warpPerspectiveImpl<unsigned char, CubicInterpolator>,
-            warpPerspectiveImpl<signed char, CubicInterpolator>,
-            warpPerspectiveImpl<unsigned short, CubicInterpolator>,
-            warpPerspectiveImpl<short, CubicInterpolator>,
-            warpPerspectiveImpl<int, CubicInterpolator>,
-            warpPerspectiveImpl<float, CubicInterpolator>
-        };
-
-        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
-
-        if (inverse)
-            funcs[interpolation][src.depth()](src, M, dsize, dst, borderType, borderVal);
-        else
-        {
-            cv::Mat iM;
-            cv::invert(M, iM);
-            funcs[interpolation][src.depth()](src, iM, dsize, dst, borderType, borderVal);
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-// Test
-
-PARAM_TEST_CASE(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    bool inverse;
-    int interpolation;
-    int borderType;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        inverse = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(WarpPerspective, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-    cv::Scalar val = randomScalar(0.0, 255.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::warpPerspective(loadMat(src, useRoi), dst, M, size, flags, borderType, val);
-
-    cv::Mat dst_gold;
-    warpPerspectiveGold(src, M, inverse, size, dst_gold, interpolation, borderType, val);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    DIRECT_INVERSE,
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
-    WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////
-// Test NPP
-
-PARAM_TEST_CASE(WarpPerspectiveNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    bool inverse;
-    int interpolation;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        inverse = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
-{
-    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
-    int flags = interpolation;
-    if (inverse)
-        flags |= cv::WARP_INVERSE_MAP;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::warpPerspective(loadMat(src), dst, M, src.size(), flags);
-
-    cv::Mat dst_gold;
-    warpPerspectiveGold(src, M, inverse, src.size(), dst_gold, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));
-
-    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    DIRECT_INVERSE,
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
-
-#endif // HAVE_CUDA
diff --git a/modules/gpuarithm/test/test_threshold.cpp b/modules/gpuarithm/test/test_threshold.cpp
new file mode 100644
index 0000000000..52ebd7f592
--- /dev/null
+++ b/modules/gpuarithm/test/test_threshold.cpp
@@ -0,0 +1,93 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV) // threshold modes used as a test parameter type; ThreshOp::all() is passed to the instantiation at the end of this file
+
+PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi) // fixture for the threshold test; the listed types are read positionally via GET_PARAM(0..4) in SetUp
+{
+    cv::gpu::DeviceInfo devInfo; // GET_PARAM(0): device passed to cv::gpu::setDevice in SetUp
+    cv::Size size;               // GET_PARAM(1): size of the random source matrix
+    int type;                    // GET_PARAM(2): matrix type handed to randomMat (comes from a MatType parameter)
+    int threshOp;                // GET_PARAM(3): threshold mode (comes from a ThreshOp parameter)
+    bool useRoi;                 // GET_PARAM(4): forwarded to loadMat/createMat in the test body
+
+    virtual void SetUp() // copies the parameter tuple into the members above, then selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        threshOp = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // select the device given by the first parameter
+    }
+};
+
+GPU_TEST_P(Threshold, Accuracy) // compares cv::gpu::threshold against cv::threshold on the same random input
+{
+    cv::Mat src = randomMat(size, type);
+    double maxVal = randomDouble(20.0, 127.0);
+    double thresh = randomDouble(0.0, maxVal); // threshold is drawn with maxVal as the upper-bound argument
+
+    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp); // GPU result under test
+
+    cv::Mat dst_gold;
+    cv::threshold(src, dst_gold, thresh, maxVal, threshOp); // CPU reference computed with identical arguments
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0); // tolerance argument is 0.0 - presumably an exact match is required
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine( // NOTE(review): prefix is still GPU_ImgProc although this file is added under modules/gpuarithm - confirm this is intended
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)), // only single-channel 8U, 16S and 32F inputs are instantiated
+    ThreshOp::all(), // every mode declared in the ThreshOp enum
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/CMakeLists.txt b/modules/gpuimgproc/CMakeLists.txt
new file mode 100644
index 0000000000..04a31d5e7e
--- /dev/null
+++ b/modules/gpuimgproc/CMakeLists.txt
@@ -0,0 +1,9 @@
+if(ANDROID OR IOS)  # the module is disabled when targeting Android or iOS
+  ocv_module_disable(gpuimgproc)
+endif()
+
+set(the_description "GPU-accelerated Image Processing")  # description string for this module
+
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)  # presumably turns these four warning flags off for this module's C++ build - confirm against the macro
+
+ocv_define_module(gpuimgproc opencv_imgproc opencv_gpuarithm opencv_gpufilters OPTIONAL opencv_photo)  # opencv_photo is the only dependency listed after OPTIONAL
diff --git a/modules/gpuimgproc/doc/gpuimgproc.rst b/modules/gpuimgproc/doc/gpuimgproc.rst
new file mode 100644
index 0000000000..d4cba96a44
--- /dev/null
+++ b/modules/gpuimgproc/doc/gpuimgproc.rst
@@ -0,0 +1,8 @@
+********************************************
+gpuimgproc. GPU-accelerated Image Processing
+********************************************
+
+.. toctree::
+    :maxdepth: 1
+
+    image_processing
diff --git a/modules/gpuimgproc/doc/image_processing.rst b/modules/gpuimgproc/doc/image_processing.rst
new file mode 100644
index 0000000000..69e5003743
--- /dev/null
+++ b/modules/gpuimgproc/doc/image_processing.rst
@@ -0,0 +1,1065 @@
+Image Processing
+================
+
+.. highlight:: cpp
+
+
+
+gpu::meanShiftFiltering
+---------------------------
+Performs mean-shift filtering for each point of the source image.
+
+.. ocv:function:: void gpu::meanShiftFiltering( const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria=TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream=Stream::Null() )
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Destination image containing the color of mapped points. It has the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+It maps each point of the source image into another point. As a result, you have a new color and new position of each point.
+
+
+
+gpu::meanShiftProc
+----------------------
+Performs a mean-shift procedure and stores information about processed points (their colors and positions) in two images.
+
+.. ocv:function:: void gpu::meanShiftProc( const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria=TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream=Stream::Null() )
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dstr: Destination image containing the color of mapped points. The size and type is the same as  ``src`` .
+
+    :param dstsp: Destination image containing the position of mapped points. The size is the same as  ``src`` size. The type is  ``CV_16SC2`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+.. seealso:: :ocv:func:`gpu::meanShiftFiltering`
+
+
+
+gpu::meanShiftSegmentation
+------------------------------
+Performs a mean-shift segmentation of the source image and eliminates small segments.
+
+.. ocv:function:: void gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Segmented image with the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param minsize: Minimum segment size. Smaller segments are merged.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+
+
+gpu::integral
+-----------------
+Computes an integral image.
+
+.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`integral`
+
+
+
+gpu::sqrIntegral
+--------------------
+Computes a squared integral image.
+
+.. ocv:function:: void gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::columnSum
+------------------
+Computes a vertical (column) sum.
+
+.. ocv:function:: void gpu::columnSum(const GpuMat& src, GpuMat& sum)
+
+    :param src: Source image. Only  ``CV_32FC1`` images are supported for now.
+
+    :param sum: Destination image of the  ``CV_32FC1`` type.
+
+
+
+gpu::cornerHarris
+---------------------
+Computes the Harris cornerness criteria at each image pixel.
+
+.. ocv:function:: void gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101)
+
+    :param src: Source image. Only  ``CV_8UC1`` and  ``CV_32FC1`` images are supported for now.
+
+    :param dst: Destination image containing cornerness values. It has the same size as ``src`` and ``CV_32FC1`` type.
+
+    :param blockSize: Neighborhood size.
+
+    :param ksize: Aperture parameter for the Sobel operator.
+
+    :param k: Harris detector free parameter.
+
+    :param borderType: Pixel extrapolation method. Only  ``BORDER_REFLECT101`` and  ``BORDER_REPLICATE`` are supported for now.
+
+.. seealso:: :ocv:func:`cornerHarris`
+
+
+
+gpu::cornerMinEigenVal
+--------------------------
+Computes the minimum eigen value of a 2x2 derivative covariation matrix at each pixel (the cornerness criteria).
+
+.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
+
+.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
+
+.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` and  ``CV_32FC1`` images are supported for now.
+
+    :param dst: Destination image containing cornerness values. It has the same size as ``src`` and ``CV_32FC1`` type.
+
+    :param blockSize: Neighborhood size.
+
+    :param ksize: Aperture parameter for the Sobel operator.
+
+    :param borderType: Pixel extrapolation method. Only ``BORDER_REFLECT101`` and ``BORDER_REPLICATE`` are supported for now.
+
+.. seealso:: :ocv:func:`cornerMinEigenVal`
+
+
+
+gpu::mulSpectrums
+---------------------
+Performs a per-element multiplication of two Fourier spectrums.
+
+.. ocv:function:: void gpu::mulSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream=Stream::Null() )
+
+    :param a: First spectrum.
+
+    :param b: Second spectrum with the same size and type as  ``a`` .
+
+    :param c: Destination spectrum.
+
+    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
+
+    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
+
+    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+
+.. seealso:: :ocv:func:`mulSpectrums`
+
+
+
+gpu::mulAndScaleSpectrums
+-----------------------------
+Performs a per-element multiplication of two Fourier spectrums and scales the result.
+
+.. ocv:function:: void gpu::mulAndScaleSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream=Stream::Null() )
+
+    :param a: First spectrum.
+
+    :param b: Second spectrum with the same size and type as  ``a`` .
+
+    :param c: Destination spectrum.
+
+    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
+
+    :param scale: Scale constant.
+
+    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
+
+    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+
+.. seealso:: :ocv:func:`mulSpectrums`
+
+
+
+gpu::dft
+------------
+Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
+
+.. ocv:function:: void gpu::dft( const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream=Stream::Null() )
+
+    :param src: Source matrix (real or complex).
+
+    :param dst: Destination matrix (real or complex).
+
+    :param dft_size: Size of a discrete Fourier transform.
+
+    :param flags: Optional flags:
+
+        * **DFT_ROWS** transforms each individual row of the source matrix.
+
+        * **DFT_SCALE** scales the result: divide it by the number of elements in the transform (obtained from  ``dft_size`` ).
+
+        * **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real cases are always forward and inverse, respectively).
+
+        * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
+
+Use it to handle real matrices ( ``CV_32FC1`` ) and complex matrices in the interleaved format ( ``CV_32FC2`` ).
+
+The source matrix should be continuous, otherwise reallocation and data copying is performed. The function chooses an operation mode depending on the flags, size, and channel count of the source matrix:
+
+    * If the source matrix is complex and the output is not specified as real, the destination matrix is complex and has the ``dft_size``    size and ``CV_32FC2``    type. The destination matrix contains a full result of the DFT (forward or inverse).
+
+    * If the source matrix is complex and the output is specified as real, the function assumes that its input is the result of the forward transform (see the next item). The destination matrix has the ``dft_size`` size and ``CV_32FC1`` type. It contains the result of the inverse DFT.
+
+    * If the source matrix is real (its type is ``CV_32FC1`` ), forward DFT is performed. The result of the DFT is packed into complex ( ``CV_32FC2`` ) matrix. So, the width of the destination matrix is ``dft_size.width / 2 + 1`` . But if the source is a single column, the height is reduced instead of the width.
+
+.. seealso:: :ocv:func:`dft`
+
+
+gpu::ConvolveBuf
+----------------
+.. ocv:struct:: gpu::ConvolveBuf
+
+Class providing a memory buffer for the :ocv:func:`gpu::convolve` function. It also allows adjusting some specific parameters. ::
+
+    struct CV_EXPORTS ConvolveBuf
+    {
+        Size result_size;
+        Size block_size;
+        Size user_block_size;
+        Size dft_size;
+        int spect_len;
+
+        GpuMat image_spect, templ_spect, result_spect;
+        GpuMat image_block, templ_block, result_data;
+
+        void create(Size image_size, Size templ_size);
+        static Size estimateBlockSize(Size result_size, Size templ_size);
+    };
+
+You can use the ``user_block_size`` field to set a specific block size for the :ocv:func:`gpu::convolve` function. If you leave its default value ``Size(0,0)``, an automatic estimation of the block size is used (which is optimized for speed). By varying ``user_block_size``, you can reduce memory requirements at the cost of speed.
+
+gpu::ConvolveBuf::create
+------------------------
+.. ocv:function:: gpu::ConvolveBuf::create(Size image_size, Size templ_size)
+
+Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments.
+
+
+gpu::convolve
+-----------------
+Computes a convolution (or cross-correlation) of two images.
+
+.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false)
+
+.. ocv:function:: void gpu::convolve( const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream=Stream::Null() )
+
+    :param image: Source image. Only  ``CV_32FC1`` images are supported for now.
+
+    :param templ: Template image. The size is not greater than the  ``image`` size. The type is the same as  ``image`` .
+
+    :param result: Result image. If  ``image`` is  *W x H*  and ``templ`` is  *w x h*, then  ``result`` must be *W-w+1 x H-h+1*.
+
+    :param ccorr: Flag to evaluate cross-correlation instead of convolution.
+
+    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::ConvolveBuf`.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`gpu::filter2D`
+
+gpu::MatchTemplateBuf
+---------------------
+.. ocv:struct:: gpu::MatchTemplateBuf
+
+Class providing memory buffers for the :ocv:func:`gpu::matchTemplate` function. It also allows adjusting some specific parameters. ::
+
+    struct CV_EXPORTS MatchTemplateBuf
+    {
+        Size user_block_size;
+        GpuMat imagef, templf;
+        std::vector<GpuMat> images;
+        std::vector<GpuMat> image_sums;
+        std::vector<GpuMat> image_sqsums;
+    };
+
+You can use the ``user_block_size`` field to set a specific block size for the :ocv:func:`gpu::matchTemplate` function. If you leave its default value ``Size(0,0)``, an automatic estimation of the block size is used (which is optimized for speed). By varying ``user_block_size``, you can reduce memory requirements at the cost of speed.
+
+gpu::matchTemplate
+----------------------
+Computes a proximity map for a raster template and an image where the template is searched for.
+
+.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null())
+
+.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null())
+
+    :param image: Source image.  ``CV_32F`` and  ``CV_8U`` depth images (1..4 channels) are supported for now.
+
+    :param templ: Template image with the same type as  ``image`` . Its size must not be greater than the  ``image`` size.
+
+    :param result: Map containing comparison results ( ``CV_32FC1`` ). If  ``image`` is  *W x H*  and ``templ`` is  *w x h*, then  ``result`` must be *W-w+1 x H-h+1*.
+
+    :param method: Specifies the way to compare the template with the image.
+
+    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::MatchTemplateBuf`.
+
+    :param stream: Stream for the asynchronous version.
+
+    The following methods are supported for the ``CV_8U`` depth images for now:
+
+    * ``CV_TM_SQDIFF``
+    * ``CV_TM_SQDIFF_NORMED``
+    * ``CV_TM_CCORR``
+    * ``CV_TM_CCORR_NORMED``
+    * ``CV_TM_CCOEFF``
+    * ``CV_TM_CCOEFF_NORMED``
+
+    The following methods are supported for the ``CV_32F`` images for now:
+
+    * ``CV_TM_SQDIFF``
+    * ``CV_TM_CCORR``
+
+.. seealso:: :ocv:func:`matchTemplate`
+
+
+gpu::remap
+--------------
+Applies a generic geometrical transformation to an image.
+
+.. ocv:function:: void gpu::remap( const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
+
+    :param src: Source image.
+
+    :param dst: Destination image with the size the same as  ``xmap`` and the type the same as  ``src`` .
+
+    :param xmap: X values. Only  ``CV_32FC1`` type is supported.
+
+    :param ymap: Y values. Only  ``CV_32FC1`` type is supported.
+
+    :param interpolation: Interpolation method (see  :ocv:func:`resize` ). ``INTER_NEAREST`` , ``INTER_LINEAR`` and ``INTER_CUBIC`` are supported for now.
+
+    :param borderMode: Pixel extrapolation method (see  :ocv:func:`borderInterpolate` ). ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+
+    :param borderValue: Value used in case of a constant border. By default, it is 0.
+
+    :param stream: Stream for the asynchronous version.
+
+The function transforms the source image using the specified map:
+
+.. math::
+
+    \texttt{dst} (x,y) =  \texttt{src} (xmap(x,y), ymap(x,y))
+
+Values of pixels with non-integer coordinates are computed using the interpolation method specified by ``interpolation`` .
+
+.. seealso:: :ocv:func:`remap`
+
+
+
+gpu::cvtColor
+-----------------
+Converts an image from one color space to another.
+
+.. ocv:function:: void gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null())
+
+    :param src: Source image with  ``CV_8U`` , ``CV_16U`` , or  ``CV_32F`` depth and 1, 3, or 4 channels.
+
+    :param dst: Destination image with the same size and depth as  ``src`` .
+
+    :param code: Color space conversion code. For details, see  :ocv:func:`cvtColor` . Conversion to/from Luv and Bayer color spaces is not supported.
+
+    :param dcn: Number of channels in the destination image. If the parameter is 0, the number of the channels is derived automatically from  ``src`` and the  ``code`` .
+
+    :param stream: Stream for the asynchronous version.
+
+3-channel color spaces (like ``HSV``, ``XYZ``, and so on) can be stored in a 4-channel image for better performance.
+
+.. seealso:: :ocv:func:`cvtColor`
+
+
+
+gpu::swapChannels
+-----------------
+Exchanges the color channels of an image in-place.
+
+.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null())
+
+    :param image: Source image. Supports only ``CV_8UC4`` type.
+
+    :param dstOrder: Integer array describing how channel values are permuted. The n-th entry of the array contains the number of the channel that is stored in the n-th channel of the output image. For example, given an RGBA image, ``dstOrder = [3,2,1,0]`` converts it to the ABGR channel order.
+
+    :param stream: Stream for the asynchronous version.
+
+The method supports arbitrary permutations of the original channels, including replication.
+
+
+
+gpu::resize
+---------------
+Resizes an image.
+
+.. ocv:function:: void gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
+
+    :param src: Source image.
+
+    :param dst: Destination image  with the same type as  ``src`` . The size is ``dsize`` (when it is non-zero) or the size is computed from  ``src.size()`` , ``fx`` , and  ``fy`` .
+
+    :param dsize: Destination image size. If it is zero, it is computed as:
+
+        .. math::
+            \texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}
+
+        Either  ``dsize`` or both  ``fx`` and  ``fy`` must be non-zero.
+
+    :param fx: Scale factor along the horizontal axis. If it is zero, it is computed as:
+
+        .. math::
+
+            \texttt{(double)dsize.width/src.cols}
+
+    :param fy: Scale factor along the vertical axis. If it is zero, it is computed as:
+
+        .. math::
+
+            \texttt{(double)dsize.height/src.rows}
+
+    :param interpolation: Interpolation method. ``INTER_NEAREST`` , ``INTER_LINEAR`` and ``INTER_CUBIC`` are supported for now.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`resize`
+
+
+
+gpu::warpAffine
+-------------------
+Applies an affine transformation to an image.
+
+.. ocv:function:: void gpu::warpAffine( const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags=INTER_LINEAR, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
+
+    :param src: Source image.  ``CV_8U`` , ``CV_16U`` , ``CV_32S`` , or  ``CV_32F`` depth and 1, 3, or 4 channels are supported.
+
+    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
+
+    :param M: *2x3*  transformation matrix.
+
+    :param dsize: Size of the destination image.
+
+    :param flags: Combination of interpolation methods (see  :ocv:func:`resize`) and the optional flag  ``WARP_INVERSE_MAP`` specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ). Only ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` interpolation methods are supported.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`warpAffine`
+
+
+
+gpu::buildWarpAffineMaps
+------------------------
+Builds transformation maps for affine transformation.
+
+.. ocv:function:: void gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null())
+
+    :param M: *2x3*  transformation matrix.
+
+    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
+
+    :param dsize: Size of the destination image.
+
+    :param xmap: X values with  ``CV_32FC1`` type.
+
+    :param ymap: Y values with  ``CV_32FC1`` type.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`gpu::warpAffine` , :ocv:func:`gpu::remap`
+
+
+
+gpu::warpPerspective
+------------------------
+Applies a perspective transformation to an image.
+
+.. ocv:function:: void gpu::warpPerspective( const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags=INTER_LINEAR, int borderMode=BORDER_CONSTANT, Scalar borderValue=Scalar(), Stream& stream=Stream::Null() )
+
+    :param src: Source image. ``CV_8U`` , ``CV_16U`` , ``CV_32S`` , or  ``CV_32F`` depth and 1, 3, or 4 channels are supported.
+
+    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
+
+    :param M: *3x3* transformation matrix.
+
+    :param dsize: Size of the destination image.
+
+    :param flags: Combination of interpolation methods (see  :ocv:func:`resize` ) and the optional flag  ``WARP_INVERSE_MAP`` specifying that  ``M`` is the inverse transformation ( ``dst => src`` ). Only  ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` interpolation methods are supported.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`warpPerspective`
+
+
+
+gpu::buildWarpPerspectiveMaps
+-----------------------------
+Builds transformation maps for perspective transformation.
+
+.. ocv:function:: void gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null())
+
+    :param M: *3x3*  transformation matrix.
+
+    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
+
+    :param dsize: Size of the destination image.
+
+    :param xmap: X values with  ``CV_32FC1`` type.
+
+    :param ymap: Y values with  ``CV_32FC1`` type.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`gpu::warpPerspective` , :ocv:func:`gpu::remap`
+
+
+
+gpu::rotate
+---------------
+Rotates an image around the origin (0,0) and then shifts it.
+
+.. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
+
+    :param src: Source image. Supports 1, 3 or 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32F`` depth.
+
+    :param dst: Destination image with the same type as  ``src`` . The size is  ``dsize`` .
+
+    :param dsize: Size of the destination image.
+
+    :param angle: Angle of rotation in degrees.
+
+    :param xShift: Shift along the horizontal axis.
+
+    :param yShift: Shift along the vertical axis.
+
+    :param interpolation: Interpolation method. Only  ``INTER_NEAREST`` , ``INTER_LINEAR`` , and  ``INTER_CUBIC`` are supported.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`gpu::warpAffine`
+
+
+
+gpu::copyMakeBorder
+-----------------------
+Forms a border around an image.
+
+.. ocv:function:: void gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null())
+
+    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and  ``CV_32FC1`` types are supported.
+
+    :param dst: Destination image with the same type as  ``src``. The size is  ``Size(src.cols+left+right, src.rows+top+bottom)`` .
+
+    :param top:
+
+    :param bottom:
+
+    :param left:
+
+    :param right: Number of pixels in each direction from the source image rectangle to extrapolate. For example:  ``top=1, bottom=1, left=1, right=1`` mean that 1 pixel-wide border needs to be built.
+
+    :param borderType: Border type. See  :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+
+    :param value: Border value.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`copyMakeBorder`
+
+
+
+gpu::rectStdDev
+-------------------
+Computes a standard deviation of integral images.
+
+.. ocv:function:: void gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only the ``CV_32SC1`` type is supported.
+
+    :param sqr: Squared source image. Only  the ``CV_32FC1`` type is supported.
+
+    :param dst: Destination image with the same type and size as  ``src`` .
+
+    :param rect: Rectangular window.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::evenLevels
+-------------------
+Computes levels with even distribution.
+
+.. ocv:function:: void gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
+
+    :param levels: Destination array.  ``levels`` has 1 row, ``nLevels`` columns, and the ``CV_32SC1`` type.
+
+    :param nLevels: Number of computed levels.  ``nLevels`` must be at least 2.
+
+    :param lowerLevel: Lower boundary value of the lowest level.
+
+    :param upperLevel: Upper boundary value of the greatest level.
+
+
+
+gpu::histEven
+-----------------
+Calculates a histogram with evenly distributed bins.
+
+.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
+
+.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
+
+    :param src: Source image. ``CV_8U``, ``CV_16U``, or ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
+
+    :param hist: Destination histogram with one row, ``histSize`` columns, and the ``CV_32S`` type.
+
+    :param histSize: Size of the histogram.
+
+    :param lowerLevel: Lower boundary of lowest-level bin.
+
+    :param upperLevel: Upper boundary of highest-level bin.
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::histRange
+------------------
+Calculates a histogram with bins determined by the ``levels`` array.
+
+.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null())
+
+    :param src: Source image. ``CV_8U`` , ``CV_16U`` , or  ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
+
+    :param hist: Destination histogram with one row, ``(levels.cols-1)`` columns, and the  ``CV_32SC1`` type.
+
+    :param levels: Array of level boundaries that determine the histogram bins.
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::calcHist
+------------------
+Calculates a histogram for a single-channel 8-bit image.
+
+.. ocv:function:: void gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null())
+
+    :param src: Source image.
+
+    :param hist: Destination histogram with one row, 256 columns, and the  ``CV_32SC1`` type.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::equalizeHist
+------------------
+Equalizes the histogram of a grayscale image.
+
+.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null())
+
+    :param src: Source image.
+
+    :param dst: Destination image.
+
+    :param hist: Destination histogram with one row, 256 columns, and the  ``CV_32SC1`` type.
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`equalizeHist`
+
+
+
+gpu::buildWarpPlaneMaps
+-----------------------
+Builds plane warping maps.
+
+.. ocv:function:: void gpu::buildWarpPlaneMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, const Mat & T, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::buildWarpCylindricalMaps
+-----------------------------
+Builds cylindrical warping maps.
+
+.. ocv:function:: void gpu::buildWarpCylindricalMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::buildWarpSphericalMaps
+---------------------------
+Builds spherical warping maps.
+
+.. ocv:function:: void gpu::buildWarpSphericalMaps( Size src_size, Rect dst_roi, const Mat & K, const Mat& R, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream=Stream::Null() )
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::pyrDown
+-------------------
+Smoothes an image and downsamples it.
+
+.. ocv:function:: void gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
+
+    :param src: Source image.
+
+    :param dst: Destination image. Will have ``Size((src.cols+1)/2, (src.rows+1)/2)`` size and the same type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`pyrDown`
+
+
+
+gpu::pyrUp
+-------------------
+Upsamples an image and then smoothes it.
+
+.. ocv:function:: void gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
+
+    :param src: Source image.
+
+    :param dst: Destination image. Will have ``Size(src.cols*2, src.rows*2)`` size and the same type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`pyrUp`
+
+
+
+gpu::blendLinear
+-------------------
+Performs linear blending of two images.
+
+.. ocv:function:: void gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, GpuMat& result, Stream& stream = Stream::Null())
+
+    :param img1: First image. Supports only ``CV_8U`` and ``CV_32F`` depth.
+
+    :param img2: Second image. Must have the same size and the same type as ``img1`` .
+
+    :param weights1: Weights for the first image. Must have the same size as ``img1`` . Supports only ``CV_32F`` type.
+
+    :param weights2: Weights for the second image. Must have the same size as ``img2`` . Supports only ``CV_32F`` type.
+
+    :param result: Destination image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+gpu::bilateralFilter
+--------------------
+Performs bilateral filtering of the passed image.
+
+.. ocv:function:: void gpu::bilateralFilter( const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode=BORDER_DEFAULT, Stream& stream=Stream::Null() )
+
+    :param src: Source image. Supports only (channels != 2 && depth() != CV_8S && depth() != CV_32S && depth() != CV_64F).
+
+    :param dst: Destination image.
+
+    :param kernel_size: Kernel window size.
+
+    :param sigma_color: Filter sigma in the color space.
+
+    :param sigma_spatial:  Filter sigma in the coordinate space.
+
+    :param borderMode:  Border type. See :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso::
+
+    :ocv:func:`bilateralFilter`
+
+
+gpu::nonLocalMeans
+-------------------
+Performs pure non local means denoising without any simplification, and thus it is not fast.
+
+.. ocv:function:: void gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null())
+
+    :param src: Source image. Supports only CV_8UC1, CV_8UC2 and CV_8UC3.
+
+    :param dst: Destination image.
+
+    :param h: Filter sigma regulating filter strength for color.
+
+    :param search_window: Size of search window.
+
+    :param block_size: Size of block used for computing weights.
+
+    :param borderMode:  Border type. See :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+
+    :param s: Stream for the asynchronous version.
+
+.. seealso::
+
+    :ocv:func:`fastNlMeansDenoising`
+
+gpu::FastNonLocalMeansDenoising
+-------------------------------
+.. ocv:class:: gpu::FastNonLocalMeansDenoising
+
+    ::
+
+        class FastNonLocalMeansDenoising
+        {
+        public:
+            //! Simple method, recommended for grayscale images (though it supports multichannel images)
+            void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
+            //! Processes luminance and color components separately
+            void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
+        };
+
+The class implements fast approximate Non Local Means Denoising algorithm.
+
+gpu::FastNonLocalMeansDenoising::simpleMethod()
+-----------------------------------------------
+Performs image denoising using the Non-local Means Denoising algorithm http://www.ipol.im/pub/algo/bcm_non_local_means_denoising with several computational optimizations. Noise is expected to be Gaussian white noise.
+
+.. ocv:function:: void gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
+
+    :param src: Input 8-bit 1-channel, 2-channel or 3-channel image.
+
+    :param dst: Output image with the same size and type as  ``src`` .
+
+    :param h: Parameter regulating filter strength. Big h value perfectly removes noise but also removes image details, smaller h value preserves details but also preserves some noise
+
+    :param search_window: Size in pixels of the window that is used to compute weighted average for given pixel. Should be odd. Affect performance linearly: greater search_window - greater denoising time. Recommended value 21 pixels
+
+    :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value 7 pixels
+
+    :param s: Stream for the asynchronous invocations.
+
+This function is expected to be applied to grayscale images. For color images, see ``FastNonLocalMeansDenoising::labMethod``.
+
+.. seealso::
+
+    :ocv:func:`fastNlMeansDenoising`
+
+gpu::FastNonLocalMeansDenoising::labMethod()
+--------------------------------------------
+Modification of ``FastNonLocalMeansDenoising::simpleMethod`` for color images
+
+.. ocv:function:: void gpu::FastNonLocalMeansDenoising::labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null())
+
+    :param src: Input 8-bit 3-channel image.
+
+    :param dst: Output image with the same size and type as  ``src`` .
+
+    :param h_luminance: Parameter regulating filter strength. Big h value perfectly removes noise but also removes image details, smaller h value preserves details but also preserves some noise
+
+    :param h_color: The same as ``h_luminance`` but for color components. For most images, a value of 10 will be enough to remove colored noise without distorting colors.
+
+    :param search_window: Size in pixels of the window that is used to compute weighted average for given pixel. Should be odd. Affect performance linearly: greater search_window - greater denoising time. Recommended value 21 pixels
+
+    :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value 7 pixels
+
+    :param s: Stream for the asynchronous invocations.
+
+The function converts the image to the CIELAB colorspace and then separately denoises the L and AB components with the given h parameters using the ``FastNonLocalMeansDenoising::simpleMethod`` function.
+
+.. seealso::
+
+    :ocv:func:`fastNlMeansDenoisingColored`
+
+gpu::alphaComp
+-------------------
+Composites two images using alpha opacity values contained in each image.
+
+.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null())
+
+    :param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types.
+
+    :param img2: Second image. Must have the same size and the same type as ``img1`` .
+
+    :param dst: Destination image.
+
+    :param alpha_op: Flag specifying the alpha-blending operation:
+
+            * **ALPHA_OVER**
+            * **ALPHA_IN**
+            * **ALPHA_OUT**
+            * **ALPHA_ATOP**
+            * **ALPHA_XOR**
+            * **ALPHA_PLUS**
+            * **ALPHA_OVER_PREMUL**
+            * **ALPHA_IN_PREMUL**
+            * **ALPHA_OUT_PREMUL**
+            * **ALPHA_ATOP_PREMUL**
+            * **ALPHA_XOR_PREMUL**
+            * **ALPHA_PLUS_PREMUL**
+            * **ALPHA_PREMUL**
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::Canny
+-------------------
+Finds edges in an image using the [Canny86]_ algorithm.
+
+.. ocv:function:: void gpu::Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
+
+.. ocv:function:: void gpu::Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
+
+.. ocv:function:: void gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
+
+.. ocv:function:: void gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
+
+    :param image: Single-channel 8-bit input image.
+
+    :param dx: First derivative of image in the horizontal (x) direction. Supports only ``CV_32S`` type.
+
+    :param dy: First derivative of image in the vertical (y) direction. Supports only ``CV_32S`` type.
+
+    :param edges: Output edge map. It has the same size and type as  ``image`` .
+
+    :param low_thresh: First threshold for the hysteresis procedure.
+
+    :param high_thresh: Second threshold for the hysteresis procedure.
+
+    :param apperture_size: Aperture size for the  :ocv:func:`Sobel`  operator.
+
+    :param L2gradient: Flag indicating whether a more accurate  :math:`L_2`  norm  :math:`=\sqrt{(dI/dx)^2 + (dI/dy)^2}`  should be used to compute the image gradient magnitude ( ``L2gradient=true`` ), or a faster default  :math:`L_1`  norm  :math:`=|dI/dx|+|dI/dy|`  is enough ( ``L2gradient=false`` ).
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+.. seealso:: :ocv:func:`Canny`
+
+
+
+gpu::HoughLines
+---------------
+Finds lines in a binary image using the classical Hough transform.
+
+.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096)
+
+.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096)
+
+    :param src: 8-bit, single-channel binary source image.
+
+    :param lines: Output vector of lines. Each line is represented by a two-element vector  :math:`(\rho, \theta)` .  :math:`\rho`  is the distance from the coordinate origin  :math:`(0,0)`  (top-left corner of the image).  :math:`\theta`  is the line rotation angle in radians ( :math:`0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}` ).
+
+    :param rho: Distance resolution of the accumulator in pixels.
+
+    :param theta: Angle resolution of the accumulator in radians.
+
+    :param threshold: Accumulator threshold parameter. Only those lines are returned that get enough votes ( :math:`>\texttt{threshold}` ).
+
+    :param doSort: Performs lines sort by votes.
+
+    :param maxLines: Maximum number of output lines.
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+.. seealso:: :ocv:func:`HoughLines`
+
+
+
+gpu::HoughLinesDownload
+-----------------------
+Downloads results from :ocv:func:`gpu::HoughLines` to host memory.
+
+.. ocv:function:: void gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray())
+
+    :param d_lines: Result of :ocv:func:`gpu::HoughLines` .
+
+    :param h_lines: Output host array.
+
+    :param h_votes: Optional output array for line's votes.
+
+.. seealso:: :ocv:func:`gpu::HoughLines`
+
+
+
+gpu::HoughCircles
+-----------------
+Finds circles in a grayscale image using the Hough transform.
+
+.. ocv:function:: void gpu::HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
+
+.. ocv:function:: void gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
+
+    :param src: 8-bit, single-channel grayscale input image.
+
+    :param circles: Output vector of found circles. Each circle is encoded as a 3-element floating-point vector  :math:`(x, y, radius)` .
+
+    :param method: Detection method to use. Currently, the only implemented method is  ``CV_HOUGH_GRADIENT`` , which is basically  *21HT* , described in  [Yuen90]_.
+
+    :param dp: Inverse ratio of the accumulator resolution to the image resolution. For example, if  ``dp=1`` , the accumulator has the same resolution as the input image. If  ``dp=2`` , the accumulator has half as big width and height.
+
+    :param minDist: Minimum distance between the centers of the detected circles. If the parameter is too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is too large, some circles may be missed.
+
+    :param cannyThreshold: The higher threshold of the two passed to  the :ocv:func:`gpu::Canny`  edge detector (the lower one is half of it).
+
+    :param votesThreshold: The accumulator threshold for the circle centers at the detection stage. The smaller it is, the more false circles may be detected.
+
+    :param minRadius: Minimum circle radius.
+
+    :param maxRadius: Maximum circle radius.
+
+    :param maxCircles: Maximum number of output circles.
+
+    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+
+.. seealso:: :ocv:func:`HoughCircles`
+
+
+
+gpu::HoughCirclesDownload
+-------------------------
+Downloads results from :ocv:func:`gpu::HoughCircles` to host memory.
+
+.. ocv:function:: void gpu::HoughCirclesDownload(const GpuMat& d_circles, OutputArray h_circles)
+
+    :param d_circles: Result of :ocv:func:`gpu::HoughCircles` .
+
+    :param h_circles: Output host array.
+
+.. seealso:: :ocv:func:`gpu::HoughCircles`
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
new file mode 100644
index 0000000000..d602d0a131
--- /dev/null
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -0,0 +1,441 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPUIMGPROC_HPP__
+#define __OPENCV_GPUIMGPROC_HPP__
+
+#ifndef __cplusplus
+#  error gpuimgproc.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/gpufilters.hpp"
+#include "opencv2/imgproc.hpp"
+
+namespace cv { namespace gpu {
+
+enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
+       ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
+
+//! Composite two images using alpha opacity values contained in each image
+//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
+CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());
+
+//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
+//! supports only CV_32FC1 map type
+CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
+                      int interpolation, int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(),
+                      Stream& stream = Stream::Null());
+
+//! Does mean shift filtering on GPU.
+CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
+                                   TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
+                                   Stream& stream = Stream::Null());
+
+//! Does mean shift procedure on GPU.
+CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,
+                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
+                              Stream& stream = Stream::Null());
+
+//! Does mean shift segmentation with elimination of small regions.
+CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,
+                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+
+//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
+//! Supported types of input disparity: CV_8U, CV_16S.
+//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
+CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());
+
+//! Reprojects disparity image to 3D space.
+//! Supports CV_8U and CV_16S types of input disparity.
+//! The output is a 3- or 4-channel floating-point matrix.
+//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
+//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
+CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null());
+
+//! converts image from one color space to another
+CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
+
+enum
+{
+    // Bayer Demosaicing (Malvar, He, and Cutler)
+    COLOR_BayerBG2BGR_MHT = 256,
+    COLOR_BayerGB2BGR_MHT = 257,
+    COLOR_BayerRG2BGR_MHT = 258,
+    COLOR_BayerGR2BGR_MHT = 259,
+
+    COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT,
+    COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT,
+    COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT,
+    COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT,
+
+    COLOR_BayerBG2GRAY_MHT = 260,
+    COLOR_BayerGB2GRAY_MHT = 261,
+    COLOR_BayerRG2GRAY_MHT = 262,
+    COLOR_BayerGR2GRAY_MHT = 263
+};
+CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null());
+
+//! swap channels
+//! dstOrder - Integer array describing how channel values are permuted. The n-th entry
+//!            of the array contains the number of the channel that is stored in the n-th channel of
+//!            the output image. E.g. Given an RGBA image, dstOrder = [3,2,1,0] converts this to ABGR
+//!            channel order.
+CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null());
+
+//! Routines for correcting image color gamma
+CV_EXPORTS void gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward = true, Stream& stream = Stream::Null());
+
+//! resizes the image
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
+CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
+
+//! warps the image using affine transformation
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
+    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
+
+CV_EXPORTS void buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());
+
+//! warps the image using perspective transformation
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
+    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
+
+CV_EXPORTS void buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());
+
+//! builds plane warping maps
+CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale,
+                                   GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
+
+//! builds cylindrical warping maps
+CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
+                                         GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
+
+//! builds spherical warping maps
+CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
+                                       GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
+
+//! rotates an image around the origin (0,0) and then shifts it
+//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
+CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
+                       int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
+
+//! computes Harris cornerness criteria at each image pixel
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k,
+                             int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());
+
+//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize,
+    int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());
+
+struct CV_EXPORTS MatchTemplateBuf
+{
+    Size user_block_size;
+    GpuMat imagef, templf;
+    std::vector<GpuMat> images;
+    std::vector<GpuMat> image_sums;
+    std::vector<GpuMat> image_sqsums;
+};
+
+//! computes the proximity map for the raster template and the image where the template is searched for
+CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null());
+
+//! computes the proximity map for the raster template and the image where the template is searched for
+CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null());
+
+//! smoothes the source image and downsamples it
+CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! upsamples the source image and then smoothes it
+CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! performs linear blending of two images
+//! to avoid accuracy errors sum of weights shouldn't be very close to zero
+CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
+                            GpuMat& result, Stream& stream = Stream::Null());
+
+//! Performs bilateral filtering of the passed image
+CV_EXPORTS void bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial,
+                                int borderMode = BORDER_DEFAULT, Stream& stream = Stream::Null());
+
+//! Brute force non-local means algorithm (slow but universal)
+CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null());
+
+//! Fast (but approximate) version of non-local means algorithm similar to CPU function (running sums technique)
+class CV_EXPORTS FastNonLocalMeansDenoising
+{
+public:
+    //! Simple method, recommended for grayscale images (though it supports multichannel images)
+    void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
+
+    //! Processes luminance and color components separately
+    void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
+
+private:
+
+    GpuMat buffer, extended_src_buffer;
+    GpuMat lab, l, ab;
+};
+
+struct CV_EXPORTS CannyBuf
+{
+    void create(const Size& image_size, int apperture_size = 3);
+    void release();
+
+    GpuMat dx, dy;
+    GpuMat mag;
+    GpuMat map;
+    GpuMat st1, st2;
+    Ptr<FilterEngine_GPU> filterDX, filterDY;
+};
+
+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
+class CV_EXPORTS ImagePyramid
+{
+public:
+    inline ImagePyramid() : nLayers_(0) {}
+    inline ImagePyramid(const GpuMat& img, int nLayers, Stream& stream = Stream::Null())
+    {
+        build(img, nLayers, stream);
+    }
+
+    void build(const GpuMat& img, int nLayers, Stream& stream = Stream::Null());
+
+    void getLayer(GpuMat& outImg, Size outRoi, Stream& stream = Stream::Null()) const;
+
+    inline void release()
+    {
+        layer0_.release();
+        pyramid_.clear();
+        nLayers_ = 0;
+    }
+
+private:
+    GpuMat layer0_;
+    std::vector<GpuMat> pyramid_;
+    int nLayers_;
+};
+
+//! HoughLines
+
+struct HoughLinesBuf
+{
+    GpuMat accum;
+    GpuMat list;
+};
+
+CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
+CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
+CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
+
+//! HoughLinesP
+
+//! finds line segments in the black-n-white image using probabilistic Hough transform
+CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
+
+//! HoughCircles
+
+struct HoughCirclesBuf
+{
+    GpuMat edges;
+    GpuMat accum;
+    GpuMat list;
+    CannyBuf cannyBuf;
+};
+
+CV_EXPORTS void HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
+CV_EXPORTS void HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
+CV_EXPORTS void HoughCirclesDownload(const GpuMat& d_circles, OutputArray h_circles);
+
+//! finds arbitrary template in the grayscale image using Generalized Hough Transform
+//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
+//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
+class CV_EXPORTS GeneralizedHough_GPU : public cv::Algorithm
+{
+public:
+    static Ptr<GeneralizedHough_GPU> create(int method);
+
+    virtual ~GeneralizedHough_GPU();
+
+    //! set template to search
+    void setTemplate(const GpuMat& templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1));
+    void setTemplate(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter = Point(-1, -1));
+
+    //! find template on image
+    void detect(const GpuMat& image, GpuMat& positions, int cannyThreshold = 100);
+    void detect(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions);
+
+    void download(const GpuMat& d_positions, OutputArray h_positions, OutputArray h_votes = noArray());
+
+    void release();
+
+protected:
+    virtual void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter) = 0;
+    virtual void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions) = 0;
+    virtual void releaseImpl() = 0;
+
+private:
+    GpuMat edges_;
+    CannyBuf cannyBuf_;
+};
+
+//!performs labeling via graph cuts of a 2D regular 4-connected graph.
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
+                         GpuMat& buf, Stream& stream = Stream::Null());
+
+//!performs labeling via graph cuts of a 2D regular 8-connected graph.
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
+                         GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
+                         GpuMat& labels,
+                         GpuMat& buf, Stream& stream = Stream::Null());
+
+//! compute mask for Generalized Flood fill components labeling.
+CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
+
+//! performs connected components labeling.
+CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, int flags = 0, Stream& stream = Stream::Null());
+
+//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
+CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel);
+//! Calculates histogram with evenly distributed bins for single channel source.
+//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.
+//! Output hist will have one row and histSize cols and CV_32SC1 type.
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
+//! Calculates histogram with evenly distributed bins for four-channel source.
+//! All channels of source are processed separately.
+//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.
+//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
+//! Calculates histogram with bins determined by levels array.
+//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
+//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.
+//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null());
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null());
+//! Calculates histogram with bins determined by levels array.
+//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
+//! All channels of source are processed separately.
+//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
+//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null());
+
+//! Calculates histogram for 8u one channel image
+//! Output hist will have one row, 256 cols and CV_32SC1 type.
+CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
+CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
+
+//! normalizes the grayscale image brightness and contrast by normalizing its histogram
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
+
+class CV_EXPORTS CLAHE : public cv::CLAHE
+{
+public:
+    using cv::CLAHE::apply;
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream) = 0;
+};
+CV_EXPORTS Ptr<cv::gpu::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+
+class CV_EXPORTS GoodFeaturesToTrackDetector_GPU
+{
+public:
+    explicit GoodFeaturesToTrackDetector_GPU(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
+        int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
+
+    //! returns a single-row matrix with CV_32FC2 type
+    void operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask = GpuMat());
+
+    int maxCorners;
+    double qualityLevel;
+    double minDistance;
+
+    int blockSize;
+    bool useHarrisDetector;
+    double harrisK;
+
+    void releaseMemory()
+    {
+        Dx_.release();
+        Dy_.release();
+        buf_.release();
+        eig_.release();
+        minMaxbuf_.release();
+        tmpCorners_.release();
+    }
+
+private:
+    GpuMat Dx_;
+    GpuMat Dy_;
+    GpuMat buf_;
+    GpuMat eig_;
+    GpuMat minMaxbuf_;
+    GpuMat tmpCorners_;
+};
+
+inline GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU(int maxCorners_, double qualityLevel_, double minDistance_,
+        int blockSize_, bool useHarrisDetector_, double harrisK_)
+{
+    maxCorners = maxCorners_;
+    qualityLevel = qualityLevel_;
+    minDistance = minDistance_;
+    blockSize = blockSize_;
+    useHarrisDetector = useHarrisDetector_;
+    harrisK = harrisK_;
+}
+
+}} // namespace cv { namespace gpu {
+
+#endif /* __OPENCV_GPUIMGPROC_HPP__ */
diff --git a/modules/gpuimgproc/perf/perf_denoising.cpp b/modules/gpuimgproc/perf/perf_denoising.cpp
new file mode 100644
index 0000000000..1e33601d60
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_denoising.cpp
@@ -0,0 +1,230 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace testing;
+using namespace perf;
+
+#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p) // image-size generator shared by the denoising perf tests in this file
+
+//////////////////////////////////////////////////////////////////////
+// BilateralFilter: cv::gpu::bilateralFilter timed against cv::bilateralFilter
+
+DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);
+
+PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values(CV_8U, CV_32F),
+                    GPU_CHANNELS_1_3,
+                    Values(3, 5, 9)))
+{
+    declare.time(60.0);
+
+    // Parameter tuple layout: (size, depth, channel count, kernel size).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int ksize = GET_PARAM(3);
+
+    // Fixed filter settings handed to both code paths.
+    const float sigmaColor = 7;
+    const float sigmaSpatial = 5;
+    const int border = cv::BORDER_REFLECT101;
+
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, cn));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::bilateralFilter(gpuSrc, dst, ksize, sigmaColor, sigmaSpatial, border);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::bilateralFilter(src, dst, ksize, sigmaColor, sigmaSpatial, border);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// nonLocalMeans: timing of cv::gpu::nonLocalMeans (no CPU branch)
+
+DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
+
+PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(5)))
+{
+    declare.time(600.0);
+
+    // Parameter tuple layout: (size, depth, channel count, search window, block size).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int searchWindow = GET_PARAM(3);
+    const int blockSize = GET_PARAM(4);
+
+    const float filterStrength = 10;
+    const int border = cv::BORDER_REFLECT101;
+
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, cn));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::nonLocalMeans(gpuSrc, dst, filterStrength, searchWindow, blockSize, border);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        // No CPU call is timed here; the non-GPU run goes through FAIL_NO_CPU().
+        FAIL_NO_CPU();
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// fastNonLocalMeans: FastNonLocalMeansDenoising::simpleMethod vs cv::fastNlMeansDenoising
+
+DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
+
+PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(7)))
+{
+    declare.time(60.0);
+
+    // Tuple layout is (size, depth, channel count, search window, block size), so the two
+    // window sizes are read from indices 3 and 4; index 2 holds the MatCn entry.
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int search_window_size = GET_PARAM(3);
+    const int block_size = GET_PARAM(4);
+
+    const float h = 10;
+    cv::Mat src(size, CV_MAKE_TYPE(depth, 1)); // NOTE(review): always single-channel; the MatCn tuple entry is not consumed
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::FastNonLocalMeansDenoising fnlmd;
+
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_window_size, block_size);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_window_size); // CPU call takes block size before search window
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// fastNonLocalMeans (colored): FastNonLocalMeansDenoising::labMethod vs cv::fastNlMeansDenoisingColored
+
+DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);
+
+PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    Values(21),
+                    Values(7)))
+{
+    declare.time(60.0);
+
+    // Parameter tuple layout: (size, depth, search window, block size).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int searchWindow = GET_PARAM(2);
+    const int blockSize = GET_PARAM(3);
+
+    const float filterStrength = 10; // passed twice to each denoising call below
+
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, 3));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::FastNonLocalMeansDenoising denoiser;
+
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() denoiser.labMethod(gpuSrc, dst, filterStrength, filterStrength, searchWindow, blockSize);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, filterStrength, filterStrength, blockSize, searchWindow);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
diff --git a/modules/gpuimgproc/perf/perf_imgproc.cpp b/modules/gpuimgproc/perf/perf_imgproc.cpp
new file mode 100644
index 0000000000..349dcc825d
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_imgproc.cpp
@@ -0,0 +1,1631 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace testing;
+using namespace perf;
+
+//////////////////////////////////////////////////////////////////////
+// Remap: coordinate-map layouts selectable by the remap perf test
+
+enum { HALF_SIZE=0, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH }; // the four values handled by the switch in generateMap()
+CV_ENUM(RemapMode, HALF_SIZE, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH); // RemapMode is used as a test parameter type (RemapMode::all())
+
+// Fills the float maps map_x / map_y with source coordinates for the selected remapMode.
+// Extents are taken from map_x only; an unrecognised remapMode leaves both maps untouched.
+void generateMap(cv::Mat& map_x, cv::Mat& map_y, int remapMode)
+{
+    const int nrows = map_x.rows;
+    const int ncols = map_x.cols;
+    for (int y = 0; y < nrows; ++y)
+    {
+        for (int x = 0; x < ncols; ++x)
+        {
+            switch (remapMode)
+            {
+            case HALF_SIZE:
+            {
+                // Strictly inside the 25%..75% window: doubled offset plus 0.5; everywhere else: (0, 0).
+                const bool inside = x > ncols*0.25 && x < ncols*0.75 && y > nrows*0.25 && y < nrows*0.75;
+                map_x.at<float>(y,x) = inside ? 2.f * (x - ncols * 0.25f) + 0.5f : 0.f;
+                map_y.at<float>(y,x) = inside ? 2.f * (y - nrows * 0.25f) + 0.5f : 0.f;
+                break;
+            }
+            case UPSIDE_DOWN: // NOTE(review): nrows - y equals nrows at y == 0, one past the last row index
+                map_x.at<float>(y,x) = static_cast<float>(x);
+                map_y.at<float>(y,x) = static_cast<float>(nrows - y);
+                break;
+            case REFLECTION_X: // NOTE(review): ncols - x equals ncols at x == 0, one past the last column index
+                map_x.at<float>(y,x) = static_cast<float>(ncols - x);
+                map_y.at<float>(y,x) = static_cast<float>(y);
+                break;
+            case REFLECTION_BOTH: // combines the two mirrored coordinates above
+                map_x.at<float>(y,x) = static_cast<float>(ncols - x);
+                map_y.at<float>(y,x) = static_cast<float>(nrows - y);
+                break;
+            } // end of switch
+        }
+    }
+}
+
+DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border_Mode, cv::Size, MatDepth, MatCn, Interpolation, BorderMode, RemapMode);
+
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+                    ALL_BORDER_MODES,
+                    RemapMode::all()))
+{
+    declare.time(20.0);
+
+    // Parameter tuple layout: (size, depth, channel count, interpolation, border mode, map layout).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int interp = GET_PARAM(3);
+    const int border = GET_PARAM(4);
+    const int mapLayout = GET_PARAM(5);
+
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, cn));
+    declare.in(src, WARMUP_RNG);
+
+    // Both coordinate maps are CV_32FC1 and have the same size as the source.
+    cv::Mat mapX(imgSize, CV_32FC1);
+    cv::Mat mapY(imgSize, CV_32FC1);
+    generateMap(mapX, mapY, mapLayout);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        const cv::gpu::GpuMat gpuMapX(mapX);
+        const cv::gpu::GpuMat gpuMapY(mapY);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::remap(gpuSrc, dst, gpuMapX, gpuMapY, interp, border);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::remap(src, dst, mapX, mapY, interp, border);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Resize: cv::gpu::resize vs cv::resize, driven by a scale factor
+
+DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Scale, cv::Size, MatDepth, MatCn, Interpolation, double);
+
+PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+                    Values(0.5, 0.3, 2.0)))
+{
+    declare.time(20.0);
+
+    // Parameter tuple layout: (size, depth, channel count, interpolation, scale factor).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int interp = GET_PARAM(3);
+    const double scale = GET_PARAM(4);
+
+    // The same factor is used for both axes; an empty cv::Size() is passed as the explicit output size.
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, cn));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::resize(gpuSrc, dst, cv::Size(), scale, scale, interp);
+
+        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::resize(src, dst, cv::Size(), scale, scale, interp);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// ResizeArea: the resize comparison with interpolation fixed to cv::INTER_AREA
+
+DEF_PARAM_TEST(Sz_Depth_Cn_Scale, cv::Size, MatDepth, MatCn, double);
+
+PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(0.2, 0.1, 0.05)))
+{
+    declare.time(1.0);
+
+    // Parameter tuple layout: (size, depth, channel count, scale factor).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const double scale = GET_PARAM(3);
+    const int interp = cv::INTER_AREA;
+
+    // All listed scale factors are below 1, so every run shrinks the input.
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, cn));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::resize(gpuSrc, dst, cv::Size(), scale, scale, interp);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::resize(src, dst, cv::Size(), scale, scale, interp);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// WarpAffine: cv::gpu::warpAffine vs cv::warpAffine with a fixed 2x3 matrix
+
+DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border, cv::Size, MatDepth, MatCn, Interpolation, BorderMode);
+
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+                    ALL_BORDER_MODES))
+{
+    declare.time(20.0);
+
+    // Parameter tuple layout: (size, depth, channel count, interpolation, border mode).
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int interpolation = GET_PARAM(3);
+    const int borderMode = GET_PARAM(4);
+
+    // Random source image of the requested type; the output keeps the source size.
+    cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
+    declare.in(src, WARMUP_RNG);
+
+    // src.cols / 2 stays an integer division; the cast only avoids int->double narrowing inside the braces.
+    const double alpha = CV_PI / 4;
+    const double mat[2 * 3] = {
+        std::cos(alpha), -std::sin(alpha), static_cast<double>(src.cols / 2),
+        std::sin(alpha),  std::cos(alpha), 0
+    };
+    const cv::Mat M(2, 3, CV_64F, const_cast<double*>(mat)); // header over `mat`; the constructor wants a non-const pointer
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::warpAffine(d_src, dst, M, size, interpolation, borderMode);
+
+        GPU_SANITY_CHECK(dst, 1);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::warpAffine(src, dst, M, size, interpolation, borderMode);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// WarpPerspective: cv::gpu::warpPerspective vs cv::warpPerspective with a fixed 3x3 matrix
+
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+                    ALL_BORDER_MODES))
+{
+    declare.time(20.0);
+
+    // Parameter tuple layout: (size, depth, channel count, interpolation, border mode).
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int interpolation = GET_PARAM(3);
+    const int borderMode = GET_PARAM(4);
+
+    // Random source image of the requested type; the output keeps the source size.
+    cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
+    declare.in(src, WARMUP_RNG);
+
+    const double alpha = CV_PI / 4;
+    double mat[3][3] = { {std::cos(alpha), -std::sin(alpha), static_cast<double>(src.cols / 2)}, // cast avoids int->double narrowing; value unchanged
+                         {std::sin(alpha),  std::cos(alpha), 0},
+                         {0.0,              0.0,             1.0}};
+    const cv::Mat M(3, 3, CV_64F, static_cast<void*>(mat)); // header over the local array
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::warpPerspective(d_src, dst, M, size, interpolation, borderMode);
+
+        GPU_SANITY_CHECK(dst, 1);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Threshold: cv::gpu::threshold vs cv::threshold for every ThreshOp value
+
+CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV)
+
+DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp);
+
+PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+            Values(CV_8U, CV_16U, CV_32F, CV_64F),
+            ThreshOp::all()))
+{
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int op = GET_PARAM(2);
+    // The Mat type is the bare depth constant (no channel count is combined in).
+    cv::Mat src(imgSize, matDepth);
+    declare.in(src, WARMUP_RNG);
+    const double thresh = 100.0, maxVal = 255.0;
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::threshold(gpuSrc, dst, thresh, maxVal, op);
+
+        GPU_SANITY_CHECK(dst, 1e-10);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::threshold(src, dst, thresh, maxVal, op);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// HistEvenC1: cv::gpu::histEven vs cv::calcHist, 30 bins over [0, 180)
+
+PERF_TEST_P(Sz_Depth, HistEvenC1,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_16S)))
+{
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+
+    cv::Mat src(imgSize, matDepth);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat gpuBuf;
+
+        TEST_CYCLE() cv::gpu::histEven(gpuSrc, dst, gpuBuf, 30, 0, 180);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        const int binCount = 30;
+        const float binRange[] = {0.0f, 180.0f};
+        const int histSizes[] = {binCount};
+        const float* rangeList[] = {binRange};
+        const int channelIdx[] = {0};
+        // One image, one histogram dimension, empty mask.
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::calcHist(&src, 1, channelIdx, cv::Mat(), dst, 1, histSizes, rangeList);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// HistEvenC4: four-channel cv::gpu::histEven (no CPU branch)
+
+PERF_TEST_P(Sz_Depth, HistEvenC4,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_16S)))
+{
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+
+    cv::Mat src(imgSize, CV_MAKE_TYPE(matDepth, 4));
+    declare.in(src, WARMUP_RNG);
+    // Per-channel settings: one entry for each of the four channels.
+    int binCounts[] = {30, 30, 30, 30};
+    int lowBounds[] = {0, 0, 0, 0};
+    int highBounds[] = {180, 180, 180, 180};
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat gpuHist[4];
+        cv::gpu::GpuMat gpuBuf;
+
+        TEST_CYCLE() cv::gpu::histEven(gpuSrc, gpuHist, gpuBuf, binCounts, lowBounds, highBounds);
+        // Each histogram is downloaded and checked under its own name.
+        cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3;
+        gpuHist[0].download(cpu_hist0);
+        gpuHist[1].download(cpu_hist1);
+        gpuHist[2].download(cpu_hist2);
+        gpuHist[3].download(cpu_hist3);
+        SANITY_CHECK(cpu_hist0);
+        SANITY_CHECK(cpu_hist1);
+        SANITY_CHECK(cpu_hist2);
+        SANITY_CHECK(cpu_hist3);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// CalcHist: cv::gpu::calcHist on a CV_8UC1 image (no CPU branch)
+
+PERF_TEST_P(Sz, CalcHist,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size imgSize = GetParam();
+    // The size is the only test parameter; the type is fixed.
+    cv::Mat src(imgSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::calcHist(gpuSrc, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// EqualizeHist: cv::gpu::equalizeHist (hist + buffer overload) vs cv::equalizeHist
+
+PERF_TEST_P(Sz, EqualizeHist,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size imgSize = GetParam();
+    // The size is the only test parameter; the type is fixed.
+    cv::Mat src(imgSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat gpuHist;
+        cv::gpu::GpuMat gpuBuf;
+
+        TEST_CYCLE() cv::gpu::equalizeHist(gpuSrc, dst, gpuHist, gpuBuf);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::equalizeHist(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double);
+// CLAHE: cv::gpu::createCLAHE vs cv::createCLAHE with clip limits 0.0 and 40.0
+PERF_TEST_P(Sz_ClipLimit, CLAHE,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(0.0, 40.0)))
+{
+    const cv::Size imgSize = GET_PARAM(0);
+    const double clip = GET_PARAM(1);
+
+    cv::Mat src(imgSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::Ptr<cv::gpu::CLAHE> gpuEqualizer = cv::gpu::createCLAHE(clip);
+        cv::gpu::GpuMat gpuSrc(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() gpuEqualizer->apply(gpuSrc, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Ptr<cv::CLAHE> cpuEqualizer = cv::createCLAHE(clip);
+        cv::Mat dst;
+
+        TEST_CYCLE() cpuEqualizer->apply(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Canny: cv::gpu::Canny (with a CannyBuf) vs cv::Canny on grayscale test images
+
+DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);
+
+PERF_TEST_P(Image_AppertureSz_L2gradient, Canny,
+            Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
+                    Values(3, 5),
+                    Bool()))
+{
+    const string imagePath = GET_PARAM(0);
+    const int apertureSize = GET_PARAM(1);
+    const bool l2Gradient = GET_PARAM(2);
+    // Abort the test when the image could not be read.
+    const cv::Mat image = readImage(imagePath, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+
+    const double lowThreshold = 50.0;
+    const double highThreshold = 100.0;
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuImage(image);
+        cv::gpu::GpuMat dst;
+        cv::gpu::CannyBuf cannyBuf;
+
+        TEST_CYCLE() cv::gpu::Canny(gpuImage, cannyBuf, dst, lowThreshold, highThreshold, apertureSize, l2Gradient);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::Canny(image, dst, lowThreshold, highThreshold, apertureSize, l2Gradient);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// MeanShiftFiltering: cv::gpu::meanShiftFiltering vs cv::pyrMeanShiftFiltering
+
+DEF_PARAM_TEST_1(Image, string);
+
+PERF_TEST_P(Image, MeanShiftFiltering,
+            Values<string>("gpu/meanshift/cones.png"))
+{
+    declare.time(300.0);
+
+    const cv::Mat img = readImage(GetParam());
+    ASSERT_FALSE(img.empty());
+    // The GPU path is fed the BGR2BGRA-converted copy; the CPU path uses img directly.
+    cv::Mat bgra;
+    cv::cvtColor(img, bgra, cv::COLOR_BGR2BGRA);
+
+    const int spWindow = 50;
+    const int srWindow = 50;
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(bgra);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::meanShiftFiltering(gpuSrc, dst, spWindow, srWindow);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::pyrMeanShiftFiltering(img, dst, spWindow, srWindow);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// MeanShiftProc: cv::gpu::meanShiftProc with two outputs (no CPU branch)
+
+PERF_TEST_P(Image, MeanShiftProc,
+            Values<string>("gpu/meanshift/cones.png"))
+{
+    declare.time(300.0);
+
+    const cv::Mat img = readImage(GetParam());
+    ASSERT_FALSE(img.empty());
+    // The GPU call receives the BGR2BGRA-converted copy of the image.
+    cv::Mat bgra;
+    cv::cvtColor(img, bgra, cv::COLOR_BGR2BGRA);
+
+    const int spWindow = 50;
+    const int srWindow = 50;
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(bgra);
+        cv::gpu::GpuMat dstr;
+        cv::gpu::GpuMat dstsp;
+
+        TEST_CYCLE() cv::gpu::meanShiftProc(gpuSrc, dstr, dstsp, spWindow, srWindow);
+
+        GPU_SANITY_CHECK(dstr);
+        GPU_SANITY_CHECK(dstsp);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// MeanShiftSegmentation: cv::gpu::meanShiftSegmentation writing into a host cv::Mat (no CPU branch)
+
+PERF_TEST_P(Image, MeanShiftSegmentation,
+            Values<string>("gpu/meanshift/cones.png"))
+{
+    declare.time(300.0);
+
+    const cv::Mat img = readImage(GetParam());
+    ASSERT_FALSE(img.empty());
+    // The GPU call receives the BGR2BGRA-converted copy of the image.
+    cv::Mat bgra;
+    cv::cvtColor(img, bgra, cv::COLOR_BGR2BGRA);
+
+    const int spWindow = 10;
+    const int srWindow = 10;
+    const int minSegmentSize = 20;
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSrc(bgra);
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::gpu::meanShiftSegmentation(gpuSrc, dst, spWindow, srWindow, minSegmentSize);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// BlendLinear: cv::gpu::blendLinear with constant 0.5 weights (no CPU branch)
+
+PERF_TEST_P(Sz_Depth_Cn, BlendLinear,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_32F),
+                    GPU_CHANNELS_1_3_4))
+{
+    // Parameter tuple layout: (size, depth, channel count).
+    const cv::Size imgSize = GET_PARAM(0);
+    const int matDepth = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int matType = CV_MAKE_TYPE(matDepth, cn);
+
+    cv::Mat img1(imgSize, matType);
+    cv::Mat img2(imgSize, matType);
+    declare.in(img1, img2, WARMUP_RNG);
+
+    const cv::Mat halfWeights1(imgSize, CV_32FC1, cv::Scalar::all(0.5));
+    const cv::Mat halfWeights2(imgSize, CV_32FC1, cv::Scalar::all(0.5));
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuImg1(img1);
+        const cv::gpu::GpuMat gpuImg2(img2);
+        const cv::gpu::GpuMat gpuWeights1(halfWeights1);
+        const cv::gpu::GpuMat gpuWeights2(halfWeights2);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::blendLinear(gpuImg1, gpuImg2, gpuWeights1, gpuWeights2, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate8U: cv::gpu::matchTemplate vs cv::matchTemplate on CV_8U data, all TemplateMethod values
+
+CV_ENUM(TemplateMethod, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
+
+DEF_PARAM_TEST(Sz_TemplateSz_Cn_Method, cv::Size, cv::Size, MatCn, TemplateMethod);
+
+PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(cv::Size(5, 5), cv::Size(16, 16), cv::Size(30, 30)),
+                    GPU_CHANNELS_1_3_4,
+                    TemplateMethod::all()))
+{
+    declare.time(300.0);
+    // Parameter tuple layout: (image size, template size, channel count, method).
+    const cv::Size size = GET_PARAM(0);
+    const cv::Size templ_size = GET_PARAM(1);
+    const int cn = GET_PARAM(2);
+    const int method = GET_PARAM(3);
+
+    cv::Mat image(size, CV_MAKE_TYPE(CV_8U, cn));
+    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_8U, cn));
+    declare.in(image, templ, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_image(image);
+        const cv::gpu::GpuMat d_templ(templ);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+
+        GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::matchTemplate(image, templ, dst, method);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate32F: the CV_32F variant, restricted to TM_SQDIFF and TM_CCORR
+
+PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate32F,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(cv::Size(5, 5), cv::Size(16, 16), cv::Size(30, 30)),
+                    GPU_CHANNELS_1_3_4,
+                    Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))))
+{
+    declare.time(300.0);
+    // Parameter tuple layout: (image size, template size, channel count, method).
+    const cv::Size imgSize = GET_PARAM(0);
+    const cv::Size patchSize = GET_PARAM(1);
+    const int channelCount = GET_PARAM(2);
+    const int matchMethod = GET_PARAM(3);
+
+    cv::Mat searchImg(imgSize, CV_MAKE_TYPE(CV_32F, channelCount));
+    cv::Mat patch(patchSize, CV_MAKE_TYPE(CV_32F, channelCount));
+    declare.in(searchImg, patch, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuSearchImg(searchImg);
+        const cv::gpu::GpuMat gpuPatch(patch);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::matchTemplate(gpuSearchImg, gpuPatch, dst, matchMethod);
+
+        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::matchTemplate(searchImg, patch, dst, matchMethod);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// CornerHarris: cv::gpu::cornerHarris (with Dx/Dy/buf outputs) vs cv::cornerHarris
+
+DEF_PARAM_TEST(Image_Type_Border_BlockSz_ApertureSz, string, MatType, BorderMode, int, int);
+
+PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, CornerHarris,
+            Combine(Values<string>("gpu/stereobm/aloe-L.png"),
+                    Values(CV_8UC1, CV_32FC1),
+                    Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
+                    Values(3, 5, 7),
+                    Values(0, 3, 5, 7)))
+{
+    const string imagePath = GET_PARAM(0);
+    const int matType = GET_PARAM(1);
+    const int border = GET_PARAM(2);
+    const int block = GET_PARAM(3);
+    const int aperture = GET_PARAM(4);
+
+    cv::Mat img = readImage(imagePath, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());
+    const double pixelScale = (matType == CV_32F) ? 1.0 / 255.0 : 1.0; // float input is scaled by 1/255
+    img.convertTo(img, matType, pixelScale);
+
+    const double harrisK = 0.5;
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat gpuImg(img);
+        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat gpuDx;
+        cv::gpu::GpuMat gpuDy;
+        cv::gpu::GpuMat gpuBuf;
+
+        TEST_CYCLE() cv::gpu::cornerHarris(gpuImg, dst, gpuDx, gpuDy, gpuBuf, block, aperture, harrisK, border);
+
+        GPU_SANITY_CHECK(dst, 1e-4);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::cornerHarris(img, dst, block, aperture, harrisK, border);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// CornerMinEigenVal: times cv::gpu::cornerMinEigenVal (GPU run) or cv::cornerMinEigenVal (CPU run) on one grayscale image.
+
+PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, CornerMinEigenVal,
+            Combine(Values<string>("gpu/stereobm/aloe-L.png"),
+                    Values(CV_8UC1, CV_32FC1),
+                    Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
+                    Values(3, 5, 7),
+                    Values(0, 3, 5, 7)))
+{
+    const string fileName = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int borderMode = GET_PARAM(2);
+    const int blockSize = GET_PARAM(3);
+    const int apertureSize = GET_PARAM(4);
+
+    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());
+    // Convert in place to the requested type; pixels are scaled by 1/255 only when type == CV_32F.
+    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat d_Dx;
+        cv::gpu::GpuMat d_Dy;
+        cv::gpu::GpuMat d_buf;
+        // Only this call is wrapped by TEST_CYCLE(); all GpuMat objects are declared before it.
+        TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode);
+
+        GPU_SANITY_CHECK(dst, 1e-4);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderMode);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// BuildWarpPlaneMaps: times cv::gpu::buildWarpPlaneMaps for each size in GPU_TYPICAL_MAT_SIZES (GPU run only).
+
+PERF_TEST_P(Sz, ImgProc_BuildWarpPlaneMaps,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+    // Fixed inputs: K is the 3x3 identity, R a 3x3 matrix of ones, T a 1x3 row of zeros.
+    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
+    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
+    const cv::Mat T = cv::Mat::zeros(1, 3, CV_32F);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat map_x;
+        cv::gpu::GpuMat map_y;
+        // The rectangle passed to the call is (0, 0, size.width, size.height).
+        TEST_CYCLE() cv::gpu::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, T, 1.0, map_x, map_y);
+
+        GPU_SANITY_CHECK(map_x);
+        GPU_SANITY_CHECK(map_y);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// BuildWarpCylindricalMaps: times cv::gpu::buildWarpCylindricalMaps for each size in GPU_TYPICAL_MAT_SIZES (GPU run only).
+
+PERF_TEST_P(Sz, ImgProc_BuildWarpCylindricalMaps,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+    // Fixed inputs: K is the 3x3 identity and R a 3x3 matrix of ones.
+    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
+    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat map_x;
+        cv::gpu::GpuMat map_y;
+        // The rectangle passed to the call is (0, 0, size.width, size.height).
+        TEST_CYCLE() cv::gpu::buildWarpCylindricalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
+
+        GPU_SANITY_CHECK(map_x);
+        GPU_SANITY_CHECK(map_y);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// BuildWarpSphericalMaps: times cv::gpu::buildWarpSphericalMaps for each size in GPU_TYPICAL_MAT_SIZES (GPU run only).
+
+PERF_TEST_P(Sz, ImgProc_BuildWarpSphericalMaps,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+    // Fixed inputs: K is the 3x3 identity and R a 3x3 matrix of ones.
+    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
+    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat map_x;
+        cv::gpu::GpuMat map_y;
+        // The rectangle passed to the call is (0, 0, size.width, size.height).
+        TEST_CYCLE() cv::gpu::buildWarpSphericalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
+
+        GPU_SANITY_CHECK(map_x);
+        GPU_SANITY_CHECK(map_y);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Rotate: times cv::gpu::rotate over size, depth, channel count and interpolation mode (GPU run only).
+
+DEF_PARAM_TEST(Sz_Depth_Cn_Inter, cv::Size, MatDepth, MatCn, Interpolation);
+
+PERF_TEST_P(Sz_Depth_Cn_Inter, ImgProc_Rotate,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int interpolation = GET_PARAM(3);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+    // src is only allocated here; declare.in(..., WARMUP_RNG) presumably fills it with random data -- confirm.
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+        // `size` is passed again as the third argument; 30.0, 0, 0 are presumably angle and x/y shift -- confirm against gpu::rotate.
+        TEST_CYCLE() cv::gpu::rotate(d_src, dst, size, 30.0, 0, 0, interpolation);
+
+        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// PyrDown: times cv::gpu::pyrDown (GPU run) or cv::pyrDown (CPU run) over size, depth and channel count.
+
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+    // src is only allocated here; declare.in(..., WARMUP_RNG) presumably fills it with random data -- confirm.
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::pyrDown(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::pyrDown(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// PyrUp: times cv::gpu::pyrUp (GPU run) or cv::pyrUp (CPU run) over size, depth and channel count.
+
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+    // src is only allocated here; declare.in(..., WARMUP_RNG) presumably fills it with random data -- confirm.
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::pyrUp(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::pyrUp(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// CvtColor: times cv::gpu::cvtColor (GPU run) or cv::cvtColor (CPU run) for a list of colour conversion codes.
+
+DEF_PARAM_TEST(Sz_Depth_Code, cv::Size, MatDepth, CvtColorInfo);
+// The body reads info.scn, info.dcn and info.code; the entries below appear to be (source channels, destination channels, code).
+PERF_TEST_P(Sz_Depth_Code, CvtColor,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_32F),
+                    Values(CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA),
+                           CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY),
+                           CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2XYZ),
+                           CvtColorInfo(3, 3, cv::COLOR_XYZ2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2YCrCb),
+                           CvtColorInfo(3, 3, cv::COLOR_YCrCb2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2YUV),
+                           CvtColorInfo(3, 3, cv::COLOR_YUV2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
+                           CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
+                           CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
+                           CvtColorInfo(3, 3, cv::COLOR_LBGR2Lab),
+                           CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
+                           CvtColorInfo(3, 3, cv::COLOR_LBGR2Luv),
+                           CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
+                           CvtColorInfo(3, 3, cv::COLOR_Lab2LBGR),
+                           CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
+                           CvtColorInfo(3, 3, cv::COLOR_Luv2LRGB))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const CvtColorInfo info = GET_PARAM(2);
+    // Random input between 0 and 255 for CV_8U depth, between 0 and 1 otherwise.
+    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
+    cv::randu(src, 0, depth == CV_8U ? 255.0 : 1.0);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::cvtColor(d_src, dst, info.code, info.dcn);
+
+        GPU_SANITY_CHECK(dst, 1e-4);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::cvtColor(src, dst, info.code, info.dcn);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+PERF_TEST_P(Sz_Depth_Code, CvtColorBayer,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U),
+                    Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
+                           CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
+                           CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
+                           CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
+                           // Bayer -> GRAY codes follow; the four above are the Bayer -> BGR codes.
+                           CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY),
+                           CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY),
+                           CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY),
+                           CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const CvtColorInfo info = GET_PARAM(2);
+    // Times cv::gpu::cvtColor (GPU run) or cv::cvtColor (CPU run) with Bayer codes on an info.scn-channel input.
+    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::cvtColor(d_src, dst, info.code, info.dcn);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::cvtColor(src, dst, info.code, info.dcn);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+CV_ENUM(DemosaicingCode,
+        COLOR_BayerBG2BGR, COLOR_BayerGB2BGR, COLOR_BayerRG2BGR, COLOR_BayerGR2BGR,
+        COLOR_BayerBG2GRAY, COLOR_BayerGB2GRAY, COLOR_BayerRG2GRAY, COLOR_BayerGR2GRAY,
+        COLOR_BayerBG2BGR_MHT, COLOR_BayerGB2BGR_MHT, COLOR_BayerRG2BGR_MHT, COLOR_BayerGR2BGR_MHT,
+        COLOR_BayerBG2GRAY_MHT, COLOR_BayerGB2GRAY_MHT, COLOR_BayerRG2GRAY_MHT, COLOR_BayerGR2GRAY_MHT)
+// Test parameters: matrix size and demosaicing code (DemosaicingCode::all() is used below).
+DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode);
+
+PERF_TEST_P(Sz_Code, Demosaicing,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    DemosaicingCode::all()))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int code = GET_PARAM(1);
+    // Times cv::gpu::demosaicing on a CV_8UC1 input; the CPU run uses cv::cvtColor where the code allows it.
+    cv::Mat src(size, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        if (code >= cv::COLOR_COLORCVT_MAX)
+        {
+            FAIL_NO_CPU();
+        }
+        else
+        {
+            cv::Mat dst;
+            // Codes below cv::COLOR_COLORCVT_MAX are handed to cv::cvtColor; the others take the FAIL_NO_CPU() branch above.
+            TEST_CYCLE() cv::cvtColor(src, dst, code);
+
+            CPU_SANITY_CHECK(dst);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// SwapChannels: times cv::gpu::swapChannels on a CV_8UC4 image for each size in GPU_TYPICAL_MAT_SIZES (GPU run only).
+
+PERF_TEST_P(Sz, SwapChannels,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+
+    cv::Mat src(size, CV_8UC4);
+    declare.in(src, WARMUP_RNG);
+    // Channel order handed to swapChannels: positions 0 and 2 are exchanged, 1 and 3 stay.
+    const int dstOrder[] = {2, 1, 0, 3};
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat dst(src);
+        // NOTE(review): dst is the only image argument, so the swap is presumably applied in place on every cycle;
+        TEST_CYCLE() cv::gpu::swapChannels(dst, dstOrder);
+        // if so, the data checked below depends on whether the cycle count was odd or even -- confirm.
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// AlphaComp: times cv::gpu::alphaComp over size, 4-channel type and alpha operation (GPU run only).
+
+CV_ENUM(AlphaOp, ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL, ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL)
+
+DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, AlphaOp);
+// Every AlphaOp value is combined with every size and each of the four 4-channel types.
+PERF_TEST_P(Sz_Type_Op, AlphaComp,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4),
+                    AlphaOp::all()))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int alpha_op = GET_PARAM(2);
+    // Two inputs of identical size and type.
+    cv::Mat img1(size, type);
+    cv::Mat img2(size, type);
+    declare.in(img1, img2, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_img1(img1);
+        const cv::gpu::GpuMat d_img2(img2);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::alphaComp(d_img1, d_img2, dst, alpha_op);
+
+        GPU_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// ImagePyramidBuild: times cv::gpu::ImagePyramid::build with 5 layers over size, depth and channel count (GPU run only).
+
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidBuild,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+    // dstSize (half the input size plus 10 in each dimension) is used only for the layer checked after timing.
+    const int nLayers = 5;
+    const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+
+        cv::gpu::ImagePyramid d_pyr;
+        // Only build() is wrapped by TEST_CYCLE().
+        TEST_CYCLE() d_pyr.build(d_src, nLayers);
+        // One layer is then fetched outside the cycle and used as the sanity-check data.
+        cv::gpu::GpuMat dst;
+        d_pyr.getLayer(dst, dstSize);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// ImagePyramidGetLayer: times cv::gpu::ImagePyramid::getLayer on a pyramid constructed with 3 layers (GPU run only).
+
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F),
+                    GPU_CHANNELS_1_3_4))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+    // Requested layer size: half the input size plus 10 in each dimension.
+    const int nLayers = 3;
+    const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+        // The pyramid is constructed before the cycle, so only getLayer() is wrapped by TEST_CYCLE().
+        cv::gpu::ImagePyramid d_pyr(d_src, nLayers);
+
+        TEST_CYCLE() d_pyr.getLayer(dst, dstSize);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// HoughLines
+// Lexicographic "less than" functors; the Hough tests below pass them to std::sort on GPU results copied into a cv::Mat.
+namespace
+{
+    struct Vec4iComparator
+    {
+        bool operator()(const cv::Vec4i& a, const cv::Vec4i& b) const
+        {
+            if (a[0] != b[0]) return a[0] < b[0];
+            else if (a[1] != b[1]) return a[1] < b[1];
+            else if (a[2] != b[2]) return a[2] < b[2];
+            else return a[3] < b[3]; // first three components equal: the last one decides
+        }
+    };
+    struct Vec3fComparator
+    {
+        bool operator()(const cv::Vec3f& a, const cv::Vec3f& b) const
+        {
+            if (a[0] != b[0]) return a[0] < b[0];
+            else if (a[1] != b[1]) return a[1] < b[1];
+            else return a[2] < b[2]; // first two components equal: the last one decides
+        }
+    };
+    struct Vec2fComparator
+    {
+        bool operator()(const cv::Vec2f& a, const cv::Vec2f& b) const
+        {
+            if (a[0] != b[0]) return a[0] < b[0];
+            else return a[1] < b[1]; // first component equal: the second one decides
+        }
+    };
+}
+
+PERF_TEST_P(Sz, HoughLines,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    declare.time(30.0);
+    // Times cv::gpu::HoughLines (GPU run) or cv::HoughLines (CPU run) on a synthetic line image.
+    const cv::Size size = GetParam();
+
+    const float rho = 1.0f;
+    const float theta = static_cast<float>(CV_PI / 180.0);
+    const int threshold = 300;
+    // Black image with white lines at y = 100, 200, 400 and x = 100, 200, 400; the coordinates do not depend on size.
+    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
+    cv::line(src, cv::Point(0, 100), cv::Point(src.cols, 100), cv::Scalar::all(255), 1);
+    cv::line(src, cv::Point(0, 200), cv::Point(src.cols, 200), cv::Scalar::all(255), 1);
+    cv::line(src, cv::Point(0, 400), cv::Point(src.cols, 400), cv::Scalar::all(255), 1);
+    cv::line(src, cv::Point(100, 0), cv::Point(100, src.rows), cv::Scalar::all(255), 1);
+    cv::line(src, cv::Point(200, 0), cv::Point(200, src.rows), cv::Scalar::all(255), 1);
+    cv::line(src, cv::Point(400, 0), cv::Point(400, src.rows), cv::Scalar::all(255), 1);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_lines;
+        cv::gpu::HoughLinesBuf d_buf;
+
+        TEST_CYCLE() cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
+        // Row 0 of the GPU result is copied into a cv::Mat and sorted before it is checked.
+        cv::Mat gpu_lines(d_lines.row(0));
+        cv::Vec2f* begin = gpu_lines.ptr<cv::Vec2f>(0);
+        cv::Vec2f* end = begin + gpu_lines.cols;
+        std::sort(begin, end, Vec2fComparator());
+        SANITY_CHECK(gpu_lines);
+    }
+    else
+    {
+        std::vector<cv::Vec2f> cpu_lines;
+
+        TEST_CYCLE() cv::HoughLines(src, cpu_lines, rho, theta, threshold);
+
+        SANITY_CHECK(cpu_lines);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// HoughLinesP: times cv::gpu::HoughLinesP (GPU run) or cv::HoughLinesP (CPU run) on the Canny edges of two images.
+
+DEF_PARAM_TEST_1(Image, std::string);
+
+PERF_TEST_P(Image, HoughLinesP,
+            testing::Values("cv/shared/pic5.png", "stitching/a1.png"))
+{
+    declare.time(30.0);
+
+    const std::string fileName = getDataPath(GetParam());
+    // NOTE(review): `threshold` is passed only to the CPU call below; the GPU call here takes no threshold argument.
+    const float rho = 1.0f;
+    const float theta = static_cast<float>(CV_PI / 180.0);
+    const int threshold = 100;
+    const int minLineLenght = 50;
+    const int maxLineGap = 5;
+
+    const cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+    // The Canny output is the input of both the GPU and the CPU call.
+    cv::Mat mask;
+    cv::Canny(image, mask, 50, 100);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat d_lines;
+        cv::gpu::HoughLinesBuf d_buf;
+
+        TEST_CYCLE() cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
+        // The GPU result is copied into a cv::Mat and its first row sorted before it is checked.
+        cv::Mat gpu_lines(d_lines);
+        cv::Vec4i* begin = gpu_lines.ptr<cv::Vec4i>();
+        cv::Vec4i* end = begin + gpu_lines.cols;
+        std::sort(begin, end, Vec4iComparator());
+        SANITY_CHECK(gpu_lines);
+    }
+    else
+    {
+        std::vector<cv::Vec4i> cpu_lines;
+
+        TEST_CYCLE() cv::HoughLinesP(mask, cpu_lines, rho, theta, threshold, minLineLenght, maxLineGap);
+
+        SANITY_CHECK(cpu_lines);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// HoughCircles: times cv::gpu::HoughCircles (GPU run) or cv::HoughCircles (CPU run) on a synthetic circle image.
+
+DEF_PARAM_TEST(Sz_Dp_MinDist, cv::Size, float, float);
+// Parameter order used below: image size, dp, minDist.
+PERF_TEST_P(Sz_Dp_MinDist, HoughCircles,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(1.0f, 2.0f, 4.0f),
+                    Values(1.0f)))
+{
+    declare.time(30.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const float dp = GET_PARAM(1);
+    const float minDist = GET_PARAM(2);
+
+    const int minRadius = 10;
+    const int maxRadius = 30;
+    const int cannyThreshold = 100;
+    const int votesThreshold = 15;
+    // Black image with three white circles (radii 20, 25, 25) drawn with thickness -1.
+    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
+    cv::circle(src, cv::Point(100, 100), 20, cv::Scalar::all(255), -1);
+    cv::circle(src, cv::Point(200, 200), 25, cv::Scalar::all(255), -1);
+    cv::circle(src, cv::Point(200, 100), 25, cv::Scalar::all(255), -1);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_circles;
+        cv::gpu::HoughCirclesBuf d_buf;
+
+        TEST_CYCLE() cv::gpu::HoughCircles(d_src, d_circles, d_buf, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
+        // The GPU result is copied into a cv::Mat and its first row sorted before it is checked.
+        cv::Mat gpu_circles(d_circles);
+        cv::Vec3f* begin = gpu_circles.ptr<cv::Vec3f>(0);
+        cv::Vec3f* end = begin + gpu_circles.cols;
+        std::sort(begin, end, Vec3fComparator());
+        SANITY_CHECK(gpu_circles);
+    }
+    else
+    {
+        std::vector<cv::Vec3f> cpu_circles;
+
+        TEST_CYCLE() cv::HoughCircles(src, cpu_circles, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
+
+        SANITY_CHECK(cpu_circles);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// GeneralizedHough: times GeneralizedHough_GPU::detect (GPU run) or GeneralizedHough::detect (CPU run) on a synthetic scene.
+
+enum { GHT_POSITION = cv::GeneralizedHough::GHT_POSITION,
+       GHT_SCALE    = cv::GeneralizedHough::GHT_SCALE,
+       GHT_ROTATION = cv::GeneralizedHough::GHT_ROTATION
+     };
+
+CV_FLAGS(GHMethod, GHT_POSITION, GHT_SCALE, GHT_ROTATION);
+
+DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size);
+// Four flag combinations are tested; every one of them includes GHT_POSITION.
+PERF_TEST_P(Method_Sz, GeneralizedHough,
+            Combine(Values(GHMethod(GHT_POSITION), GHMethod(GHT_POSITION | GHT_SCALE), GHMethod(GHT_POSITION | GHT_ROTATION), GHMethod(GHT_POSITION | GHT_SCALE | GHT_ROTATION)),
+                    GPU_TYPICAL_MAT_SIZES))
+{
+    declare.time(10);
+
+    const int method = GET_PARAM(0);
+    const cv::Size imageSize = GET_PARAM(1);
+
+    const cv::Mat templ = readImage("cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(templ.empty());
+    // Scene: black canvas with one unmodified copy of the template at (50, 50).
+    cv::Mat image(imageSize, CV_8UC1, cv::Scalar::all(0));
+    templ.copyTo(image(cv::Rect(50, 50, templ.cols, templ.rows)));
+    // Further copies are added below from a fixed-seed RNG: random scale, optional transpose, random position.
+    cv::RNG rng(123456789);
+    const int objCount = rng.uniform(5, 15);
+    for (int i = 0; i < objCount; ++i)
+    {
+        double scale = rng.uniform(0.7, 1.3);
+        bool rotate = 1 == rng.uniform(0, 2);
+
+        cv::Mat obj;
+        cv::resize(templ, obj, cv::Size(), scale, scale);
+        if (rotate)
+            obj = obj.t();
+
+        cv::Point pos;
+        // The upper bounds keep the object rectangle inside the image.
+        pos.x = rng.uniform(0, image.cols - obj.cols);
+        pos.y = rng.uniform(0, image.rows - obj.rows);
+
+        cv::Mat roi = image(cv::Rect(pos, obj.size()));
+        cv::add(roi, obj, roi);
+    }
+    // Edge map and Sobel derivatives are computed once here and fed to both the GPU and the CPU detector.
+    cv::Mat edges;
+    cv::Canny(image, edges, 50, 100);
+
+    cv::Mat dx, dy;
+    cv::Sobel(image, dx, CV_32F, 1, 0);
+    cv::Sobel(image, dy, CV_32F, 0, 1);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_edges(edges);
+        const cv::gpu::GpuMat d_dx(dx);
+        const cv::gpu::GpuMat d_dy(dy);
+        cv::gpu::GpuMat posAndVotes;
+
+        cv::Ptr<cv::gpu::GeneralizedHough_GPU> d_hough = cv::gpu::GeneralizedHough_GPU::create(method);
+        if (method & GHT_ROTATION)
+        {
+            d_hough->set("maxAngle", 90.0);
+            d_hough->set("angleStep", 2.0);
+        }
+
+        d_hough->setTemplate(cv::gpu::GpuMat(templ));
+
+        TEST_CYCLE() d_hough->detect(d_edges, d_dx, d_dy, posAndVotes);
+        // For the check, the result buffer is viewed as a single CV_32FC4 row of posAndVotes.cols entries.
+        const cv::gpu::GpuMat positions(1, posAndVotes.cols, CV_32FC4, posAndVotes.data);
+        GPU_SANITY_CHECK(positions);
+    }
+    else
+    {
+        cv::Mat positions;
+
+        cv::Ptr<cv::GeneralizedHough> hough = cv::GeneralizedHough::create(method);
+        if (method & GHT_ROTATION)
+        {
+            hough->set("maxAngle", 90.0);
+            hough->set("angleStep", 2.0);
+        }
+
+        hough->setTemplate(templ);
+
+        TEST_CYCLE() hough->detect(edges, dx, dy, positions);
+
+        CPU_SANITY_CHECK(positions);
+    }
+}
diff --git a/modules/gpuimgproc/perf/perf_labeling.cpp b/modules/gpuimgproc/perf/perf_labeling.cpp
new file mode 100644
index 0000000000..0484da9d59
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_labeling.cpp
@@ -0,0 +1,195 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace testing;
+using namespace perf;
+
+DEF_PARAM_TEST_1(Image, string);
+// CPU reference labelling: flood-fills 4-connected regions, stepping from pixel a to neighbour b only when 0 <= a - b <= 2.
+struct GreedyLabeling
+{
+    struct dot
+    {
+        int x;
+        int y;
+        static dot make(int i, int j)
+        {
+            dot d; d.x = i; d.y = j;
+            return d;
+        }
+    };
+    // Predicate: true when lo <= a - b <= hi; lo is stored as the negated first constructor argument.
+    struct InInterval
+    {
+        InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
+        const int lo, hi;
+        bool operator() (const unsigned char a, const unsigned char b) const
+        {
+            int d = a - b;
+            return lo <= d && d <= hi;
+        }
+    private:
+        InInterval& operator=(const InInterval&);
+    };
+    // Stores img, creates the -1-filled label image and allocates one stack slot per pixel.
+    GreedyLabeling(cv::Mat img)
+    : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
+    ~GreedyLabeling(){delete[] stack;}
+    // Resets labels to -1, then gives every pixel a component index (0, 1, ...).
+    // labels must be CV_32SC1 and the same size as the image passed to the constructor.
+    void operator() (cv::Mat labels) const
+    {
+        CV_Assert(labels.type() == CV_32SC1 && labels.size() == image.size());
+        labels.setTo(cv::Scalar::all(-1));
+        const InInterval inInt(0, 2);
+        int cc = -1;
+
+        int* dist_labels = reinterpret_cast<int*>(labels.data);
+        const int pitch = static_cast<int>(labels.step1());
+
+        const unsigned char* source = image.data;
+        const int srcPitch = static_cast<int>(image.step1());
+        const int width = image.cols;
+        const int height = image.rows;
+
+        for (int j = 0; j < height; ++j)
+            for (int i = 0; i < width; ++i)
+            {
+                if (dist_labels[j * pitch + i] != -1) continue;
+
+                // Unlabelled seed: start component cc and flood-fill from (i, j) with `stack` as a LIFO.
+                // NOTE(review): a pixel can be pushed by several neighbours before it is popped; the
+                // cols * rows capacity is assumed to be enough -- TODO confirm.
+                dot* top = stack;
+                dot p = dot::make(i, j);
+                cc++;
+                for (;;)
+                {
+                    int* dl = &dist_labels[p.y * pitch + p.x];
+                    const unsigned char* sp = &source[p.y * srcPitch + p.x];
+
+                    dl[0] = cc;
+
+                    //right
+                    if (p.x < (width - 1) && dl[+1] == -1 && inInt(sp[0], sp[+1]))
+                        *top++ = dot::make(p.x + 1, p.y);
+                    //left
+                    if (p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
+                        *top++ = dot::make(p.x - 1, p.y);
+                    //bottom
+                    if (p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+srcPitch]))
+                        *top++ = dot::make(p.x, p.y + 1);
+                    //top
+                    if (p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-srcPitch]))
+                        *top++ = dot::make(p.x, p.y - 1);
+
+                    // Leave before popping from an empty stack; `*--top` there would read stack[-1].
+                    if (top == stack) break;
+                    p = *--top;
+                }
+            }
+    }
+
+    cv::Mat image;
+    cv::Mat _labels;
+    dot* stack;
+private:
+    // Declared but not defined: a copy would make two objects delete[] the same stack.
+    GreedyLabeling(const GreedyLabeling&);
+    GreedyLabeling& operator=(const GreedyLabeling&);
+};
+
+PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
+            Values<string>("gpu/labeling/aloe-disp.png"))
+{
+    declare.time(1.0);
+    // Times cv::gpu::connectivityMask on one grayscale image (GPU run only).
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_image(image);
+        cv::gpu::GpuMat mask;
+        // The two scalars passed are all-0 and all-2.
+        TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+        GPU_SANITY_CHECK(mask);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
+            Values<string>("gpu/labeling/aloe-disp.png"))
+{
+    declare.time(1.0);
+    // Times cv::gpu::labelComponents (GPU run) or the GreedyLabeling functor (CPU run) on one grayscale image.
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_mask;
+        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
+        // The connectivity mask is computed once above, outside TEST_CYCLE(); only labelComponents is inside it.
+        cv::gpu::GpuMat components;
+
+        TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);
+
+        GPU_SANITY_CHECK(components);
+    }
+    else
+    {
+        GreedyLabeling host(image);
+        // The functor writes into its own _labels member, which is then used as the checked result.
+        TEST_CYCLE() host(host._labels);
+
+        cv::Mat components = host._labels;
+        CPU_SANITY_CHECK(components);
+    }
+}
diff --git a/modules/gpuimgproc/perf/perf_main.cpp b/modules/gpuimgproc/perf/perf_main.cpp
new file mode 100644
index 0000000000..6b3bec5f84
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_main.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace perf;
+
+CV_PERF_TEST_MAIN(gpuimgproc, printCudaInfo()) // presumably expands to the perf-test entry point for gpuimgproc; printCudaInfo() is passed as an extra argument -- confirm
diff --git a/modules/gpuimgproc/perf/perf_precomp.cpp b/modules/gpuimgproc/perf/perf_precomp.cpp
new file mode 100644
index 0000000000..81f16e8f14
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_precomp.cpp
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
diff --git a/modules/gpuimgproc/perf/perf_precomp.hpp b/modules/gpuimgproc/perf/perf_precomp.hpp
new file mode 100644
index 0000000000..6ecb958f40
--- /dev/null
+++ b/modules/gpuimgproc/perf/perf_precomp.hpp
@@ -0,0 +1,66 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+// The warning suppressions sit outside the include guard, so they are applied on every inclusion (presumably for the test-framework macros - confirm).
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif // __GNUC__
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/gpu_perf.hpp"
+
+#include "opencv2/gpuimgproc.hpp"
+#include "opencv2/imgproc.hpp"
+
+#include "opencv2/photo.hpp"
+
+#ifdef GTEST_CREATE_SHARED_LIBRARY
+#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
+#endif
+
+#endif // __OPENCV_PERF_PRECOMP_HPP__
diff --git a/modules/gpuimgproc/src/blend.cpp b/modules/gpuimgproc/src/blend.cpp
new file mode 100644
index 0000000000..3fd6507810
--- /dev/null
+++ b/modules/gpuimgproc/src/blend.cpp
@@ -0,0 +1,99 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Built without CUDA (or with CUDA_DISABLER): the entry point still exists but only calls throw_no_cuda().
+void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+
+#else
+// Blending routines that are only declared here; they are defined elsewhere (presumably in the module's CUDA sources - confirm).
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace blend
+    {
+        template <typename T>
+        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
+        // Separate entry point that blendLinear selects for 8-bit images with exactly 4 channels.
+        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
+    }
+}}}
+
+using namespace ::cv::gpu::cudev::blend;
+
+void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
+                          GpuMat& result, Stream& stream)
+{
+    // Both images must agree in size and type; both weight maps must be CV_32F and image-sized.
+    CV_Assert(img1.size() == img2.size());
+    CV_Assert(img1.type() == img2.type());
+    CV_Assert(weights1.size() == img1.size());
+    CV_Assert(weights2.size() == img2.size());
+    CV_Assert(weights1.type() == CV_32F);
+    CV_Assert(weights2.type() == CV_32F);
+
+    const int channels = img1.channels();
+    const int imgDepth = img1.depth();
+    const Size extent = img1.size();
+    // The output takes the size, depth and channel count of the first image.
+    result.create(extent, CV_MAKE_TYPE(imgDepth, channels));
+
+    // 8-bit input has a dedicated 4-channel routine; float input always uses the generic one.
+    if (imgDepth == CV_8U)
+    {
+        cudaStream_t rawStream = StreamAccessor::getStream(stream);
+        if (channels == 4)
+            blendLinearCaller8UC4(extent.height, extent.width, img1, img2, weights1, weights2, result, rawStream);
+        else
+            blendLinearCaller<uchar>(extent.height, extent.width, channels, img1, img2, weights1, weights2, result, rawStream);
+    }
+    else if (imgDepth == CV_32F)
+        blendLinearCaller<float>(extent.height, extent.width, channels, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
+    else
+        CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
+}
+
+#endif
diff --git a/modules/gpuimgproc/src/color.cpp b/modules/gpuimgproc/src/color.cpp
new file mode 100644
index 0000000000..dc35823486
--- /dev/null
+++ b/modules/gpuimgproc/src/color.cpp
@@ -0,0 +1,1989 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Built without CUDA (or with CUDA_DISABLER): every public entry point still exists but only calls throw_no_cuda().
+void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_no_cuda(); }
+void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
+
+#else /* HAVE_CUDA is defined and CUDA_DISABLER is not */
+
+#include "cvt_color_internal.h"
+// Routines only declared here; they are defined elsewhere (presumably in the module's CUDA sources - confirm).
+namespace cv { namespace gpu {
+    namespace cudev
+    {
+        template <int cn>
+        void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template <int cn>
+        void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        // NOTE(review): the name suggests Malvar-He-Cutler demosaicing - confirm against the kernel source.
+        template <int cn>
+        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    }
+}}
+
+using namespace ::cv::gpu::cudev;
+
+namespace
+{
+    typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // element type of the per-depth dispatch tables below
+
+    void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 3-channel 8U/16U/32F src -> same-depth 3-channel dst via cudev bgr_to_rgb_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 3-channel 8U/16U/32F src -> same-depth 4-channel dst via cudev bgr_to_bgra_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 3-channel 8U/16U/32F src -> same-depth 4-channel dst via cudev bgr_to_rgba_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 4-channel 8U/16U/32F src -> same-depth 3-channel dst via cudev bgra_to_bgr_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 4-channel 8U/16U/32F src -> same-depth 3-channel dst via cudev bgra_to_rgb_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 4-channel 8U/16U/32F src -> same-depth 4-channel dst via cudev bgra_to_rgba_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC3 src -> CV_8UC2 dst of the same size via cudev::bgr_to_bgr555; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC3 src -> CV_8UC2 dst of the same size via cudev::bgr_to_bgr565; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC3 src -> CV_8UC2 dst of the same size via cudev::rgb_to_bgr555; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC3 src -> CV_8UC2 dst of the same size via cudev::rgb_to_bgr565; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC4 src -> CV_8UC2 dst of the same size via cudev::bgra_to_bgr555; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC4 src -> CV_8UC2 dst of the same size via cudev::bgra_to_bgr565; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC4 src -> CV_8UC2 dst of the same size via cudev::rgba_to_bgr555; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC4 src -> CV_8UC2 dst of the same size via cudev::rgba_to_bgr565; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC3 dst of the same size via cudev::bgr555_to_rgb; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);
+        cudev::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC3 dst of the same size via cudev::bgr565_to_rgb; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);
+        cudev::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC3 dst of the same size via cudev::bgr555_to_bgr; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);
+        cudev::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC3 dst of the same size via cudev::bgr565_to_bgr; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);
+        cudev::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC4 dst of the same size via cudev::bgr555_to_rgba; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);
+        cudev::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC4 dst of the same size via cudev::bgr565_to_rgba; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);
+        cudev::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC4 dst of the same size via cudev::bgr555_to_bgra; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);
+        cudev::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC4 dst of the same size via cudev::bgr565_to_bgra; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);
+        cudev::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 1-channel 8U/16U/32F src -> same-depth 3-channel dst via cudev gray_to_bgr_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 1-channel 8U/16U/32F src -> same-depth 4-channel dst via cudev gray_to_bgra_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC1 src -> CV_8UC2 dst of the same size via cudev::gray_to_bgr555; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC1 src -> CV_8UC2 dst of the same size via cudev::gray_to_bgr565; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_8UC2);
+        cudev::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC1 dst of the same size via cudev::bgr555_to_gray; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC1);
+        cudev::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // CV_8UC2 src -> CV_8UC1 dst of the same size via cudev::bgr565_to_gray; the unnamed int is ignored.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC1);
+        cudev::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 3-channel 8U/16U/32F src -> same-depth 1-channel dst via cudev rgb_to_gray_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 3-channel 8U/16U/32F src -> same-depth 1-channel dst via cudev bgr_to_gray_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 4-channel 8U/16U/32F src -> same-depth 1-channel dst via cudev rgba_to_gray_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        // 4-channel 8U/16U/32F src -> same-depth 1-channel dst via cudev bgra_to_gray_*; the unnamed int is ignored.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f}; // indexed by src.depth(); null slots are excluded by the assert below
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev rgb[a]_to_yuv[4]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
+                {rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
+            },
+            {   // dcn == 4
+                {rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
+                {rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev bgr[a]_to_yuv[4]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
+                {bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
+            },
+            {   // dcn == 4
+                {bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
+                {bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void yuv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev yuv[4]_to_rgb[a]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
+                {yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
+            },
+            {   // dcn == 4
+                {yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
+                {yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void yuv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev yuv[4]_to_bgr[a]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
+                {yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
+            },
+            {   // dcn == 4
+                {yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
+                {yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev rgb[a]_to_YCrCb[4]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
+                {rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
+            },
+            {   // dcn == 4
+                {rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
+                {rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev bgr[a]_to_YCrCb[4]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
+                {bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
+            },
+            {   // dcn == 4
+                {bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
+                {bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void YCrCb_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // 3- or 4-channel 8U/16U/32F src -> same-depth dst with dcn (3 or 4) channels via the cudev YCrCb[4]_to_rgb[a]_* routines.
+        using namespace cv::gpu::cudev;
+        // Table layout: funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths rejected by the assert below.
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {   // dcn == 3
+                {YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},
+                {YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}
+            },
+            {   // dcn == 4
+                {YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},
+                {YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void YCrCb_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the YCrCb[4]_to_bgr[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},  // 3-channel src
+                {YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},  // 3-channel src
+                {YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_xyz[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},  // 3-channel src
+                {rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},  // 3-channel src
+                {rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_xyz[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},  // 3-channel src
+                {bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},  // 3-channel src
+                {bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void xyz_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the xyz[4]_to_rgb[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},  // 3-channel src
+                {xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},  // 3-channel src
+                {xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void xyz_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the xyz[4]_to_bgr[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},  // 3-channel src
+                {xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},  // 3-channel src
+                {xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_hsv[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {rgb_to_hsv_8u, 0, 0, 0, 0, rgb_to_hsv_32f},  // 3-channel src
+                {rgba_to_hsv_8u, 0, 0, 0, 0, rgba_to_hsv_32f},  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_hsv4_8u, 0, 0, 0, 0, rgb_to_hsv4_32f},  // 3-channel src
+                {rgba_to_hsv4_8u, 0, 0, 0, 0, rgba_to_hsv4_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_hsv[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {bgr_to_hsv_8u, 0, 0, 0, 0, bgr_to_hsv_32f},  // 3-channel src
+                {bgra_to_hsv_8u, 0, 0, 0, 0, bgra_to_hsv_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_hsv4_8u, 0, 0, 0, 0, bgr_to_hsv4_32f},  // 3-channel src
+                {bgra_to_hsv4_8u, 0, 0, 0, 0, bgra_to_hsv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hsv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hsv[4]_to_rgb[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hsv_to_rgb_8u, 0, 0, 0, 0, hsv_to_rgb_32f},  // 3-channel src
+                {hsv4_to_rgb_8u, 0, 0, 0, 0, hsv4_to_rgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hsv_to_rgba_8u, 0, 0, 0, 0, hsv_to_rgba_32f},  // 3-channel src
+                {hsv4_to_rgba_8u, 0, 0, 0, 0, hsv4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hsv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hsv[4]_to_bgr[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hsv_to_bgr_8u, 0, 0, 0, 0, hsv_to_bgr_32f},  // 3-channel src
+                {hsv4_to_bgr_8u, 0, 0, 0, 0, hsv4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hsv_to_bgra_8u, 0, 0, 0, 0, hsv_to_bgra_32f},  // 3-channel src
+                {hsv4_to_bgra_8u, 0, 0, 0, 0, hsv4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_hls[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {rgb_to_hls_8u, 0, 0, 0, 0, rgb_to_hls_32f},  // 3-channel src
+                {rgba_to_hls_8u, 0, 0, 0, 0, rgba_to_hls_32f},  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_hls4_8u, 0, 0, 0, 0, rgb_to_hls4_32f},  // 3-channel src
+                {rgba_to_hls4_8u, 0, 0, 0, 0, rgba_to_hls4_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_hls[4]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {bgr_to_hls_8u, 0, 0, 0, 0, bgr_to_hls_32f},  // 3-channel src
+                {bgra_to_hls_8u, 0, 0, 0, 0, bgra_to_hls_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_hls4_8u, 0, 0, 0, 0, bgr_to_hls4_32f},  // 3-channel src
+                {bgra_to_hls4_8u, 0, 0, 0, 0, bgra_to_hls4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hls_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hls[4]_to_rgb[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hls_to_rgb_8u, 0, 0, 0, 0, hls_to_rgb_32f},  // 3-channel src
+                {hls4_to_rgb_8u, 0, 0, 0, 0, hls4_to_rgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hls_to_rgba_8u, 0, 0, 0, 0, hls_to_rgba_32f},  // 3-channel src
+                {hls4_to_rgba_8u, 0, 0, 0, 0, hls4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hls_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hls[4]_to_bgr[a]_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hls_to_bgr_8u, 0, 0, 0, 0, hls_to_bgr_32f},  // 3-channel src
+                {hls4_to_bgr_8u, 0, 0, 0, 0, hls4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hls_to_bgra_8u, 0, 0, 0, 0, hls_to_bgra_32f},  // 3-channel src
+                {hls4_to_bgra_8u, 0, 0, 0, 0, hls4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_hsv[4]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {rgb_to_hsv_full_8u, 0, 0, 0, 0, rgb_to_hsv_full_32f},  // 3-channel src
+                {rgba_to_hsv_full_8u, 0, 0, 0, 0, rgba_to_hsv_full_32f},  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_hsv4_full_8u, 0, 0, 0, 0, rgb_to_hsv4_full_32f},  // 3-channel src
+                {rgba_to_hsv4_full_8u, 0, 0, 0, 0, rgba_to_hsv4_full_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_hsv[4]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {bgr_to_hsv_full_8u, 0, 0, 0, 0, bgr_to_hsv_full_32f},  // 3-channel src
+                {bgra_to_hsv_full_8u, 0, 0, 0, 0, bgra_to_hsv_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_hsv4_full_8u, 0, 0, 0, 0, bgr_to_hsv4_full_32f},  // 3-channel src
+                {bgra_to_hsv4_full_8u, 0, 0, 0, 0, bgra_to_hsv4_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hsv_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hsv[4]_to_rgb[a]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hsv_to_rgb_full_8u, 0, 0, 0, 0, hsv_to_rgb_full_32f},  // 3-channel src
+                {hsv4_to_rgb_full_8u, 0, 0, 0, 0, hsv4_to_rgb_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hsv_to_rgba_full_8u, 0, 0, 0, 0, hsv_to_rgba_full_32f},  // 3-channel src
+                {hsv4_to_rgba_full_8u, 0, 0, 0, 0, hsv4_to_rgba_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hsv_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hsv[4]_to_bgr[a]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hsv_to_bgr_full_8u, 0, 0, 0, 0, hsv_to_bgr_full_32f},  // 3-channel src
+                {hsv4_to_bgr_full_8u, 0, 0, 0, 0, hsv4_to_bgr_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hsv_to_bgra_full_8u, 0, 0, 0, 0, hsv_to_bgra_full_32f},  // 3-channel src
+                {hsv4_to_bgra_full_8u, 0, 0, 0, 0, hsv4_to_bgra_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_hls[4]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {rgb_to_hls_full_8u, 0, 0, 0, 0, rgb_to_hls_full_32f},  // 3-channel src
+                {rgba_to_hls_full_8u, 0, 0, 0, 0, rgba_to_hls_full_32f},  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_hls4_full_8u, 0, 0, 0, 0, rgb_to_hls4_full_32f},  // 3-channel src
+                {rgba_to_hls4_full_8u, 0, 0, 0, 0, rgba_to_hls4_full_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_hls[4]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {bgr_to_hls_full_8u, 0, 0, 0, 0, bgr_to_hls_full_32f},  // 3-channel src
+                {bgra_to_hls_full_8u, 0, 0, 0, 0, bgra_to_hls_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_hls4_full_8u, 0, 0, 0, 0, bgr_to_hls4_full_32f},  // 3-channel src
+                {bgra_to_hls4_full_8u, 0, 0, 0, 0, bgra_to_hls4_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hls_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hls[4]_to_rgb[a]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hls_to_rgb_full_8u, 0, 0, 0, 0, hls_to_rgb_full_32f},  // 3-channel src
+                {hls4_to_rgb_full_8u, 0, 0, 0, 0, hls4_to_rgb_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hls_to_rgba_full_8u, 0, 0, 0, 0, hls_to_rgba_full_32f},  // 3-channel src
+                {hls4_to_rgba_full_8u, 0, 0, 0, 0, hls4_to_rgba_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void hls_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the hls[4]_to_bgr[a]_full_* entry selected by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][6] =  // indexed [dcn == 4][src.channels() == 4][src.depth()]; 0 = no function for that slot
+        {
+            {  // dcn == 3
+                {hls_to_bgr_full_8u, 0, 0, 0, 0, hls_to_bgr_full_32f},  // 3-channel src
+                {hls4_to_bgr_full_8u, 0, 0, 0, 0, hls4_to_bgr_full_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {hls_to_bgra_full_8u, 0, 0, 0, 0, hls_to_bgra_full_32f},  // 3-channel src
+                {hls4_to_bgra_full_8u, 0, 0, 0, 0, hls4_to_bgra_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // NOTE(review): presumably exactly the depth indices with non-zero slots above - confirm
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_lab[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {bgr_to_lab_8u, bgr_to_lab_32f},  // 3-channel src
+                {bgra_to_lab_8u, bgra_to_lab_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_lab4_8u, bgr_to_lab4_32f},  // 3-channel src
+                {bgra_to_lab4_8u, bgra_to_lab4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_lab[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {rgb_to_lab_8u, rgb_to_lab_32f},  // 3-channel src
+                {rgba_to_lab_8u, rgba_to_lab_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_lab4_8u, rgb_to_lab4_32f},  // 3-channel src
+                {rgba_to_lab4_8u, rgba_to_lab4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lbgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lbgr[a]_to_lab[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lbgr_to_lab_8u, lbgr_to_lab_32f},  // 3-channel src
+                {lbgra_to_lab_8u, lbgra_to_lab_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lbgr_to_lab4_8u, lbgr_to_lab4_32f},  // 3-channel src
+                {lbgra_to_lab4_8u, lbgra_to_lab4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lrgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lrgb[a]_to_lab[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lrgb_to_lab_8u, lrgb_to_lab_32f},  // 3-channel src
+                {lrgba_to_lab_8u, lrgba_to_lab_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lrgb_to_lab4_8u, lrgb_to_lab4_32f},  // 3-channel src
+                {lrgba_to_lab4_8u, lrgba_to_lab4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lab[4]_to_bgr[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lab_to_bgr_8u, lab_to_bgr_32f},  // 3-channel src
+                {lab4_to_bgr_8u, lab4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lab_to_bgra_8u, lab_to_bgra_32f},  // 3-channel src
+                {lab4_to_bgra_8u, lab4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lab[4]_to_rgb[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lab_to_rgb_8u, lab_to_rgb_32f},  // 3-channel src
+                {lab4_to_rgb_8u, lab4_to_rgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lab_to_rgba_8u, lab_to_rgba_32f},  // 3-channel src
+                {lab4_to_rgba_8u, lab4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lab_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lab[4]_to_lbgr[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lab_to_lbgr_8u, lab_to_lbgr_32f},  // 3-channel src
+                {lab4_to_lbgr_8u, lab4_to_lbgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lab_to_lbgra_8u, lab_to_lbgra_32f},  // 3-channel src
+                {lab4_to_lbgra_8u, lab4_to_lbgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lab_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lab[4]_to_lrgb[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lab_to_lrgb_8u, lab_to_lrgb_32f},  // 3-channel src
+                {lab4_to_lrgb_8u, lab4_to_lrgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lab_to_lrgba_8u, lab_to_lrgba_32f},  // 3-channel src
+                {lab4_to_lrgba_8u, lab4_to_lrgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the bgr[a]_to_luv[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {bgr_to_luv_8u, bgr_to_luv_32f},  // 3-channel src
+                {bgra_to_luv_8u, bgra_to_luv_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {bgr_to_luv4_8u, bgr_to_luv4_32f},  // 3-channel src
+                {bgra_to_luv4_8u, bgra_to_luv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the rgb[a]_to_luv[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {rgb_to_luv_8u, rgb_to_luv_32f},  // 3-channel src
+                {rgba_to_luv_8u, rgba_to_luv_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {rgb_to_luv4_8u, rgb_to_luv4_32f},  // 3-channel src
+                {rgba_to_luv4_8u, rgba_to_luv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lbgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lbgr[a]_to_luv[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lbgr_to_luv_8u, lbgr_to_luv_32f},  // 3-channel src
+                {lbgra_to_luv_8u, lbgra_to_luv_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lbgr_to_luv4_8u, lbgr_to_luv4_32f},  // 3-channel src
+                {lbgra_to_luv4_8u, lbgra_to_luv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void lrgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the lrgb[a]_to_luv[4]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {lrgb_to_luv_8u, lrgb_to_luv_32f},  // 3-channel src
+                {lrgba_to_luv_8u, lrgba_to_luv_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {lrgb_to_luv4_8u, lrgb_to_luv4_32f},  // 3-channel src
+                {lrgba_to_luv4_8u, lrgba_to_luv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the luv[4]_to_bgr[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {luv_to_bgr_8u, luv_to_bgr_32f},  // 3-channel src
+                {luv4_to_bgr_8u, luv4_to_bgr_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {luv_to_bgra_8u, luv_to_bgra_32f},  // 3-channel src
+                {luv4_to_bgra_8u, luv4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {  // Checks src and dcn, creates dst, then calls the luv[4]_to_rgb[a]_* entry selected by dcn, src.channels() and whether src.depth() is CV_32F.
+        using namespace cv::gpu::cudev;
+        static const gpu_func_t funcs[2][2][2] =  // indexed [dcn == 4][src.channels() == 4][src.depth() == CV_32F]
+        {
+            {  // dcn == 3
+                {luv_to_rgb_8u, luv_to_rgb_32f},  // 3-channel src
+                {luv4_to_rgb_8u, luv4_to_rgb_32f}  // 4-channel src
+            },
+            {  // dcn == 4
+                {luv_to_rgba_8u, luv_to_rgba_32f},  // 3-channel src
+                {luv4_to_rgba_8u, luv4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // non-positive dcn selects the default of 3 destination channels
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // so the last table index is 0 for CV_8U and 1 for CV_32F
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));  // dst takes src's size and depth with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));  // invoke the selected entry
+    }
+
+    void luv_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // Luv -> LBGR/LBGRA: validates src, allocates dst and runs the matching kernel.
+        using namespace cv::gpu::cudev;
+
+        // Kernels indexed as [dst has 4 channels][src has 4 channels][depth is 32F].
+        static const gpu_func_t kernels[2][2][2] =
+        {
+            { {luv_to_lbgr_8u,  luv_to_lbgr_32f},  {luv4_to_lbgr_8u,  luv4_to_lbgr_32f}  },
+            { {luv_to_lbgra_8u, luv_to_lbgra_32f}, {luv4_to_lbgra_8u, luv4_to_lbgra_32f} }
+        };
+
+        if (dcn <= 0)
+            dcn = 3; // a non-positive dcn selects the 3-channel output
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
+        const bool dstIs4ch = (dcn == 4);
+        const bool srcIs4ch = (src.channels() == 4);
+        const bool depthIs32f = (src.depth() == CV_32F);
+        kernels[dstIs4ch][srcIs4ch][depthIs32f](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void luv_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // Luv -> LRGB/LRGBA: validates src, allocates dst and runs the matching kernel.
+        using namespace cv::gpu::cudev;
+
+        // Kernels indexed as [dst has 4 channels][src has 4 channels][depth is 32F].
+        static const gpu_func_t kernels[2][2][2] =
+        {
+            { {luv_to_lrgb_8u,  luv_to_lrgb_32f},  {luv4_to_lrgb_8u,  luv4_to_lrgb_32f}  },
+            { {luv_to_lrgba_8u, luv_to_lrgba_32f}, {luv4_to_lrgba_8u, luv4_to_lrgba_32f} }
+        };
+
+        if (dcn <= 0)
+            dcn = 3; // a non-positive dcn selects the 3-channel output
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
+        const bool dstIs4ch = (dcn == 4);
+        const bool srcIs4ch = (src.channels() == 4);
+        const bool depthIs32f = (src.depth() == CV_32F);
+        kernels[dstIs4ch][srcIs4ch][depthIs32f](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& st)
+    {
+    #if (CUDA_VERSION < 5000) // older toolkits: the conversion is reported as unsupported
+        (void)src;
+        (void)dst;
+        (void)st;
+        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+    #else
+        CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4);
+        // Alpha premultiplication through NPP; dst gets the same size and type as src.
+        dst.create(src.size(), src.type());
+
+        cudaStream_t stream = StreamAccessor::getStream(st);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+        // Pick the NPP variant that matches the element depth (8-bit or 16-bit).
+        if (src.depth() == CV_8U)
+            nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        else
+            nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr<Npp16u>(), static_cast<int>(src.step), dst.ptr<Npp16u>(), static_cast<int>(dst.step), oSizeROI) );
+        // On the default stream, wait for the device before returning.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    #endif
+    }
+
+    void bayer_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, bool blue_last, bool start_with_green, Stream& stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        // Indexed as [src.depth()][dcn - 1]; slots that have no kernel are null.
+        static const func_t kernels[3][4] =
+        {
+            {0, 0, Bayer2BGR_8u_gpu<3>,  Bayer2BGR_8u_gpu<4>},
+            {0, 0, 0,                    0},
+            {0, 0, Bayer2BGR_16u_gpu<3>, Bayer2BGR_16u_gpu<4>}
+        };
+
+        if (dcn <= 0)
+            dcn = 3; // a non-positive dcn selects the 3-channel output
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
+        CV_Assert(src.rows > 2 && src.cols > 2);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+        kernels[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
+    }
+    void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // cvtColor table entry for CV_BayerBG2BGR
+        bayer_to_bgr(src, dst, dcn, /*blue_last=*/ false, /*start_with_green=*/ false, stream);
+    }
+    void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // cvtColor table entry for CV_BayerGB2BGR
+        bayer_to_bgr(src, dst, dcn, /*blue_last=*/ false, /*start_with_green=*/ true, stream);
+    }
+    void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // cvtColor table entry for CV_BayerRG2BGR
+        bayer_to_bgr(src, dst, dcn, /*blue_last=*/ true, /*start_with_green=*/ false, stream);
+    }
+    void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // cvtColor table entry for CV_BayerGR2BGR
+        bayer_to_bgr(src, dst, dcn, /*blue_last=*/ true, /*start_with_green=*/ true, stream);
+    }
+
+    void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+
+        // Single-channel demosaicing kernels indexed by src.depth(); the middle slot has no kernel.
+        static const func_t kernels[3] = { Bayer2BGR_8u_gpu<1>, 0, Bayer2BGR_16u_gpu<1> };
+
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
+        CV_Assert(src.rows > 2 && src.cols > 2);
+
+        // The output keeps the input size and depth and always has one channel.
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
+
+        const func_t kernel = kernels[src.depth()];
+        cudaStream_t cuStream = StreamAccessor::getStream(stream);
+        kernel(src, dst, blue_last, start_with_green, cuStream);
+    }
+    void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
+    {   // cvtColor table entry for CV_BayerBG2GRAY; the dcn argument is not used
+        bayer_to_gray(src, dst, /*blue_last=*/ false, /*start_with_green=*/ false, stream);
+    }
+    void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
+    {   // cvtColor table entry for CV_BayerGB2GRAY; the dcn argument is not used
+        bayer_to_gray(src, dst, /*blue_last=*/ false, /*start_with_green=*/ true, stream);
+    }
+    void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
+    {   // cvtColor table entry for CV_BayerRG2GRAY; the dcn argument is not used
+        bayer_to_gray(src, dst, /*blue_last=*/ true, /*start_with_green=*/ false, stream);
+    }
+    void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
+    {   // cvtColor table entry for CV_BayerGR2GRAY; the dcn argument is not used
+        bayer_to_gray(src, dst, /*blue_last=*/ true, /*start_with_green=*/ true, stream);
+    }
+}
+
+void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
+{   // Dispatches to the per-code conversion routine through a table indexed by `code`.
+    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream);
+    static const func_t funcs[] =
+    {
+        bgr_to_bgra,            // CV_BGR2BGRA    =0
+        bgra_to_bgr,            // CV_BGRA2BGR    =1
+        bgr_to_rgba,            // CV_BGR2RGBA    =2
+        bgra_to_rgb,            // CV_RGBA2BGR    =3
+        bgr_to_rgb,             // CV_BGR2RGB     =4
+        bgra_to_rgba,           // CV_BGRA2RGBA   =5
+
+        bgr_to_gray,            // CV_BGR2GRAY    =6
+        rgb_to_gray,            // CV_RGB2GRAY    =7
+        gray_to_bgr,            // CV_GRAY2BGR    =8
+        gray_to_bgra,           // CV_GRAY2BGRA   =9
+        bgra_to_gray,           // CV_BGRA2GRAY   =10
+        rgba_to_gray,           // CV_RGBA2GRAY   =11
+
+        bgr_to_bgr565,          // CV_BGR2BGR565  =12
+        rgb_to_bgr565,          // CV_RGB2BGR565  =13
+        bgr565_to_bgr,          // CV_BGR5652BGR  =14
+        bgr565_to_rgb,          // CV_BGR5652RGB  =15
+        bgra_to_bgr565,         // CV_BGRA2BGR565 =16
+        rgba_to_bgr565,         // CV_RGBA2BGR565 =17
+        bgr565_to_bgra,         // CV_BGR5652BGRA =18
+        bgr565_to_rgba,         // CV_BGR5652RGBA =19
+
+        gray_to_bgr565,         // CV_GRAY2BGR565 =20
+        bgr565_to_gray,         // CV_BGR5652GRAY =21
+
+        bgr_to_bgr555,          // CV_BGR2BGR555  =22
+        rgb_to_bgr555,          // CV_RGB2BGR555  =23
+        bgr555_to_bgr,          // CV_BGR5552BGR  =24
+        bgr555_to_rgb,          // CV_BGR5552RGB  =25
+        bgra_to_bgr555,         // CV_BGRA2BGR555 =26
+        rgba_to_bgr555,         // CV_RGBA2BGR555 =27
+        bgr555_to_bgra,         // CV_BGR5552BGRA =28
+        bgr555_to_rgba,         // CV_BGR5552RGBA =29
+
+        gray_to_bgr555,         // CV_GRAY2BGR555 =30
+        bgr555_to_gray,         // CV_BGR5552GRAY =31
+
+        bgr_to_xyz,             // CV_BGR2XYZ     =32
+        rgb_to_xyz,             // CV_RGB2XYZ     =33
+        xyz_to_bgr,             // CV_XYZ2BGR     =34
+        xyz_to_rgb,             // CV_XYZ2RGB     =35
+
+        bgr_to_YCrCb,           // CV_BGR2YCrCb   =36
+        rgb_to_YCrCb,           // CV_RGB2YCrCb   =37
+        YCrCb_to_bgr,           // CV_YCrCb2BGR   =38
+        YCrCb_to_rgb,           // CV_YCrCb2RGB   =39
+
+        bgr_to_hsv,             // CV_BGR2HSV     =40
+        rgb_to_hsv,             // CV_RGB2HSV     =41
+
+        0,                      //                =42
+        0,                      //                =43
+
+        bgr_to_lab,             // CV_BGR2Lab     =44
+        rgb_to_lab,             // CV_RGB2Lab     =45
+
+        bayerBG_to_bgr,         // CV_BayerBG2BGR =46
+        bayerGB_to_bgr,         // CV_BayerGB2BGR =47
+        bayerRG_to_bgr,         // CV_BayerRG2BGR =48
+        bayerGR_to_bgr,         // CV_BayerGR2BGR =49
+
+        bgr_to_luv,             // CV_BGR2Luv     =50
+        rgb_to_luv,             // CV_RGB2Luv     =51
+
+        bgr_to_hls,             // CV_BGR2HLS     =52
+        rgb_to_hls,             // CV_RGB2HLS     =53
+
+        hsv_to_bgr,             // CV_HSV2BGR     =54
+        hsv_to_rgb,             // CV_HSV2RGB     =55
+
+        lab_to_bgr,             // CV_Lab2BGR     =56
+        lab_to_rgb,             // CV_Lab2RGB     =57
+        luv_to_bgr,             // CV_Luv2BGR     =58
+        luv_to_rgb,             // CV_Luv2RGB     =59
+
+        hls_to_bgr,             // CV_HLS2BGR     =60
+        hls_to_rgb,             // CV_HLS2RGB     =61
+
+        0,                      // CV_BayerBG2BGR_VNG =62
+        0,                      // CV_BayerGB2BGR_VNG =63
+        0,                      // CV_BayerRG2BGR_VNG =64
+        0,                      // CV_BayerGR2BGR_VNG =65
+
+        bgr_to_hsv_full,        // CV_BGR2HSV_FULL = 66
+        rgb_to_hsv_full,        // CV_RGB2HSV_FULL = 67
+        bgr_to_hls_full,        // CV_BGR2HLS_FULL = 68
+        rgb_to_hls_full,        // CV_RGB2HLS_FULL = 69
+
+        hsv_to_bgr_full,        // CV_HSV2BGR_FULL = 70
+        hsv_to_rgb_full,        // CV_HSV2RGB_FULL = 71
+        hls_to_bgr_full,        // CV_HLS2BGR_FULL = 72
+        hls_to_rgb_full,        // CV_HLS2RGB_FULL = 73
+
+        lbgr_to_lab,            // CV_LBGR2Lab     = 74
+        lrgb_to_lab,            // CV_LRGB2Lab     = 75
+        lbgr_to_luv,            // CV_LBGR2Luv     = 76
+        lrgb_to_luv,            // CV_LRGB2Luv     = 77
+
+        lab_to_lbgr,            // CV_Lab2LBGR     = 78
+        lab_to_lrgb,            // CV_Lab2LRGB     = 79
+        luv_to_lbgr,            // CV_Luv2LBGR     = 80
+        luv_to_lrgb,            // CV_Luv2LRGB     = 81
+
+        bgr_to_yuv,             // CV_BGR2YUV      = 82
+        rgb_to_yuv,             // CV_RGB2YUV      = 83
+        yuv_to_bgr,             // CV_YUV2BGR      = 84
+        yuv_to_rgb,             // CV_YUV2RGB      = 85
+
+        bayerBG_to_gray,        // CV_BayerBG2GRAY = 86
+        bayerGB_to_gray,        // CV_BayerGB2GRAY = 87
+        bayerRG_to_gray,        // CV_BayerRG2GRAY = 88
+        bayerGR_to_gray,        // CV_BayerGR2GRAY = 89
+
+        //YUV 4:2:0 formats family
+        0,                      // CV_YUV2RGB_NV12 = 90,
+        0,                      // CV_YUV2BGR_NV12 = 91,
+        0,                      // CV_YUV2RGB_NV21 = 92,
+        0,                      // CV_YUV2BGR_NV21 = 93,
+
+        0,                      // CV_YUV2RGBA_NV12 = 94,
+        0,                      // CV_YUV2BGRA_NV12 = 95,
+        0,                      // CV_YUV2RGBA_NV21 = 96,
+        0,                      // CV_YUV2BGRA_NV21 = 97,
+
+        0,                      // CV_YUV2RGB_YV12 = 98,
+        0,                      // CV_YUV2BGR_YV12 = 99,
+        0,                      // CV_YUV2RGB_IYUV = 100,
+        0,                      // CV_YUV2BGR_IYUV = 101,
+
+        0,                      // CV_YUV2RGBA_YV12 = 102,
+        0,                      // CV_YUV2BGRA_YV12 = 103,
+        0,                      // CV_YUV2RGBA_IYUV = 104,
+        0,                      // CV_YUV2BGRA_IYUV = 105,
+
+        0,                      // CV_YUV2GRAY_420 = 106,
+
+        //YUV 4:2:2 formats family
+        0,                      // CV_YUV2RGB_UYVY = 107,
+        0,                      // CV_YUV2BGR_UYVY = 108,
+        0,                      // //CV_YUV2RGB_VYUY = 109,
+        0,                      // //CV_YUV2BGR_VYUY = 110,
+
+        0,                      // CV_YUV2RGBA_UYVY = 111,
+        0,                      // CV_YUV2BGRA_UYVY = 112,
+        0,                      // //CV_YUV2RGBA_VYUY = 113,
+        0,                      // //CV_YUV2BGRA_VYUY = 114,
+
+        0,                      // CV_YUV2RGB_YUY2 = 115,
+        0,                      // CV_YUV2BGR_YUY2 = 116,
+        0,                      // CV_YUV2RGB_YVYU = 117,
+        0,                      // CV_YUV2BGR_YVYU = 118,
+
+        0,                      // CV_YUV2RGBA_YUY2 = 119,
+        0,                      // CV_YUV2BGRA_YUY2 = 120,
+        0,                      // CV_YUV2RGBA_YVYU = 121,
+        0,                      // CV_YUV2BGRA_YVYU = 122,
+
+        0,                      // CV_YUV2GRAY_UYVY = 123,
+        0,                      // CV_YUV2GRAY_YUY2 = 124,
+
+        // alpha premultiplication
+        rgba_to_mbgra,          // CV_RGBA2mRGBA = 125,
+        0,                      // CV_mRGBA2RGBA = 126,
+
+        0,                      // CV_COLORCVT_MAX  = 127
+    };
+    // `code` indexes funcs[] directly, so negative values must be rejected too.
+    CV_Assert(code >= 0 && code < static_cast<int>(sizeof(funcs) / sizeof(funcs[0])));
+
+    func_t func = funcs[code];
+    // A null slot marks a conversion code that has no GPU implementation.
+    if (func == 0)
+        CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" );
+
+    func(src, dst, dcn, stream);
+}
+
+void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
+{
+    const int depth = src.depth();
+    // Demosaics a single-channel Bayer image; `code` selects pattern, output format and algorithm.
+    CV_Assert( src.channels() == 1 );
+    // NOTE(review): blue_last is true for BG/GB below but false in bayerBG_to_*/bayerGB_to_* - confirm which is intended.
+    switch (code)
+    {
+    case cv::COLOR_BayerBG2GRAY: case cv::COLOR_BayerGB2GRAY: case cv::COLOR_BayerRG2GRAY: case cv::COLOR_BayerGR2GRAY:
+        bayer_to_gray(src, dst, code == cv::COLOR_BayerBG2GRAY || code == cv::COLOR_BayerGB2GRAY, code == cv::COLOR_BayerGB2GRAY || code == cv::COLOR_BayerGR2GRAY, stream);
+        break;
+
+    case cv::COLOR_BayerBG2BGR: case cv::COLOR_BayerGB2BGR: case cv::COLOR_BayerRG2BGR: case cv::COLOR_BayerGR2BGR:
+        bayer_to_bgr(src, dst, dcn, code == cv::COLOR_BayerBG2BGR || code == cv::COLOR_BayerGB2BGR, code == cv::COLOR_BayerGB2BGR || code == cv::COLOR_BayerGR2BGR, stream);
+        break;
+
+    case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT:
+    {
+        if (dcn <= 0)
+            dcn = 3;
+
+        CV_Assert( depth == CV_8U );
+        CV_Assert( dcn == 3 || dcn == 4 );
+
+        dst.create(src.size(), CV_MAKETYPE(depth, dcn));
+        dst.setTo(Scalar::all(0));
+        // Hand the kernel the whole parent buffer plus the ROI offset within it.
+        Size wholeSize;
+        Point ofs;
+        src.locateROI(wholeSize, ofs);
+        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+
+        const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
+                                        code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
+
+        if (dcn == 3)
+            cudev::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+        else
+            cudev::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+
+        break;
+    }
+
+    case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT:
+    {
+        CV_Assert( depth == CV_8U );
+
+        dst.create(src.size(), CV_MAKETYPE(depth, 1));
+        dst.setTo(Scalar::all(0));
+
+        Size wholeSize;
+        Point ofs;
+        src.locateROI(wholeSize, ofs);
+        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+        // Compare against the *_GRAY_MHT codes this case receives (the *_BGR_MHT codes never match here).
+        const int2 firstRed = make_int2(code == COLOR_BayerRG2GRAY_MHT || code == COLOR_BayerGB2GRAY_MHT ? 0 : 1,
+                                        code == COLOR_BayerRG2GRAY_MHT || code == COLOR_BayerGR2GRAY_MHT ? 0 : 1);
+
+        cudev::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+
+        break;
+    }
+
+    default:
+        CV_Error( cv::Error::StsBadFlag, "Unknown / unsupported color conversion code" );
+    }
+}
+
+void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
+{
+    // Reorders the channels of an 8-bit 4-channel image in place through NPP.
+    CV_Assert(image.type() == CV_8UC4);
+    cudaStream_t cuStream = StreamAccessor::getStream(s);
+    NppStreamHandler nppStream(cuStream);
+
+    NppiSize roi;
+    roi.width  = image.cols;
+    roi.height = image.rows;
+
+    const int stepBytes = static_cast<int>(image.step);
+    nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr<Npp8u>(), stepBytes, roi, dstOrder) );
+
+    if (cuStream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+}
+
+void cv::gpu::gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward, Stream& stream)
+{
+#if (CUDA_VERSION < 5000)
+    (void)src;
+    (void)dst;
+    (void)forward;
+    (void)stream;
+    CV_Error( cv::Error::StsNotImplemented, "This function works only with CUDA 5.0 or higher" );
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI);
+    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
+    // Gamma correction through NPP. Tables are indexed as [forward][channel count]; only 3 and 4 channels have entries.
+    static const func_t funcs[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3R, nppiGammaInv_8u_AC4R},
+        {0, 0, 0, nppiGammaFwd_8u_C3R, nppiGammaFwd_8u_AC4R}
+    };
+    static const func_inplace_t funcs_inplace[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3IR, nppiGammaInv_8u_AC4IR},
+        {0, 0, 0, nppiGammaFwd_8u_C3IR, nppiGammaFwd_8u_AC4IR}
+    };
+
+    CV_Assert(src.type() == CV_8UC3 || src.type() == CV_8UC4);
+
+    dst.create(src.size(), src.type());
+    cudaStream_t cuStream = StreamAccessor::getStream(stream);
+    NppStreamHandler h(cuStream);
+
+    NppiSize oSizeROI;
+    oSizeROI.width = src.cols;
+    oSizeROI.height = src.rows;
+    // Check the NPP status instead of dropping it; use the in-place variant when dst aliases src.
+    if (dst.data == src.data)
+        nppSafeCall( funcs_inplace[forward][src.channels()](dst.ptr<Npp8u>(), static_cast<int>(src.step), oSizeROI) );
+    else
+        nppSafeCall( funcs[forward][src.channels()](src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+    if (cuStream == 0) cudaSafeCall( cudaDeviceSynchronize() ); // block on the default stream, as the other NPP wrappers in this file do
+#endif
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/src/cuda/bilateral_filter.cu b/modules/gpuimgproc/src/cuda/bilateral_filter.cu
new file mode 100644
index 0000000000..4449274548
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/bilateral_filter.cu
@@ -0,0 +1,199 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+
+using namespace cv::gpu;
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+//////////////////////////////////////////////////////////////////////////////////
+/// Bilateral filtering
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        __device__ __forceinline__ float norm_l1(const float& a)  { return ::fabs(a); } // scalar: absolute value
+        __device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); } // sum of absolute components
+        __device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); } // sum of absolute components
+        __device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); } // sum of absolute components
+
+        __device__ __forceinline__ float sqr(const float& a)  { return a * a; } // square of a
+
+        template<typename T, typename B>
+        __global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
+            // Bilateral filter: each thread computes one output pixel as a weighted mean of its neighbourhood.
+            int x = threadIdx.x + blockIdx.x * blockDim.x;
+            int y = threadIdx.y + blockIdx.y * blockDim.y;
+            // Threads that fall outside the image have nothing to do.
+            if (x >= src.cols || y >= src.rows)
+                return;
+
+            value_type center = saturate_cast<value_type>(src(y, x));
+            // sum1 accumulates weight * value, sum2 accumulates the weights; both are kept in float.
+            value_type sum1 = VecTraits<value_type>::all(0);
+            float sum2 = 0;
+
+            int r = ksz / 2;
+            float r2 = (float)(r * r);
+            // The scanned window is [x - r, tx) x [y - r, ty), i.e. ksz samples along each axis.
+            int tx = x - r + ksz;
+            int ty = y - r + ksz;
+            // Window inside the image: read src directly. Otherwise fetch samples through the border object b.
+            if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
+            {
+                for (int cy = y - r; cy < ty; ++cy)
+                    for (int cx = x - r; cx < tx; ++cx)
+                    {
+                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
+                        if (space2 > r2)
+                            continue; // sample lies farther than r from the centre
+
+                        value_type value = saturate_cast<value_type>(src(cy, cx));
+                        // Exponent = squared distance * spatial factor + squared L1 colour difference * colour factor.
+                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
+                        sum1 = sum1 + weight * value;
+                        sum2 = sum2 + weight;
+                    }
+            }
+            else
+            {
+                for (int cy = y - r; cy < ty; ++cy)
+                    for (int cx = x - r; cx < tx; ++cx)
+                    {
+                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
+                        if (space2 > r2)
+                            continue; // sample lies farther than r from the centre
+
+                        value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
+                        // Same weight formula as in the branch above.
+                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
+
+                        sum1 = sum1 + weight * value;
+                        sum2 = sum2 + weight;
+                    }
+            }
+            dst(y, x) = saturate_cast<T>(sum1 / sum2);
+        }
+
+        template<typename T, template <typename> class B>
+        void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
+        {
+            // Launches bilateral_kernel for element type T with border policy B over the whole image.
+            dim3 block (32, 8);
+            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
+            B<T> b(src.rows, src.cols);
+            // Pre-compute -0.5 / sigma^2 for both terms of the kernel's exponent.
+            float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
+            float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
+
+            cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
+            // Launch on the caller's stream so the work is ordered with the rest of that stream.
+            bilateral_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
+            cudaSafeCall ( cudaGetLastError () );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template<typename T>
+        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
+        {
+            typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
+            // One launcher per border policy; borderMode is used directly as the index into this list.
+            static caller_t launchers[] =
+            {
+                bilateral_caller<T, BrdReflect101>,   // borderMode == 0
+                bilateral_caller<T, BrdReplicate>,    // borderMode == 1
+                bilateral_caller<T, BrdConstant>,     // borderMode == 2
+                bilateral_caller<T, BrdReflect>,      // borderMode == 3
+                bilateral_caller<T, BrdWrap>,         // borderMode == 4
+            };
+            launchers[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
+        }
+    }
+}}}
+
+
+#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
+    template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
+
+OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
+//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
+OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
+OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
+
+//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
+//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
+//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
+//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
+
+OCV_INSTANTIATE_BILATERAL_FILTER(short)
+//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
+OCV_INSTANTIATE_BILATERAL_FILTER(short3)
+OCV_INSTANTIATE_BILATERAL_FILTER(short4)
+
+OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
+//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
+OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
+OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
+
+//OCV_INSTANTIATE_BILATERAL_FILTER(int)
+//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
+//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
+//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
+
+OCV_INSTANTIATE_BILATERAL_FILTER(float)
+//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
+OCV_INSTANTIATE_BILATERAL_FILTER(float3)
+OCV_INSTANTIATE_BILATERAL_FILTER(float4)
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/blend.cu b/modules/gpuimgproc/src/cuda/blend.cu
new file mode 100644
index 0000000000..be8c0b2f35
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/blend.cu
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace blend
+    {
+        template <typename T>
+        __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
+                                          const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (row >= rows || col >= cols)
+                return;
+
+            // col indexes channel elements; dividing by cn gives the index used for the weight maps.
+            const int pixel = col / cn;
+            const float wa = weights1.ptr(row)[pixel];
+            const float wb = weights2.ptr(row)[pixel];
+            const T a = img1.ptr(row)[col];
+            const T b = img2.ptr(row)[col];
+            result.ptr(row)[col] = (a * wa + b * wb) / (wa + wb + 1e-5f);
+        }
+
+        template <typename T>
+        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
+        {
+            const int elemsPerRow = cols * cn; // the kernel is given the row width in channel elements
+            const dim3 block(16, 16);
+            const dim3 grid(divUp(elemsPerRow, block.x), divUp(rows, block.y));
+
+            blendLinearKernel<<<grid, block, 0, stream>>>(rows, elemsPerRow, cn, img1, img2, weights1, weights2, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+        template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
+        template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
+
+
+        __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
+                                              const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (row >= rows || col >= cols)
+                return;
+
+            // Scale both weights by 1 / (w1 + w2 + 1e-5f); the small term keeps the denominator non-zero when both are 0.
+            float wa = weights1.ptr(row)[col];
+            float wb = weights2.ptr(row)[col];
+            const float norm = 1.f / (wa + wb + 1e-5f);
+            wa *= norm;
+            wb *= norm;
+            const uchar4 a = ((const uchar4*)img1.ptr(row))[col];
+            const uchar4 b = ((const uchar4*)img2.ptr(row))[col];
+            ((uchar4*)result.ptr(row))[col] = make_uchar4(a.x * wa + b.x * wb, a.y * wa + b.y * wb,
+                                                          a.z * wa + b.z * wb, a.w * wa + b.w * wb);
+        }
+
+        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
+        {
+            // One thread per uchar4 pixel, launched in 16x16 blocks.
+            const dim3 block(16, 16);
+            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
+
+            blendLinearKernel8UC4<<<grid, block, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    } // namespace blend
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/canny.cu b/modules/gpuimgproc/src/cuda/canny.cu
new file mode 100644
index 0000000000..042e9afcc6
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/canny.cu
@@ -0,0 +1,494 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <utility>
+#include <algorithm>//std::swap
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/emulation.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/utility.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace canny
+{
+    struct L1 : binary_function<int, int, float> // L1 norm of an integer gradient pair
+    {
+        __device__ __forceinline__ L1() {}
+        __device__ __forceinline__ L1(const L1&) {}
+        __device__ __forceinline__ float operator ()(int x, int y) const
+        {
+            const int absSum = ::abs(x) + ::abs(y); // integer sum, converted to float on return
+            return absSum;
+        }
+    };
+    struct L2 : binary_function<int, int, float> // L2 (Euclidean) norm of an integer gradient pair
+    {
+        __device__ __forceinline__ L2() {}
+        __device__ __forceinline__ L2(const L2&) {}
+        __device__ __forceinline__ float operator ()(int x, int y) const
+        {
+            const int sqSum = x * x + y * y; // accumulated in int before the float square root
+            return ::sqrtf(sqSum);
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev // transform() traits for the Canny norm functors
+{
+    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
+    {
+        enum { smart_shift = 4 }; // NOTE(review): presumably the per-thread element count used by transform() - confirm in transform.hpp
+    };
+    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
+    {
+        enum { smart_shift = 4 }; // same setting as for L1
+    };
+}}}
+
+namespace canny
+{
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); // 8-bit source image; point filtering, clamp addressing
+    struct SrcTex // reads tex_src at a fixed (xoff, yoff) displacement
+    {
+        const int xoff;
+        const int yoff;
+        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
+        // Arguments are (row, column); the texel fetched is (x + xoff, y + yoff).
+        __device__ __forceinline__ int operator ()(int y, int x) const
+        {
+            return tex2D(tex_src, x + xoff, y + yoff);
+        }
+    };
+
+    template <class Norm> __global__
+    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int col = blockIdx.x * blockDim.x + threadIdx.x;
+        const int row = blockIdx.y * blockDim.y + threadIdx.y;
+        if (row >= mag.rows || col >= mag.cols)
+            return; // outside the output image
+        // Fetch the 3x3 ring around (row, col) once; neither derivative uses the centre sample.
+        const int tl = src(row - 1, col - 1), tc = src(row - 1, col), tr = src(row - 1, col + 1);
+        const int ml = src(row, col - 1), mr = src(row, col + 1);
+        const int bl = src(row + 1, col - 1), bc = src(row + 1, col), br = src(row + 1, col + 1);
+        const int gx = (tr + 2 * mr + br) - (tl + 2 * ml + bl); // right column minus left column, 1-2-1 weights
+        const int gy = (bl + 2 * bc + br) - (tl + 2 * tc + tr); // bottom row minus top row, 1-2-1 weights
+        dx(row, col) = gx;
+        dy(row, col) = gy;
+        mag(row, col) = norm(gx, gy);
+    }
+
+    // Launches the derivative + magnitude kernel over `mag`, sampling `srcWhole` through tex_src
+    // at offset (xoff, yoff). L2Grad selects the L2 norm functor, otherwise L1 is used.
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
+
+        bindTexture(&tex_src, srcWhole);
+        SrcTex src(xoff, yoff);
+
+        if (L2Grad)
+        {
+            L2 norm;
+            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
+        else
+        {
+            L1 norm;
+            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() ); // replaces the deprecated cudaThreadSynchronize()
+    }
+
+    // Computes the gradient magnitude from precomputed derivative images.
+    // L2Grad selects the L2 functor; otherwise the L1 functor is applied.
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        if (!L2Grad)
+        {
+            // |dx| + |dy|
+            transform(dx, dy, mag, L1(), WithOutMask(), 0);
+            return;
+        }
+
+        transform(dx, dy, mag, L2(), WithOutMask(), 0);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace canny
+{
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); // gradient magnitude, sampled by calcMapKernel
+    // Non-maximum suppression with double thresholding: stores an edge type (0, 1 or 2) for every interior pixel.
+    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    {
+        const int CANNY_SHIFT = 15; // fixed-point scale factor is 2^15
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); // tan(22.5 deg), scaled and rounded
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1) // border pixels are skipped; map is not written for them
+            return;
+
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
+
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; // -1 when the sign bits of dx and dy differ; picks the diagonal compared below
+        const float m = tex2D(tex_mag, x, y);
+
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22; // |dx| * tan(22.5 deg) in fixed point
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); // |dx| * (tan(22.5 deg) + 2) = |dx| * tan(67.5 deg)
+
+            dyVal <<= CANNY_SHIFT; // bring |dy| to the same fixed-point scale
+
+            if (dyVal < tg22x) // |dy| < |dx| * tan(22.5 deg): compare with the left and right neighbours
+            {
+                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x) // |dy| > |dx| * tan(67.5 deg): compare with the upper and lower neighbours
+            {
+                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else // in between: compare along the diagonal selected by s
+            {
+                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
+
+        map(y, x) = edge_type;
+    }
+
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    {
+        // The kernel samples the magnitude through tex_mag, so bind it before launching.
+        bindTexture(&tex_mag, mag);
+
+        const dim3 threads(16, 16);
+        const dim3 blocks(divUp(dx.cols, threads.x), divUp(dx.rows, threads.y));
+        calcMapKernel<<<blocks, threads>>>(dx, dy, map, low_thresh, high_thresh);
+
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() ); // block the host until the kernel has finished
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace canny
+{
+    __device__ int counter = 0; // number of positions appended to the seed list; zeroed by the host wrappers
+    // Tile-local hysteresis: promotes candidates (1) that touch an edge (2) inside one tile, then queues frontier edges in st.
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
+    {
+        __shared__ volatile int smem[18][18]; // tile plus a one-pixel halo; sized for a 16x16 thread block
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        // Every load checks both coordinates, so partially filled border tiles never read outside map.
+        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        if (threadIdx.y == 0)
+            smem[0][threadIdx.x + 1] = y > 0 && y - 1 < map.rows && x < map.cols ? map(y - 1, x) : 0;
+        if (threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows && x < map.cols ? map(y + 1, x) : 0;
+        if (threadIdx.x == 0)
+            smem[threadIdx.y + 1][0] = x > 0 && x - 1 < map.cols && y < map.rows ? map(y, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1)
+            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? map(y, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
+            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+
+        __syncthreads();
+
+        if (x >= map.cols || y >= map.rows)
+            return;
+
+        int n;
+        // NOTE(review): no barrier between iterations; propagation relies on the volatile tile - confirm this is intended.
+        #pragma unroll
+        for (int k = 0; k < 16; ++k)
+        {
+            n = 0;
+
+            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+            {
+                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
+                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
+                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+
+                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
+                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+
+                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
+                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+            }
+
+            if (n > 0)
+                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+        }
+
+        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+        map(y, x) = e;
+        // An edge pixel that still has a candidate neighbour becomes a seed for the global pass.
+        n = 0;
+
+        if (e == 2)
+        {
+            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
+            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
+            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+
+            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
+            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+
+            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
+            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+        }
+
+        if (n > 0)
+        {
+            const int ind = ::atomicAdd(&counter, 1); // reserve one slot in the seed list
+            st[ind] = make_ushort2(x, y);
+        }
+    }
+
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+    {
+        // Zero the device-side `counter` before the kernel starts appending to st1.
+        void* devCounter;
+        cudaSafeCall( cudaGetSymbolAddress(&devCounter, counter) );
+        cudaSafeCall( cudaMemset(devCounter, 0, sizeof(int)) );
+
+        // 16x16 threads per block; the grid is rounded up so it covers the whole map.
+        const dim3 threads(16, 16);
+        const dim3 blocks(divUp(map.cols, threads.x), divUp(map.rows, threads.y));
+        edgesHysteresisLocalKernel<<<blocks, threads>>>(map, st1);
+
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace canny
+{
+    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1}; // x offsets of the 8 neighbours
+    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1}; // y offsets of the 8 neighbours
+    // Global hysteresis pass: each block grows one seed from st1 through candidate pixels; unfinished work goes to st2.
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+    {
+        const int stack_size = 512;
+
+        __shared__ int s_counter; // number of positions currently on the block's stack
+        __shared__ int s_ind;     // first slot of st2 reserved for this block
+        __shared__ ushort2 s_st[stack_size];
+
+        if (threadIdx.x == 0)
+            s_counter = 0;
+
+        __syncthreads();
+
+        int ind = blockIdx.y * gridDim.x + blockIdx.x; // one block per seed
+
+        if (ind >= count)
+            return;
+
+        ushort2 pos = st1[ind];
+
+        if (threadIdx.x < 8) // threads 0..7 each test one neighbour of the seed
+        {
+            pos.x += c_dx[threadIdx.x];
+            pos.y += c_dy[threadIdx.x];
+
+            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+            {
+                map(pos.y, pos.x) = 2;
+
+                ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+                s_st[ind] = pos;
+            }
+        }
+
+        __syncthreads();
+        // Stop early once the stack could overflow; whatever remains is handed over to st2 below.
+        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+        {
+            const int subTaskIdx = threadIdx.x >> 3; // groups of 8 threads share one stack entry
+            const int portion = ::min(s_counter, blockDim.x >> 3);
+
+            if (subTaskIdx < portion)
+                pos = s_st[s_counter - 1 - subTaskIdx];
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+                s_counter -= portion;
+
+            __syncthreads();
+
+            if (subTaskIdx < portion)
+            {
+                pos.x += c_dx[threadIdx.x & 7];
+                pos.y += c_dy[threadIdx.x & 7];
+
+                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+                {
+                    map(pos.y, pos.x) = 2;
+
+                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+                    s_st[ind] = pos;
+                }
+            }
+
+            __syncthreads();
+        }
+
+        if (s_counter > 0)
+        {
+            if (threadIdx.x == 0)
+            {
+                // atomicAdd returns the counter value before the addition, i.e. the first slot reserved here.
+                s_ind = ::atomicAdd(&counter, s_counter);
+            }
+
+            __syncthreads();
+
+            ind = s_ind;
+
+            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                st2[ind + i] = s_st[i];
+        }
+    }
+
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+    {
+        // Host-side handle to the device `counter`, which holds the number of queued seeds.
+        void* devCounter;
+        cudaSafeCall( cudaGetSymbolAddress(&devCounter, canny::counter) );
+
+        int pending;
+        cudaSafeCall( cudaMemcpy(&pending, devCounter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        // Each pass reads seeds from st1 and writes the next batch to st2; the buffers swap after every pass.
+        for (; pending > 0; std::swap(st1, st2))
+        {
+            cudaSafeCall( cudaMemset(devCounter, 0, sizeof(int)) );
+
+            // One block per seed, on a 2D grid with at most 65535 blocks along x.
+            const dim3 threads(128);
+            const dim3 blocks(::min(pending, 65535u), divUp(pending, 65535), 1);
+            edgesHysteresisGlobalKernel<<<blocks, threads>>>(map, st1, st2, pending);
+
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaMemcpy(&pending, devCounter, sizeof(int), cudaMemcpyDeviceToHost) );
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace canny
+{
+    struct GetEdges : unary_function<int, uchar> // maps an edge-map label to an output pixel value
+    {
+        __device__ __forceinline__ GetEdges() {}
+        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+        __device__ __forceinline__ uchar operator ()(int e) const
+        {
+            const int negHalf = -(e >> 1); // label 2 gives -1; labels 0 and 1 give 0
+            return (uchar)negHalf;         // -1 narrows to 255, 0 stays 0
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev // transform() traits for canny::GetEdges
+{
+    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
+    {
+        enum { smart_shift = 4 }; // NOTE(review): presumably the per-thread element count used by transform() - confirm in transform.hpp
+    };
+}}}
+
+namespace canny
+{
+    void getEdges(PtrStepSzi map, PtrStepSzb dst) // converts the label map into the 8-bit output image
+    {
+        transform(map, dst, GetEdges(), WithOutMask(), 0); // applies GetEdges per element; the last argument (0) is the stream
+    }
+}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/ccomponetns.cu b/modules/gpuimgproc/src/cuda/ccomponetns.cu
new file mode 100644
index 0000000000..9552f1b06f
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/ccomponetns.cu
@@ -0,0 +1,534 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/core/cuda/vec_traits.hpp>
+#include <opencv2/core/cuda/vec_math.hpp>
+#include <opencv2/core/cuda/emulation.hpp>
+
+#include <iostream>
+#include <stdio.h>
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace ccl
+    {
+        enum // compile-time launch and tiling constants for the labeling kernels
+        {
+            WARP_SIZE  = 32,
+            WARP_LOG   = 5, // log2(WARP_SIZE)
+
+            CTA_SIZE_X = 32, // thread-block width used by computeEdges; column stride inside lableTiles
+            CTA_SIZE_Y = 8,  // thread-block height used by computeEdges; row stride inside lableTiles
+
+            STA_SIZE_MERGE_Y = 4,  // NOTE(review): not referenced in this part of the file; presumably the merge-kernel block height
+            STA_SIZE_MERGE_X = 32, // NOTE(review): not referenced in this part of the file; presumably the merge-kernel block width
+
+            TPB_X = 1, // pixels handled per thread along x in lableTiles
+            TPB_Y = 4, // pixels handled per thread along y in lableTiles
+
+            TILE_COLS = CTA_SIZE_X * TPB_X,
+            TILE_ROWS = CTA_SIZE_Y * TPB_Y
+        };
+
+        template<typename T> struct IntervalsTraits // primary template: defines neither dist_type nor ch
+        {
+            typedef T elem_type;
+        };
+
+        template<> struct IntervalsTraits<unsigned char>
+        {
+            typedef int dist_type; // type in which the interval bounds are stored
+            enum {ch = 1};         // channel count, selects the InInterval specialisation
+        };
+
+        template<> struct IntervalsTraits<uchar3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        template<> struct IntervalsTraits<uchar4>
+        {
+            typedef int4 dist_type;
+            enum {ch = 4};
+        };
+
+        template<> struct IntervalsTraits<unsigned short>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        template<> struct IntervalsTraits<ushort3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        template<> struct IntervalsTraits<ushort4>
+        {
+            typedef int4 dist_type;
+            enum {ch = 4};
+        };
+
+        template<> struct IntervalsTraits<float>
+        {
+            typedef float dist_type; // float images keep float bounds
+            enum {ch = 1};
+        };
+
+        template<> struct IntervalsTraits<int>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        typedef unsigned char component; // per-pixel bit mask built from the Edges flags
+        enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 }; // one bit per 4-neighbour direction; EMPTY is not used in this part of the file
+
+        template<typename T, int CH> struct InInterval {}; // primary template: empty for channel counts without a specialisation
+
+        template<typename T> struct InInterval<T, 1> // single-channel interval test
+        {
+            typedef typename VecTraits<T>::elem_type E;
+            T lo, hi; // bounds on (a - b): lo is the negated _lo.x, hi is _hi.x
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {}
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                const I diff = a - b; // NOTE(review): narrowed back to I, so an unsigned I wraps on negative differences - confirm intended
+                if (!(lo <= diff)) return false;
+                return diff <= hi;
+            }
+        };
+
+
+        template<typename T> struct InInterval<T, 3> // three-channel interval test
+        {
+            typedef typename VecTraits<T>::elem_type E;
+            T lo, hi; // per-channel bounds on (a - b); lo holds the negated _lo components
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
+            : lo(VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))),
+              hi(VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)) {}
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                const I diff = a - b;
+                if (!(lo.x <= diff.x && diff.x <= hi.x)) return false;
+                if (!(lo.y <= diff.y && diff.y <= hi.y)) return false;
+                return lo.z <= diff.z && diff.z <= hi.z;
+            }
+        };
+
+        template<typename T> struct InInterval<T, 4> // four-channel interval test
+        {
+            typedef typename VecTraits<T>::elem_type E;
+            T lo, hi; // per-channel bounds on (a - b); lo holds the negated _lo components
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
+            : lo(VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))),
+              hi(VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)) {}
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                const I diff = a - b;
+                if (!(lo.x <= diff.x && diff.x <= hi.x)) return false;
+                if (!(lo.y <= diff.y && diff.y <= hi.y)) return false;
+                if (!(lo.z <= diff.z && diff.z <= hi.z)) return false;
+                return lo.w <= diff.w && diff.w <= hi.w;
+            }
+        };
+
+
+        template<typename T, typename F>
+        __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
+        {
+            // One thread per pixel: build a bit mask of the 4-neighbours accepted by `connected`.
+            const int col = threadIdx.x + blockIdx.x * blockDim.x;
+            const int row = threadIdx.y + blockIdx.y * blockDim.y;
+            if (col >= image.cols || row >= image.rows)
+                return;
+
+            // Which neighbours exist inside the image.
+            const T centre = image(row, col);
+            const bool hasLeft  = col > 0;
+            const bool hasUp    = row > 0;
+            const bool hasRight = col + 1 < image.cols;
+            const bool hasDown  = row + 1 < image.rows;
+
+            // Neighbours outside the image are never read; their bits stay cleared.
+            component mask = 0;
+            if (hasLeft  && connected(centre, image(row, col - 1))) mask |= LEFT;
+            if (hasUp    && connected(centre, image(row - 1, col))) mask |= UP;
+            if (hasRight && connected(centre, image(row, col + 1))) mask |= RIGHT;
+            if (hasDown  && connected(centre, image(row + 1, col))) mask |= DOWN;
+
+            components(row, col) = mask;
+        }
+
+        template< typename T>
+        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
+        {
+            // Predicate matching T's channel count, with bounds stored in T's distance type.
+            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Predicate;
+            const Predicate withinBounds(lo, hi);
+            const PtrStepSz<T> typedImage = static_cast<const PtrStepSz<T> >(image);
+
+            const dim3 threads(CTA_SIZE_X, CTA_SIZE_Y);
+            const dim3 blocks(divUp(image.cols, threads.x), divUp(image.rows, threads.y));
+            computeConnectivity<T, Predicate><<<blocks, threads, 0, stream>>>(typedImage, edges, withinBounds);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0) // stream 0: wait for completion before returning
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+
+        __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
+        {
+            // Labels one TILE_ROWS x TILE_COLS tile: connected pixels converge on a common representative label,
+            // which is finally written to comps as a global linear index. Each thread owns TPB_Y x TPB_X pixels.
+            int x = threadIdx.x + blockIdx.x * TILE_COLS;
+            int y = threadIdx.y + blockIdx.y * TILE_ROWS;
+            // NOTE(review): threads returning here skip the barriers below - confirm this is safe on the target GPUs.
+            if (x >= edges.cols || y >= edges.rows) return;
+
+            // Rows are validated one by one while loading (TPB_X is 1, so the column is just x, checked above).
+            __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
+            __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
+
+            int new_labels[TPB_Y][TPB_X];
+            int old_labels[TPB_Y][TPB_X];
+
+            #pragma unroll
+            for (int i = 0; i < TPB_Y; ++i)
+                #pragma unroll
+                for (int j = 0; j < TPB_X; ++j)
+                {
+                    int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                    int xloc = threadIdx.x + CTA_SIZE_X * j;
+                    const int row = y + CTA_SIZE_Y * i;
+                    // Rows below the image get an empty mask instead of an out-of-bounds read.
+                    component c = row < edges.rows ? edges(row, x + CTA_SIZE_X * j) : 0;
+                    if (!xloc) c &= ~LEFT;
+                    if (!yloc) c &= ~UP;
+
+                    if (xloc == TILE_COLS -1) c &= ~RIGHT;
+                    if (yloc == TILE_ROWS -1) c &= ~DOWN;
+
+                    new_labels[i][j] = yloc * TILE_COLS + xloc; // initial label: linear index inside the tile
+                    edgesTile[yloc][xloc] = c;
+                }
+
+            for (int k = 0; ;++k)
+            {
+                //1. backup
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                        int xloc = threadIdx.x + CTA_SIZE_X * j;
+
+                        old_labels[i][j]       = new_labels[i][j];
+                        labelsTile[yloc][xloc] = new_labels[i][j];
+                    }
+
+                __syncthreads();
+
+                //2. compare local arrays
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                        int xloc = threadIdx.x + CTA_SIZE_X * j;
+
+                        component c = edgesTile[yloc][xloc];
+                        int label = new_labels[i][j];
+
+                        if (c & UP)
+                            label = ::min(label, labelsTile[yloc - 1][xloc]);
+
+                        if (c & DOWN)
+                            label = ::min(label, labelsTile[yloc + 1][xloc]);
+
+                        if (c & LEFT)
+                            label = ::min(label, labelsTile[yloc][xloc - 1]);
+
+                        if (c & RIGHT)
+                            label = ::min(label, labelsTile[yloc][xloc + 1]);
+
+                        new_labels[i][j] = label;
+                    }
+
+                __syncthreads();
+
+                //3. determine: Is any value changed?
+                int changed = 0;
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        if (new_labels[i][j] < old_labels[i][j])
+                        {
+                            changed = 1;
+                            Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
+                        }
+                    }
+
+                changed = Emulation::syncthreadsOr(changed);
+
+                if (!changed)
+                    break;
+
+                //4. Compact paths
+                const int *labels = &labelsTile[0][0];
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int label = new_labels[i][j];
+
+                        while( labels[label] < label ) label = labels[label];
+
+                        new_labels[i][j] = label;
+                    }
+                __syncthreads();
+            }
+
+            #pragma unroll
+            for (int i = 0; i < TPB_Y; ++i)
+                #pragma unroll
+                for (int j = 0; j < TPB_X; ++j)
+                {
+                    int label = new_labels[i][j];
+                    int yloc = label / TILE_COLS;
+                    int xloc = label - yloc * TILE_COLS;
+
+                    xloc += blockIdx.x * TILE_COLS;
+                    yloc += blockIdx.y * TILE_ROWS;
+
+                    label = yloc * edges.cols + xloc; // tile-local label converted to a global linear index
+                    // do it for x too.
+                    if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
+                }
+        }
+
+        __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
+        {
+            // Follow parent links until reaching a label that points at itself.
+            for (;;)
+            {
+                // A label is the linear index row * cols + col of the cell holding its parent.
+                const int row = label / comps.cols;
+                const int col = label - row * comps.cols;
+                const int next = comps(row, col);
+                if (next == label)
+                    return label;
+
+                label = next;
+            }
+        }
+
+        __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
+        {
+            // Merge the two label trees: the larger root is re-pointed at the smaller one.
+            const int rootA = root(comps, l1);
+            const int rootB = root(comps, l2);
+            if (rootA != rootB)
+            {
+                const int keep = ::min(rootA, rootB);
+                const int drop = ::max(rootA, rootB);
+
+                // Locate the cell that stores the larger root's parent link.
+                const int row = drop / comps.cols;
+                atomicMin(&comps.ptr(row)[drop - row * comps.cols], keep);
+                changed = true; // only ever set here, never cleared
+            }
+        }
+
+        // Unions the labels on both sides of the tile seams inside this block's group of tiles.
+        // A block covers tilesNumY x tilesNumX tiles of tileSizeY x tileSizeX pixels; yIncomplete / xIncomplete
+        // are the number of tiles missing from the last block row / column (labelComponents passes 0 or 1).
+        // Tasks (one per pixel along a seam) are strided over the block's threads and repeated until
+        // Emulation::syncthreadsOr reports no merge. edges holds per-pixel bits (UP, LEFT); comps holds labels.
+        __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
+            const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
+        {
+            int tid = threadIdx.y * blockDim.x + threadIdx.x;
+            int stride = blockDim.y * blockDim.x;
+
+            int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
+            int yend   = ybegin + tilesNumY * tileSizeY;
+            // Last block row: drop the missing tiles and shrink the final tile to what is left of the image.
+            if (blockIdx.y == gridDim.y - 1)
+            {
+                yend -= yIncomplete * tileSizeY;
+                yend -= tileSizeY;
+                tileSizeY = (edges.rows % tileSizeY); // NOTE(review): tileSizeY (and tileSizeX) also position the seams below - confirm the truncated size is intended there
+                yend += tileSizeY;
+            }
+
+            int xbegin = blockIdx.x * tilesNumX * tileSizeX;
+            int xend   = xbegin + tilesNumX * tileSizeX;
+            // Last block column: same adjustment along x.
+            if (blockIdx.x == gridDim.x - 1)
+            {
+                if (xIncomplete) yend = ybegin; // a single tile column has no vertical seam: makes tasksV zero
+                xend -= xIncomplete * tileSizeX;
+                xend -= tileSizeX;
+                tileSizeX = (edges.cols % tileSizeX);
+                xend += tileSizeX;
+            }
+            // A single tile row has no horizontal seam: an empty x range makes tasksH zero.
+            if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
+            {
+                xend = xbegin;
+            }
+
+            int tasksV = (tilesNumX - 1) * (yend - ybegin);
+            int tasksH = (tilesNumY - 1) * (xend - xbegin);
+            int total = tasksH + tasksV;
+
+            bool changed;
+            do
+            {
+                changed = false;
+                for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
+                {
+                    if (taskIdx < tasksH)
+                    {
+                        // Horizontal seam: (y, x) lies just below the seam, (y - 1, x) just above it.
+                        int indexH = taskIdx;
+                        int row = indexH / (xend - xbegin);
+                        int col = indexH - row * (xend - xbegin);
+
+                        int y = ybegin + (row + 1) * tileSizeY;
+                        int x = xbegin + col;
+                        // Index edges as (row, column), consistently with the comps accesses below.
+                        component e = edges(y, x);
+                        if (e & UP)
+                        {
+                            int lc = comps(y,x);
+                            int lu = comps(y - 1, x);
+                            isConnected(comps, lc, lu, changed);
+                        }
+                    }
+                    else
+                    {
+                        // Vertical seam: (y, x) lies just right of the seam, (y, x - 1) just left of it.
+                        int indexV = taskIdx - tasksH;
+                        int col = indexV / (yend - ybegin);
+                        int row = indexV - col * (yend - ybegin);
+
+                        int x = xbegin + (col + 1) * tileSizeX;
+                        int y = ybegin + row;
+                        // Same (row, column) indexing as for the horizontal seam.
+                        component e = edges(y, x);
+                        if (e & LEFT)
+                        {
+                            int lc = comps(y, x);
+                            int ll = comps(y, x - 1);
+                            isConnected(comps, lc, ll, changed);
+                        }
+                    }
+                }
+            } while (Emulation::syncthreadsOr(changed));
+        }
+
+        __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
+        {
+            // One thread per pixel: overwrite the stored label with the root of its chain (edges is not read here).
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (row >= comps.rows || col >= comps.cols) return;
+            comps(row, col) = root(comps, comps(row, col));
+        }
+
+        enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1}; // presumably values for the flags argument of labelComponents(), which currently ignores it - confirm
+
+        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
+        {
+            // flags is accepted but not used by this implementation.
+            (void) flags;
+            // Pass 1: one block per TILE_ROWS x TILE_COLS tile (lableTiles).
+            const dim3 threads(CTA_SIZE_X, CTA_SIZE_Y);
+            dim3 tiles(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
+            lableTiles<<<tiles, threads, 0, stream>>>(edges, comps);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Pass 2: merge 2x2 groups of tiles, doubling the tile size until one tile is left in each direction.
+            const dim3 mergeThreads(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
+            for (int tileW = TILE_COLS, tileH = TILE_ROWS; tiles.x > 1 || tiles.y > 1; tileW <<= 1, tileH <<= 1)
+            {
+                const int halfX = (int)ceilf(tiles.x / 2.f);
+                const int halfY = (int)ceilf(tiles.y / 2.f);
+                const dim3 merged(halfX, halfY);
+                // The last two arguments are 1 when the tile count along y / x is odd, 0 otherwise.
+                crossMerge<<<merged, mergeThreads, 0, stream>>>(2, 2, tileH, tileW, edges, comps, halfY - tiles.y / 2, halfX - tiles.x / 2);
+                tiles = merged;
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            // Pass 3: one thread per pixel replaces each stored label by the root of its chain.
+            const dim3 pixels(divUp(edges.cols, threads.x), divUp(edges.rows, threads.y));
+            flatten<<<pixels, threads, 0, stream>>>(edges, comps);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Null stream: wait for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+} } }
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/clahe.cu b/modules/gpuimgproc/src/cuda/clahe.cu
new file mode 100644
index 0000000000..7c6645749b
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/clahe.cu
@@ -0,0 +1,186 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/emulation.hpp"
+#include "opencv2/core/cuda/scan.hpp"
+#include "opencv2/core/cuda/reduce.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace clahe
+{
+    __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut, // Per-tile LUT: block (tx, ty) fills row ty * tilesX + tx of lut from tile (tx, ty) of src.
+                                  const int2 tileSize, const int tilesX, // tileSize.x / .y: tile width / height in pixels; tilesX: row stride of the tile grid.
+                                  const int clipLimit, const float lutScale) // clipLimit <= 0 skips clipping; lutScale multiplies the cumulative histogram.
+    {
+        __shared__ int smem[512];
+        // Thread tid owns histogram bin tid; the <256> helpers below match the 32x8 block that calcLut launches.
+        const int tx = blockIdx.x;
+        const int ty = blockIdx.y;
+        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        // Clear this thread's bin before accumulation starts.
+        smem[tid] = 0;
+        __syncthreads();
+        // Histogram of the tile: threads stride over its rows and columns and count every 8-bit value.
+        for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
+        {
+            const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
+            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
+            {
+                const int data = srcPtr[j];
+                Emulation::smem::atomicAdd(&smem[data], 1);
+            }
+        }
+        // All increments must have landed before the bins are read.
+        __syncthreads();
+
+        int tHistVal = smem[tid];
+        // smem is handed to reduce/scan below, so every thread must have read its bin first.
+        __syncthreads();
+
+        if (clipLimit > 0)
+        {
+            // clip histogram bar
+            // (cap the bin at clipLimit and remember the excess)
+            int clipped = 0;
+            if (tHistVal > clipLimit)
+            {
+                clipped = tHistVal - clipLimit;
+                tHistVal = clipLimit;
+            }
+
+            // find number of overall clipped samples
+            // (block-wide sum using smem; the total is taken from thread 0's clipped below)
+            reduce<256>(smem, clipped, tid, plus<int>());
+
+            // broadcast evaluated value
+            // (through a shared variable written by thread 0)
+            __shared__ int totalClipped;
+
+            if (tid == 0)
+                totalClipped = clipped;
+            __syncthreads();
+
+            // redistribute clipped samples evenly
+            // (every bin gets totalClipped / 256; the first `residual` bins get one more)
+            int redistBatch = totalClipped / 256;
+            tHistVal += redistBatch;
+
+            int residual = totalClipped - redistBatch * 256;
+            if (tid < residual)
+                ++tHistVal;
+        }
+        // Per the helper's name, an inclusive block-wide scan: the cumulative count up to and including bin tid.
+        const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
+        // Scale, round to nearest integer and saturate to uchar.
+        lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
+    }
+
+    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
+    {
+        // One block per tile, 256 threads each (the kernel's reduce<256> / blockScanInclusive<256> use that size).
+        const dim3 tilesGrid(tilesX, tilesY);
+        const dim3 threads(32, 8);
+        calcLutKernel<<<tilesGrid, threads, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
+        cudaSafeCall( cudaGetLastError() );
+        // Only the null stream is waited on; any other stream returns right after the launch check.
+        if (stream != 0)
+            return;
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    __global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
+    {
+        // One thread per pixel: blend the LUT rows of the neighbouring tiles with bilinear weights.
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        if (y >= src.rows || x >= src.cols)
+            return;
+
+        // Pixel position in tile units, shifted by half a tile.
+        const float tileXf = (static_cast<float>(x) / tileSize.x) - 0.5f;
+        const float tileYf = (static_cast<float>(y) / tileSize.y) - 0.5f;
+        const int leftTile = __float2int_rd(tileXf);
+        const int topTile  = __float2int_rd(tileYf);
+
+        // Weights come from the unclamped tile index; the indices themselves are clamped to the tile grid.
+        const float xa = tileXf - leftTile;
+        const float ya = tileYf - topTile;
+        const int tx1 = ::max(leftTile, 0);
+        const int tx2 = ::min(leftTile + 1, tilesX - 1);
+        const int ty1 = ::max(topTile, 0);
+        const int ty2 = ::min(topTile + 1, tilesY - 1);
+
+        const int srcVal = src(y, x);
+
+        // Weighted sum of the four lookups: top-left, top-right, bottom-left, bottom-right.
+        float res = 0;
+        res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
+        res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
+        res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
+        res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
+
+        dst(y, x) = saturate_cast<uchar>(res);
+    }
+
+    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
+    {
+        // One thread per source pixel in 32x8 blocks; the kernel is configured to prefer L1 cache.
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(src.cols, threads.x), divUp(src.rows, threads.y));
+        cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
+
+        tranformKernel<<<blocks, threads, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
+        cudaSafeCall( cudaGetLastError() );
+
+        // Only the null stream is waited on here.
+        if (!stream) cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif // CUDA_DISABLER
diff --git a/modules/gpuimgproc/src/cuda/color.cu b/modules/gpuimgproc/src/cuda/color.cu
new file mode 100644
index 0000000000..1a5d4865ed
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/color.cu
@@ -0,0 +1,461 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/color.hpp"
+#include "cvt_color_internal.h"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type) // Per-functor smart_* overrides; the macro is defined outside this file - NOTE(review): confirm the meaning of these constants in transform.hpp
+    {
+        enum { smart_block_dim_x = 8 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba -> bgr555/bgr565: smart_block_dim_y and smart_shift are overridden.
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgr555/bgr565 -> bgra/rgba.
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // gray -> bgra (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // gray -> bgr555/bgr565: only smart_shift is overridden here.
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
+    {
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
+    {
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba <-> yuv4 (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba <-> YCrCb4 (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba <-> xyz4 (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba <-> hsv4 (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    // bgra/rgba <-> hls4 (uchar).
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
+    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
+    { \
+        traits::functor_type functor = traits::create_functor(); \
+        typedef typename traits::functor_type::argument_type src_t; \
+        typedef typename traits::functor_type::result_type   dst_t; \
+        cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
+    }
+// The macro above defines name(src, dst, stream): it casts src/dst to the functor's argument/result types and calls cudev::transform with WithOutMask(). _ONE below uses name ## _traits as the traits.
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
+// _ALL: emits name_8u, name_16u and name_32f from name_traits<uchar>, <ushort> and <float>.
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+// _8U32F: emits name_8u and name_32f only.
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+// _8U32F_FULL: as _8U32F, plus name_full_8u and name_full_32f from name_full_traits<uchar> and <float>.
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
+    // Instantiations: every line below expands to one or more host functions taking (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream).
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
+    // bgr555/bgr565 conversions use non-templated traits: one function per conversion, no depth suffix.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
+    // Gray conversions.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
+    // YUV.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
+    // YCrCb.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
+    // XYZ.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
+    // HSV and HLS: 8u and 32f, each with an additional _full_8u / _full_32f variant.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
+    // Lab and Luv: 8u and 32f only. The lrgb/lbgr names use separate traits - presumably linear RGB; confirm in color.hpp.
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
+    // The helper macros are not needed past this point.
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
+}}} // namespace cv { namespace gpu { namespace cudev
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/debayer.cu b/modules/gpuimgproc/src/cuda/debayer.cu
new file mode 100644
index 0000000000..46a1c14ef4
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/debayer.cu
@@ -0,0 +1,544 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/limits.hpp"
+#include "opencv2/core/cuda/color.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T> struct Bayer2BGR;
+
+    template <> struct Bayer2BGR<uchar> // 8-bit specialization: apply() fills res0..res3, four 3-channel pixels per call
+    {
+        uchar3 res0; // result for the 1st byte (.x) of the centre uchar4 word
+        uchar3 res1; // result for the 2nd byte (.y)
+        uchar3 res2; // result for the 3rd byte (.z)
+        uchar3 res3; // result for the 4th byte (.w)
+
+        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) // s_x indexes uchar4 words, s_y indexes rows
+        {
+            uchar4 patch[3][3]; // rows s_y-1..s_y+1 (not clamped here), uchar4 columns s_x-1..s_x+1 (clamped below)
+            patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
+            patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; // left word, clamped to index 0
+            patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; // right word, clamped to the last word index
+
+            patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
+            patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
+            patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
+
+            patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
+            patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
+            patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
+
+            if ((s_y & 1) ^ start_with_green) // interpolation layout chosen by row parity XOR start_with_green
+            {
+                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1; // rounded mean of the samples above and below
+                const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1; // rounded mean of the samples left and right
+
+                const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2; // rounded mean of the four diagonal samples
+                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2; // rounded mean of the four edge-adjacent samples
+
+                const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
+                const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
+
+                const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2; // reaches into the right-hand word
+                const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
+
+                if ((s_y & 1) ^ blue_last) // the two variants below differ only by exchanging the .x and .z outputs
+                {
+                    res0.x = t1;
+                    res0.y = patch[1][1].x; // the sampled value itself goes to the middle channel
+                    res0.z = t0;
+
+                    res1.x = patch[1][1].y;
+                    res1.y = t3;
+                    res1.z = t2;
+
+                    res2.x = t5;
+                    res2.y = patch[1][1].z;
+                    res2.z = t4;
+
+                    res3.x = patch[1][1].w;
+                    res3.y = t7;
+                    res3.z = t6;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = patch[1][1].x;
+                    res0.z = t1;
+
+                    res1.x = t2;
+                    res1.y = t3;
+                    res1.z = patch[1][1].y;
+
+                    res2.x = t4;
+                    res2.y = patch[1][1].z;
+                    res2.z = t5;
+
+                    res3.x = t6;
+                    res3.y = t7;
+                    res3.z = patch[1][1].w;
+                }
+            }
+            else
+            {
+                const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2; // reaches into the left-hand word
+                const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
+
+                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
+                const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
+
+                const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
+                const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
+
+                const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
+                const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1; // reaches into the right-hand word
+
+                if ((s_y & 1) ^ blue_last) // again, the variants differ only by exchanging .x and .z
+                {
+                    res0.x = patch[1][1].x;
+                    res0.y = t1;
+                    res0.z = t0;
+
+                    res1.x = t3;
+                    res1.y = patch[1][1].y;
+                    res1.z = t2;
+
+                    res2.x = patch[1][1].z;
+                    res2.y = t5;
+                    res2.z = t4;
+
+                    res3.x = t7;
+                    res3.y = patch[1][1].w;
+                    res3.z = t6;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = t1;
+                    res0.z = patch[1][1].x;
+
+                    res1.x = t2;
+                    res1.y = patch[1][1].y;
+                    res1.z = t3;
+
+                    res2.x = t4;
+                    res2.y = t5;
+                    res2.z = patch[1][1].z;
+
+                    res3.x = t6;
+                    res3.y = patch[1][1].w;
+                    res3.z = t7;
+                }
+            }
+        }
+    };
+
+    template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix); // converts a 3-channel 8-bit pixel to destination type D; only the specializations below are defined
+    template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
+    {
+        typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor(); // single-channel output goes through the bgr_to_gray functor
+        return f(pix);
+    }
+    template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
+    {
+        return pix; // 3-channel output: passed through unchanged
+    }
+    template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
+    {
+        return make_uchar4(pix.x, pix.y, pix.z, 255); // 4-channel output: fourth channel set to 255
+    }
+
+    template <typename D>
+    __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) // each thread converts up to four consecutive pixels of one row
+    {
+        const int s_x = blockIdx.x * blockDim.x + threadIdx.x; // index of the uchar4 word (four source columns) handled by this thread
+        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (s_y >= src.rows || (s_x << 2) >= src.cols)
+            return;
+
+        s_y = ::min(::max(s_y, 1), src.rows - 2); // clamp so apply() may read rows s_y - 1 and s_y + 1; first/last rows reuse their neighbour row's result
+
+        Bayer2BGR<uchar> bayer;
+        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
+
+        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; // first destination column of this thread
+        const int d_y = blockIdx.y * blockDim.y + threadIdx.y; // unclamped destination row
+
+        dst(d_y, d_x) = toDst<D>(bayer.res0);
+        if (d_x + 1 < src.cols) // tail guards for widths that are not a multiple of 4
+            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
+        if (d_x + 2 < src.cols)
+            dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
+        if (d_x + 3 < src.cols)
+            dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
+    }
+
+    template <> struct Bayer2BGR<ushort> // 16-bit specialization: apply() fills res0 and res1, two 3-channel pixels per call
+    {
+        ushort3 res0; // result for the 1st element (.x) of the centre ushort2 word
+        ushort3 res1; // result for the 2nd element (.y)
+
+        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) // s_x indexes ushort2 words, s_y indexes rows
+        {
+            ushort2 patch[3][3]; // rows s_y-1..s_y+1 (not clamped here), ushort2 columns s_x-1..s_x+1 (clamped below)
+            patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
+            patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; // left word, clamped to index 0
+            patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; // right word, clamped to the last word index
+
+            patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
+            patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
+            patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
+
+            patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
+            patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
+            patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
+
+            if ((s_y & 1) ^ start_with_green) // interpolation layout chosen by row parity XOR start_with_green
+            {
+                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1; // rounded mean of the samples above and below
+                const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1; // rounded mean of the samples left and right
+
+                const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2; // rounded mean of the four diagonal samples
+                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2; // rounded mean of the four edge-adjacent samples
+
+                if ((s_y & 1) ^ blue_last) // the two variants below differ only by exchanging the .x and .z outputs
+                {
+                    res0.x = t1;
+                    res0.y = patch[1][1].x;
+                    res0.z = t0;
+
+                    res1.x = patch[1][1].y;
+                    res1.y = t3;
+                    res1.z = t2;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = patch[1][1].x;
+                    res0.z = t1;
+
+                    res1.x = t2;
+                    res1.y = t3;
+                    res1.z = patch[1][1].y;
+                }
+            }
+            else
+            {
+                const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2; // reaches into the left-hand word
+                const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
+
+                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
+                const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1; // reaches into the right-hand word
+
+                if ((s_y & 1) ^ blue_last) // again, the variants differ only by exchanging .x and .z
+                {
+                    res0.x = patch[1][1].x;
+                    res0.y = t1;
+                    res0.z = t0;
+
+                    res1.x = t3;
+                    res1.y = patch[1][1].y;
+                    res1.z = t2;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = t1;
+                    res0.z = patch[1][1].x;
+
+                    res1.x = t2;
+                    res1.y = patch[1][1].y;
+                    res1.z = t3;
+                }
+            }
+        }
+    };
+
+    template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix); // converts a 3-channel 16-bit pixel to destination type D; only the specializations below are defined
+    template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
+    {
+        typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor(); // single-channel output goes through the bgr_to_gray functor
+        return f(pix);
+    }
+    template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
+    {
+        return pix; // 3-channel output: passed through unchanged
+    }
+    template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
+    {
+        return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max()); // 4-channel output: fourth channel set to the ushort maximum
+    }
+
+    template <typename D>
+    __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) // each thread converts up to two consecutive pixels of one row
+    {
+        const int s_x = blockIdx.x * blockDim.x + threadIdx.x; // index of the ushort2 word (two source columns) handled by this thread
+        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (s_y >= src.rows || (s_x << 1) >= src.cols)
+            return;
+
+        s_y = ::min(::max(s_y, 1), src.rows - 2); // clamp so apply() may read rows s_y - 1 and s_y + 1; first/last rows reuse their neighbour row's result
+
+        Bayer2BGR<ushort> bayer;
+        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
+
+        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; // first destination column of this thread
+        const int d_y = blockIdx.y * blockDim.y + threadIdx.y; // unclamped destination row
+
+        dst(d_y, d_x) = toDst<D>(bayer.res0);
+        if (d_x + 1 < src.cols) // tail guard for odd widths
+            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
+    }
+
+    template <int cn>
+    void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+    {
+        // Host launcher for the Bayer2BGR_8u kernel; cn selects the destination vector type.
+        typedef typename TypeVec<uchar, cn>::vec_type pixel_t;
+        const dim3 threads(32, 8);
+        // Each thread covers four source columns, hence the 4x wider horizontal grain.
+        const dim3 blocks(divUp(src.cols, 4 * threads.x), divUp(src.rows, threads.y));
+        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<pixel_t>, cudaFuncCachePreferL1) );
+
+        Bayer2BGR_8u<pixel_t><<<blocks, threads, 0, stream>>>(src, (PtrStepSz<pixel_t>)dst, blue_last, start_with_green);
+        cudaSafeCall( cudaGetLastError() );
+        // Only the default stream is synchronized here; other streams stay asynchronous.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int cn>
+    void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+    {
+        // Host launcher for the Bayer2BGR_16u kernel; cn selects the destination vector type.
+        typedef typename TypeVec<ushort, cn>::vec_type pixel_t;
+        const dim3 threads(32, 8);
+        // Each thread covers two source columns, hence the 2x wider horizontal grain.
+        const dim3 blocks(divUp(src.cols, 2 * threads.x), divUp(src.rows, threads.y));
+        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<pixel_t>, cudaFuncCachePreferL1) );
+
+        Bayer2BGR_16u<pixel_t><<<blocks, threads, 0, stream>>>(src, (PtrStepSz<pixel_t>)dst, blue_last, start_with_green);
+        cudaSafeCall( cudaGetLastError() );
+        // Only the default stream is synchronized here; other streams stay asynchronous.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+
+    template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+
+    //////////////////////////////////////////////////////////////
+    // Bayer Demosaicing (Malvar, He, and Cutler)
+    //
+    // by Morgan McGuire, Williams College
+    // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
+    //
+    // ported to CUDA
+
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+    template <typename DstType>
+    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed) // one thread per dst pixel; source samples are fetched from sourceTex
+    {
+        const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
+        const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f  ;
+        const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/;
+        const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/;
+        const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/   kEw =  0.5f / 8.0f  ;
+        const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1) // the one-pixel border of dst is never written by this kernel
+            return;
+
+        int2 center; // position of this pixel inside the bound source texture
+        center.x = x + sourceOffset.x;
+        center.y = y + sourceOffset.y;
+
+        int4 xCoord; // columns at offsets -2, -1, +1, +2 from the centre
+        xCoord.x = center.x - 2;
+        xCoord.y = center.x - 1;
+        xCoord.z = center.x + 1;
+        xCoord.w = center.x + 2;
+
+        int4 yCoord; // rows at offsets -2, -1, +1, +2 from the centre
+        yCoord.x = center.y - 2;
+        yCoord.y = center.y - 1;
+        yCoord.z = center.y + 1;
+        yCoord.w = center.y + 2;
+
+        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+
+        float4 Dvec;
+        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
+        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
+        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
+        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+
+        float4 value;
+        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
+        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
+        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
+        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+
+        // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
+        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
+        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
+        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
+        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+
+        float4 PATTERN;
+        PATTERN.x = kCx * C;
+        PATTERN.y = kCy * C;
+        PATTERN.z = kCz * C;
+        PATTERN.w = PATTERN.z;
+
+        float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w; // sum of the four diagonal neighbours
+
+        // There are five filter patterns (identity, cross, checker,
+        // theta, phi). Precompute the terms from all of them and then
+        // use swizzles to assign to color channels.
+        //
+        // Channel Matches
+        // x cross (e.g., EE G)
+        // y checker (e.g., EE B)
+        // z theta (e.g., EO R)
+        // w phi (e.g., EO B)
+
+        const float A = value.x;  // A0 + A1 (block-scoped local instead of a macro, so nothing leaks past this kernel)
+        const float B = value.y;  // B0 + B1
+        const float E = value.z;  // E0 + E1
+        const float F = value.w;  // F0 + F1
+
+        float3 temp;
+
+        // PATTERN.yzw += (kD.yz * D).xyy;
+        temp.x = kDy * D;
+        temp.y = kDz * D;
+        PATTERN.y += temp.x;
+        PATTERN.z += temp.y;
+        PATTERN.w += temp.y;
+
+        // PATTERN += (kA.xyz * A).xyzx;
+        temp.x = kAx * A;
+        temp.y = kAy * A;
+        temp.z = kAz * A;
+        PATTERN.x += temp.x;
+        PATTERN.y += temp.y;
+        PATTERN.z += temp.z;
+        PATTERN.w += temp.x;
+
+        // PATTERN += (kE.xyw * E).xyxz;
+        temp.x = kEx * E;
+        temp.y = kEy * E;
+        temp.z = kEw * E;
+        PATTERN.x += temp.x;
+        PATTERN.y += temp.y;
+        PATTERN.z += temp.x;
+        PATTERN.w += temp.z;
+
+        // PATTERN.xw += kB.xw * B;
+        PATTERN.x += kBx * B;
+        PATTERN.w += kBw * B;
+
+        // PATTERN.xz += kF.xz * F;
+        PATTERN.x += kFx * F;
+        PATTERN.z += kFz * F;
+
+        // Determine which of four types of pixels we are on.
+        int2 alternate; // parity of the destination coordinates shifted by firstRed
+        alternate.x = (x + firstRed.x) % 2;
+        alternate.y = (y + firstRed.y) % 2;
+
+        // in BGR sequence;
+        uchar3 pixelColor =
+            (alternate.y == 0) ?
+                ((alternate.x == 0) ?
+                    make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
+                    make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
+                ((alternate.x == 0) ?
+                    make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
+                    make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
+
+        dst(y, x) = toDst<DstType>(pixelColor); // convert the 3-channel result to the requested destination type
+    }
+
+    template <int cn>
+    void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
+    {
+        // Host launcher: binds src to sourceTex and runs the kernel overload of the same name.
+        typedef typename TypeVec<uchar, cn>::vec_type pixel_t;
+        bindTexture(&sourceTex, src);
+        // The launch grid is derived from the source extent; the kernel bounds-checks against dst.
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(src.cols, threads.x), divUp(src.rows, threads.y));
+        MHCdemosaic<pixel_t><<<blocks, threads, 0, stream>>>((PtrStepSz<pixel_t>)dst, sourceOffset, firstRed);
+        cudaSafeCall( cudaGetLastError() );
+
+        // Synchronize only on the default stream.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+}}}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/gftt.cu b/modules/gpuimgproc/src/cuda/gftt.cu
new file mode 100644
index 0000000000..b4af9e5dbc
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/gftt.cu
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/utility.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace gfft
+    {
+        texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+        __device__ int g_counter = 0;
+
+        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols) // one thread per pixel; appends local maxima of eigTex above threshold to corners
+        {
+            const int j = blockIdx.x * blockDim.x + threadIdx.x; // column
+            const int i = blockIdx.y * blockDim.y + threadIdx.y; // row
+
+            if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j)) // skip the one-pixel border and pixels for which mask(i, j) is false
+            {
+                float val = tex2D(eigTex, j, i);
+
+                if (val > threshold)
+                {
+                    float maxVal = val;
+
+                    maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal); // row above
+                    maxVal = ::fmax(tex2D(eigTex, j    , i - 1), maxVal);
+                    maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
+
+                    maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal); // same row, left and right
+                    maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
+
+                    maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal); // row below
+                    maxVal = ::fmax(tex2D(eigTex, j    , i + 1), maxVal);
+                    maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
+
+                    if (val == maxVal) // none of the 8 neighbours exceeds val (ties are accepted)
+                    {
+                        const int ind = ::atomicAdd(&g_counter, 1); // g_counter counts every candidate, including those beyond max_count
+
+                        if (ind < max_count)
+                            corners[ind] = make_float2(j, i); // stored as (column, row); output order depends on atomicAdd scheduling
+                    }
+                }
+            }
+        }
+
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
+        {
+            // Locate the device-side candidate counter and zero it before the launch.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // The kernel samples the response map through eigTex.
+            bindTexture(&eigTex, eig);
+
+            const dim3 threads(16, 16);
+            const dim3 blocks(divUp(eig.cols, threads.x), divUp(eig.rows, threads.y));
+
+            // A null mask pointer selects the unmasked kernel instantiation.
+            if (!mask.data)
+                findCorners<<<blocks, threads>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
+            else
+                findCorners<<<blocks, threads>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // The counter may exceed max_count, so the returned count is capped.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+            return std::min(found, max_count);
+        }
+
+        class EigGreater // comparator ordering points by descending eigTex value
+        {
+        public:
+            __device__ __forceinline__ bool operator()(float2 a, float2 b) const // a and b are (x, y) texture coordinates
+            {
+                return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y); // requires eigTex to be bound before the comparator runs
+            }
+        };
+
+
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
+        {
+            // EigGreater samples eigTex, so the response map is bound before sorting.
+            bindTexture(&eigTex, eig);
+            const thrust::device_ptr<float2> first(corners);
+            const thrust::device_ptr<float2> last = first + count;
+            thrust::sort(first, last, EigGreater());
+        }
+    } // namespace gfft
+}}}
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/hist.cu b/modules/gpuimgproc/src/cuda/hist.cu
new file mode 100644
index 0000000000..474c27cf76
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/hist.cu
@@ -0,0 +1,153 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/emulation.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace hist
+{
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist) // adds the 256-bin histogram of src into hist
+    {
+        __shared__ int shist[256]; // per-block partial histogram
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y; // image row handled by this thread
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread index; also the bin this thread clears and flushes
+
+        shist[tid] = 0; // relies on a 256-thread block (histogram256 launches 32 x 8)
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * step); // row start viewed as 32-bit words, four pixels each
+
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) // threads of one row stride over its words
+            {
+                unsigned int data = rowPtr[x];
+
+                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1); // one increment per byte of the word
+                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+
+            if (cols % 4 != 0 && threadIdx.x == 0) // the 1..3 leftover pixels of the row are handled by a single thread
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    unsigned int data = ((const uchar*)rowPtr)[x];
+                    Emulation::smem::atomicAdd(&shist[data], 1);
+                }
+            }
+        }
+
+        __syncthreads(); // all shared-memory increments must finish before the flush below
+
+        const int histVal = shist[tid];
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal); // hist is only accumulated into here, never cleared
+    }
+
+    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    {
+        // 32 x 8 = 256 threads per block; each threadIdx.y slot takes one image row.
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(src.rows, threads.y));
+        histogram256Kernel<<<blocks, threads, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        cudaSafeCall( cudaGetLastError() );
+        // Block the host only when running on the default stream.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+namespace hist
+{
+    __constant__ int c_lut[256];
+
+    struct EqualizeHist : unary_function<uchar, uchar> // per-pixel functor: looks val up in c_lut and rescales the entry
+    {
+        float scale; // multiplier applied to the table entry
+
+        __host__ EqualizeHist(float _scale) : scale(_scale) {}
+
+        __device__ __forceinline__ uchar operator ()(uchar val) const
+        {
+            const int lut = c_lut[val]; // table lives in __constant__ memory and must be uploaded before use
+            return __float2int_rn(scale * lut); // round to nearest, then narrowed to uchar without saturation
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist> // overrides the default transform traits for hist::EqualizeHist
+    {
+        enum { smart_shift = 4 }; // NOTE(review): presumably the number of elements each thread handles in cudev::transform - confirm in transform.hpp
+    };
+}}}
+
+namespace hist
+{
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+    {
+        // Upload the lookup table into constant memory; lut is copied device-to-device.
+        const size_t lutBytes = 256 * sizeof(int);
+        if (stream != 0)
+            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice, stream) );
+        else
+            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice) );
+        const int pixelCount = src.cols * src.rows;
+        cudev::transform(src, dst, EqualizeHist(255.0f / pixelCount), WithOutMask(), stream);
+    }
+}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/hough.cu b/modules/gpuimgproc/src/cuda/hough.cu
new file mode 100644
index 0000000000..5a4481b6e5
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/hough.cu
@@ -0,0 +1,1709 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/emulation.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/limits.hpp"
+#include "opencv2/core/cuda/dynamic_smem.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        __device__ int g_counter; // device-side output counter shared by the kernels in this namespace; each host wrapper zeroes it before its launch and reads it back afterwards
+
+        ////////////////////////////////////////////////////////////////////////
+        // buildPointList
+
+        // Appends the coordinates of every non-zero pixel of src to list, packed as (y << 16) | x; g_counter ends up holding the total.
+        template <int PIXELS_PER_THREAD>
+        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
+        {
+            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; // one queue per threadIdx.y; sized for the 32x4 block used by buildPointList_gpu
+            __shared__ int s_qsize[4];     // number of entries in each queue
+            __shared__ int s_globStart[4]; // start of each queue inside list
+            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (threadIdx.x == 0)
+                s_qsize[threadIdx.y] = 0;
+            __syncthreads();
+
+            if (y < src.rows)
+            {
+                // fill this row's queue: each thread checks up to PIXELS_PER_THREAD pixels, blockDim.x apart
+                const uchar* srcRow = src.ptr(y);
+                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
+                {
+                    if (srcRow[xx])
+                    {
+                        const unsigned int val = (y << 16) | xx; // 16 bits per coordinate
+                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
+                        s_queues[threadIdx.y][qidx] = val;
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            // let one thread reserve the space required in the global list
+            if (threadIdx.x == 0 && threadIdx.y == 0)
+            {
+                // exclusive prefix sum of the queue sizes: local start offset of each queue
+                int totalSize = 0;
+                for (int i = 0; i < blockDim.y; ++i)
+                {
+                    s_globStart[i] = totalSize;
+                    totalSize += s_qsize[i];
+                }
+
+                // reserve totalSize slots in the global list and turn local offsets into global ones
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
+                for (int i = 0; i < blockDim.y; ++i)
+                    s_globStart[i] += globalOffset;
+            }
+
+            __syncthreads();
+
+            // copy local queues to global queue, the threads of a row striding over that row's queue
+            const int qsize = s_qsize[threadIdx.y];
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+                list[gidx] = s_queues[threadIdx.y][i];
+        }
+
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
+        {
+            // Launch geometry: 32x4 threads per block, each thread visiting up to 16 pixels of its row.
+            const int pixelsPerThread = 16;
+            const dim3 threads(32, 4);
+            const dim3 blocks(divUp(src.cols, threads.x * pixelsPerThread), divUp(src.rows, threads.y));
+
+            // Zero the device counter the kernel uses to reserve slots in list.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<pixelsPerThread>, cudaFuncCachePreferShared) );
+
+            buildPointList<pixelsPerThread><<<blocks, threads>>>(src, list);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Read back how many points the kernel appended.
+            int pointCount;
+            cudaSafeCall( cudaMemcpy(&pointCount, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return pointCount;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // linesAccum
+
+        // Hough line voting directly in global memory: block n handles angle n * theta and adds one vote per point to row n + 1 of accum.
+        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
+        {
+            const int n = blockIdx.x;
+            const float ang = n * theta;
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho; // fold the distance scale into the trig terms (the launcher passes irho = 1 / rho)
+            cosVal *= irho;
+
+            const int shift = (numrho - 1) / 2; // offset that moves signed distances to the middle of the row
+
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < count; i += blockDim.x)
+            {
+                const unsigned int val = list[i];
+
+                const int x = (val & 0xFFFF); // points are packed as (y << 16) | x
+                const int y = (val >> 16) & 0xFFFF;
+
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
+
+                ::atomicAdd(accumRow + r + 1, 1); // + 1 skips the first column of the row
+            }
+        }
+
+        // Same voting as linesAccumGlobal, but block n first accumulates its row in dynamic shared memory and then stores it to row n + 1 of accum.
+        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
+        {
+            int* smem = DynamicSharedMem<int>(); // the code below uses numrho + 1 ints of it
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                smem[i] = 0;
+
+            __syncthreads();
+
+            const int n = blockIdx.x;
+            const float ang = n * theta;
+
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho; // fold the distance scale into the trig terms (the launcher passes irho = 1 / rho)
+            cosVal *= irho;
+
+            const int shift = (numrho - 1) / 2; // offset that moves signed distances to the middle of the row
+
+            for (int i = threadIdx.x; i < count; i += blockDim.x)
+            {
+                const unsigned int val = list[i];
+
+                const int x = (val & 0xFFFF); // points are packed as (y << 16) | x
+                const int y = (val >> 16) & 0xFFFF;
+
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
+
+                Emulation::smem::atomicAdd(&smem[r + 1], 1);
+            }
+
+            __syncthreads();
+
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i]; // plain store: overwrites columns 0..numrho of the row
+        }
+
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
+        {
+            // One block per accumulator row except the first and last; 1024 threads when has20 is set, else 512.
+            const dim3 threads(has20 ? 1024 : 512);
+            const dim3 blocks(accum.rows - 2);
+            const int numrho = accum.cols - 2;
+            const float irho = 1.0f / rho;
+            const size_t rowBytes = (accum.cols - 1) * sizeof(int);
+            // Vote in shared memory when a row fits with 1000 bytes to spare, otherwise vote in global memory.
+            if (rowBytes < sharedMemPerBlock - 1000)
+                linesAccumShared<<<blocks, threads, rowBytes>>>(list, count, accum, irho, theta, numrho);
+            else
+                linesAccumGlobal<<<blocks, threads>>>(list, count, accum, irho, theta, numrho);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // linesGetResult
+
+        // One thread per interior accumulator cell: emits (radius, angle) plus the vote count for cells above threshold that are local maxima of their 4-neighbourhood.
+        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
+        {
+            const int r = blockIdx.x * blockDim.x + threadIdx.x;
+            const int n = blockIdx.y * blockDim.y + threadIdx.y;
+            if (r >= accum.cols - 2 || n >= accum.rows - 2)
+                return;
+
+            const int curVotes = accum(n + 1, r + 1); // the cell sits at offset (+1, +1) in accum
+
+            if (curVotes > threshold &&
+                curVotes >  accum(n + 1, r) &&      // strictly greater than the left neighbour ...
+                curVotes >= accum(n + 1, r + 2) &&  // ... but ties with the right one are accepted
+                curVotes >  accum(n, r + 1) &&      // strictly greater than the row above ...
+                curVotes >= accum(n + 2, r + 1))    // ... ties with the row below are accepted
+            {
+                const float radius = (r - (numrho - 1) * 0.5f) * rho;
+                const float angle = n * theta;
+
+                const int ind = ::atomicAdd(&g_counter, 1); // counts every peak, including those beyond maxSize
+                if (ind < maxSize)
+                {
+                    out[ind] = make_float2(radius, angle);
+                    votes[ind] = curVotes;
+                }
+            }
+        }
+
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
+        {
+            // Zero the device counter that hands out output slots.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // Tile the accumulator, minus two rows and two columns, with 32x8 blocks.
+            const int numrho = accum.cols - 2;
+            const int numangle = accum.rows - 2;
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(numrho, threads.x), divUp(numangle, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
+
+            linesGetResult<<<blocks, threads>>>(accum, out, votes, maxSize, rho, theta, threshold, numrho);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // The kernel counts every peak but stores at most maxSize of them, so clamp the count.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+            const int stored = ::min(found, maxSize);
+
+            if (!doSort || stored <= 0)
+                return stored;
+            // Reorder the stored lines by descending vote count.
+            thrust::device_ptr<int> votesBegin(votes);
+            thrust::sort_by_key(votesBegin, votesBegin + stored, thrust::device_ptr<float2>(out), thrust::greater<int>());
+            return stored;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // houghLinesProbabilistic
+
+        texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp); // mask sampled by houghLinesProbabilistic (point filtering, clamp addressing); bound in houghLinesProbabilistic_gpu
+
+        // One thread per interior accumulator cell. For every cell with at least lineLength votes and strictly more votes than its
+        // eight neighbours, walks the matching line across the mask (tex_mask) and stores each long-enough segment in out.
+        __global__ void houghLinesProbabilistic(const PtrStepSzi accum,
+                                                int4* out, const int maxSize,
+                                                const float rho, const float theta,
+                                                const int lineGap, const int lineLength,
+                                                const int rows, const int cols)
+        {
+            const int r = blockIdx.x * blockDim.x + threadIdx.x;
+            const int n = blockIdx.y * blockDim.y + threadIdx.y;
+            if (r >= accum.cols - 2 || n >= accum.rows - 2)
+                return;
+
+            const int curVotes = accum(n + 1, r + 1); // the cell sits at offset (+1, +1) in accum
+
+            if (curVotes >= lineLength &&
+                curVotes > accum(n, r) &&
+                curVotes > accum(n, r + 1) &&
+                curVotes > accum(n, r + 2) &&
+                curVotes > accum(n + 1, r) &&
+                curVotes > accum(n + 1, r + 2) &&
+                curVotes > accum(n + 2, r) &&
+                curVotes > accum(n + 2, r + 1) &&
+                curVotes > accum(n + 2, r + 2))
+            {
+                const float radius = (r - (accum.cols - 2 - 1) * 0.5f) * rho;
+                const float angle = n * theta;
+                float cosa;
+                float sina;
+                sincosf(angle, &sina, &cosa);
+
+                float2 p0 = make_float2(cosa * radius, sina * radius); // point of the line closest to the origin
+                float2 dir = make_float2(-sina, cosa);                 // unit vector along the line
+                // Intersections of the line with x = 0, x = cols - 1, y = 0, y = rows - 1; (-1, -1) means "not computed".
+                float2 pb[4] = {make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1)};
+                float a;
+
+                if (dir.x != 0)
+                {
+                    a = -p0.x / dir.x;
+                    pb[0].x = 0;
+                    pb[0].y = p0.y + a * dir.y;
+
+                    a = (cols - 1 - p0.x) / dir.x;
+                    pb[1].x = cols - 1;
+                    pb[1].y = p0.y + a * dir.y;
+                }
+                if (dir.y != 0)
+                {
+                    a = -p0.y / dir.y;
+                    pb[2].x = p0.x + a * dir.x;
+                    pb[2].y = 0;
+
+                    a = (rows - 1 - p0.y) / dir.y;
+                    pb[3].x = p0.x + a * dir.x;
+                    pb[3].y = rows - 1;
+                }
+                // Restart from the first intersection that lies on the image border and orient dir so that stepping moves into the image.
+                if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < rows))
+                {
+                    p0 = pb[0];
+                    if (dir.x < 0)
+                        dir = -dir;
+                }
+                else if (pb[1].x == cols - 1 && (pb[1].y >= 0 && pb[1].y < rows)) // must test pb[1].y: this branch starts from the right-border point
+                {
+                    p0 = pb[1];
+                    if (dir.x > 0)
+                        dir = -dir;
+                }
+                else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < cols))
+                {
+                    p0 = pb[2];
+                    if (dir.y < 0)
+                        dir = -dir;
+                }
+                else if (pb[3].y == rows - 1 && (pb[3].x >= 0 && pb[3].x < cols))
+                {
+                    p0 = pb[3];
+                    if (dir.y > 0)
+                        dir = -dir;
+                }
+                // Step vector: exactly one pixel along the dominant axis, proportionally less along the other.
+                float2 d;
+                if (::fabsf(dir.x) > ::fabsf(dir.y))
+                {
+                    d.x = dir.x > 0 ? 1 : -1;
+                    d.y = dir.y / ::fabsf(dir.x);
+                }
+                else
+                {
+                    d.x = dir.x / ::fabsf(dir.y);
+                    d.y = dir.y > 0 ? 1 : -1;
+                }
+
+                float2 line_end[2]; // first and last mask hit of the current run
+                int gap;            // consecutive misses since the last hit; only read while inLine is true
+                bool inLine = false;
+
+                float2 p1 = p0;
+                if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
+                    return; // no border intersection was selected above
+                // Walk the line step by step; a run of hits ends after more than lineGap consecutive misses or at the image border.
+                for (;;)
+                {
+                    if (tex2D(tex_mask, p1.x, p1.y))
+                    {
+                        gap = 0;
+
+                        if (!inLine)
+                        {
+                            line_end[0] = p1;
+                            line_end[1] = p1;
+                            inLine = true;
+                        }
+                        else
+                        {
+                            line_end[1] = p1;
+                        }
+                    }
+                    else if (inLine)
+                    {
+                        if (++gap > lineGap)
+                        {
+                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength || // length is measured per axis
+                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
+
+                            if (good_line)
+                            {
+                                const int ind = ::atomicAdd(&g_counter, 1); // counts every segment, including those beyond maxSize
+                                if (ind < maxSize)
+                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
+                            }
+
+                            gap = 0;
+                            inLine = false;
+                        }
+                    }
+
+                    p1 = p1 + d;
+                    if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
+                    {
+                        if (inLine) // flush the run that was still open when the walk left the image
+                        {
+                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
+                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
+
+                            if (good_line)
+                            {
+                                const int ind = ::atomicAdd(&g_counter, 1);
+                                if (ind < maxSize)
+                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
+                            }
+
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Host wrapper for the houghLinesProbabilistic kernel: binds the mask texture, launches the kernel, waits for it,
+        // and returns the number of segments stored in out (never more than maxSize).
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
+        {
+            // Zero the device counter that indexes out.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // Tile the accumulator, minus two rows and two columns, with 32x8 blocks.
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(accum.cols - 2, threads.x), divUp(accum.rows - 2, threads.y));
+
+            // The kernel samples the mask through tex_mask.
+            bindTexture(&tex_mask, mask);
+            houghLinesProbabilistic<<<blocks, threads>>>(
+                accum, out, maxSize,
+                rho, theta, lineGap, lineLength,
+                mask.rows, mask.cols);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Segments beyond maxSize were counted but not stored, so clamp the count.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return ::min(found, maxSize);
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // circlesAccumCenters
+
+        // One thread per edge point: votes for candidate circle centres along the gradient (dx, dy), in both directions, at distances minRadius..maxRadius.
+        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
+                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
+        {
+            const int SHIFT = 10;       // fixed-point arithmetic with 10 fractional bits
+            const int ONE = 1 << SHIFT;
+            const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (tid >= count)
+                return;
+
+            const unsigned int val = list[tid];
+
+            const int x = (val & 0xFFFF); // points are packed as (y << 16) | x
+            const int y = (val >> 16) & 0xFFFF;
+
+            const int vx = dx(y, x);
+            const int vy = dy(y, x);
+
+            if (vx == 0 && vy == 0)
+                return; // no gradient direction to follow
+
+            const float mag = ::sqrtf(vx * vx + vy * vy);
+
+            const int x0 = __float2int_rn((x * idp) * ONE); // point position scaled by idp, in fixed point
+            const int y0 = __float2int_rn((y * idp) * ONE);
+
+            int sx = __float2int_rn((vx * idp) * ONE / mag); // step per unit radius along the normalised gradient
+            int sy = __float2int_rn((vy * idp) * ONE / mag);
+
+            // Step from minRadius to maxRadius in both directions of the gradient
+            for (int k1 = 0; k1 < 2; ++k1)
+            {
+                int x1 = x0 + minRadius * sx;
+                int y1 = y0 + minRadius * sy;
+
+                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
+                {
+                    const int x2 = x1 >> SHIFT; // back to integer accumulator coordinates
+                    const int y2 = y1 >> SHIFT;
+
+                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
+                        break;
+
+                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1); // votes land at offset (+1, +1) in accum
+                }
+
+                sx = -sx; // second pass walks the opposite direction
+                sy = -sy;
+            }
+        }
+
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
+        {
+            // One thread per list entry; the kernel receives the accumulator size reduced by two in each dimension.
+            const int width = accum.cols - 2;
+            const int height = accum.rows - 2;
+            const dim3 threads(256);
+            const dim3 blocks(divUp(count, threads.x));
+            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
+            circlesAccumCenters<<<blocks, threads>>>(list, count, dx, dy, accum, width, height, minRadius, maxRadius, idp);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // buildCentersList
+
+        // One thread per interior accumulator cell: appends (y << 16) | x to centers for cells above threshold that are local maxima of their 4-neighbourhood.
+        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x < accum.cols - 2 && y < accum.rows - 2)
+            {
+                const int top = accum(y, x + 1);
+
+                const int left = accum(y + 1, x);
+                const int cur = accum(y + 1, x + 1); // the cell sits at offset (+1, +1) in accum
+                const int right = accum(y + 1, x + 2);
+
+                const int bottom = accum(y + 2, x + 1);
+
+                if (cur > threshold && cur > top && cur >= bottom && cur >  left && cur >= right) // ties are accepted only towards bottom and right
+                {
+                    const unsigned int val = (y << 16) | x;
+                    const int idx = ::atomicAdd(&g_counter, 1);
+                    centers[idx] = val; // NOTE(review): idx is not checked against the capacity of centers - presumably the caller sizes it for the worst case; confirm
+                }
+            }
+        }
+
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
+        {
+            // Zero the device counter that indexes centers.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // Tile the accumulator, minus two rows and two columns, with 32x8 blocks.
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(accum.cols - 2, threads.x), divUp(accum.rows - 2, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
+            buildCentersList<<<blocks, threads>>>(accum, centers, threshold);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Read back how many centers the kernel wrote.
+            int centersFound;
+            cudaSafeCall( cudaMemcpy(&centersFound, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return centersFound;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // circlesAccumRadius
+
+        // One block per candidate centre: histograms the distances from that centre to all points, then emits a circle for every histogram peak.
+        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
+                                           float3* circles, const int maxCircles, const float dp,
+                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
+        {
+            int* smem = DynamicSharedMem<int>(); // radius histogram: histSize bins plus one extra bin at each end
+            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
+                smem[i] = 0;
+            __syncthreads();
+
+            unsigned int val = centers[blockIdx.x];
+
+            float cx = (val & 0xFFFF); // centres are packed as (y << 16) | x
+            float cy = (val >> 16) & 0xFFFF;
+
+            cx = (cx + 0.5f) * dp; // middle of the accumulator cell, scaled by dp
+            cy = (cy + 0.5f) * dp;
+
+            for (int i = threadIdx.x; i < count; i += blockDim.x)
+            {
+                val = list[i];
+
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;
+
+                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
+                if (rad >= minRadius && rad <= maxRadius)
+                {
+                    const int r = __float2int_rn(rad - minRadius);
+
+                    Emulation::smem::atomicAdd(&smem[r + 1], 1); // bin r is stored at index r + 1
+                }
+            }
+
+            __syncthreads();
+
+            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
+            {
+                const int curVotes = smem[i + 1];
+
+                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2]) // peak: strictly above the previous bin, ties allowed with the next
+                {
+                    const int ind = ::atomicAdd(&g_counter, 1); // counts every circle, including those beyond maxCircles
+                    if (ind < maxCircles)
+                        circles[ind] = make_float3(cx, cy, i + minRadius);
+                }
+            }
+        }
+
+        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
+                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
+        {
+            // Zero the device counter that indexes circles.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // One block per candidate centre; 1024 threads when has20 is set, else 512.
+            const dim3 threads(has20 ? 1024 : 512);
+            const dim3 blocks(centersCount);
+
+            // Shared-memory histogram: one bin per integer radius plus one extra bin at each end.
+            const int histSize = maxRadius - minRadius + 1;
+            const size_t histBytes = (histSize + 2) * sizeof(int);
+
+            circlesAccumRadius<<<blocks, threads, histBytes>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Circles beyond maxCircles were counted but not stored, so clamp the count.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return ::min(found, maxCircles);
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Generalized Hough
+
+        // For every edge pixel with a non-zero gradient, appends its packed coordinate to coordList and its gradient angle in [0, 2*pi) to thetaList.
+        template <typename T, int PIXELS_PER_THREAD>
+        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
+        {
+            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD]; // one queue per threadIdx.y; sized for the 32x4 block used by buildEdgePointList_gpu
+            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];        // angles, parallel to s_coordLists
+            __shared__ int s_sizes[4];     // number of entries in each queue
+            __shared__ int s_globStart[4]; // start of each queue inside the global lists
+            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (threadIdx.x == 0)
+                s_sizes[threadIdx.y] = 0;
+            __syncthreads();
+
+            if (y < edges.rows)
+            {
+                // fill this row's queue: each thread checks up to PIXELS_PER_THREAD pixels, blockDim.x apart
+                const uchar* edgesRow = edges.ptr(y);
+                const T* dxRow = dx.ptr(y);
+                const T* dyRow = dy.ptr(y);
+
+                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
+                {
+                    const T dxVal = dxRow[xx];
+                    const T dyVal = dyRow[xx];
+
+                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
+                    {
+                        const unsigned int coord = (y << 16) | xx; // 16 bits per coordinate
+
+                        float theta = ::atan2f(dyVal, dxVal);
+                        if (theta < 0)
+                            theta += 2.0f * CV_PI_F; // move negative angles into [0, 2*pi)
+
+                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);
+
+                        s_coordLists[threadIdx.y][qidx] = coord;
+                        s_thetaLists[threadIdx.y][qidx] = theta;
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            // let one thread reserve the space required in the global list
+            if (threadIdx.x == 0 && threadIdx.y == 0)
+            {
+                // exclusive prefix sum of the queue sizes: local start offset of each queue
+                int totalSize = 0;
+                for (int i = 0; i < blockDim.y; ++i)
+                {
+                    s_globStart[i] = totalSize;
+                    totalSize += s_sizes[i];
+                }
+
+                // reserve totalSize slots in the global lists and turn local offsets into global ones
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
+                for (int i = 0; i < blockDim.y; ++i)
+                    s_globStart[i] += globalOffset;
+            }
+
+            __syncthreads();
+
+            // copy local queues to global queue, the threads of a row striding over that row's queue
+            const int qsize = s_sizes[threadIdx.y];
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+            {
+                coordList[gidx] = s_coordLists[threadIdx.y][i];
+                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
+            }
+        }
+
+        template <typename T>
+        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
+        {
+            // Launch geometry: 32x4 threads per block, each thread visiting up to 8 pixels of its row.
+            const int pixelsPerThread = 8;
+            const dim3 threads(32, 4);
+            const dim3 blocks(divUp(edges.cols, threads.x * pixelsPerThread), divUp(edges.rows, threads.y));
+
+            // Zero the device counter the kernel uses to reserve output slots.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, pixelsPerThread>, cudaFuncCachePreferShared) );
+
+            // dx and dy arrive as byte views and are cast to PtrStepSz<T> for the kernel.
+            buildEdgePointList<T, pixelsPerThread><<<blocks, threads>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Read back how many edge points were written.
+            int pointCount;
+            cudaSafeCall( cudaMemcpy(&pointCount, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+            return pointCount;
+        }
+
+        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList); // explicit instantiations for the element types short, int and float
+        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
+        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
+
+        // One thread per edge point: bins the point by its angle and appends its offset from templCenter to the matching row of r_table.
+        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
+                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
+                                    const short2 templCenter, const float thetaScale)
+        {
+            const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+            if (tid >= pointsCount)
+                return;
+
+            const unsigned int coord = coordList[tid];
+            short2 p;
+            p.x = (coord & 0xFFFF); // coordinates are packed as (y << 16) | x
+            p.y = (coord >> 16) & 0xFFFF;
+
+            const float theta = thetaList[tid];
+            const int n = __float2int_rn(theta * thetaScale); // row index derived from the angle
+
+            const int ind = ::atomicAdd(r_sizes + n, 1); // r_sizes[n] counts every point of the row, including those beyond maxSize
+            if (ind < maxSize)
+                r_table(n, ind) = p - templCenter;
+        }
+
+        // Host wrapper: launches buildRTable with one thread per list entry and waits for completion.
+        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                             PtrStepSz<short2> r_table, int* r_sizes,
+                             short2 templCenter, int levels)
+        {
+            const float thetaScale = levels / (2.0f * CV_PI_F);
+
+            const dim3 threads(256);
+            const dim3 blocks(divUp(pointsCount, threads.x));
+
+            buildRTable<<<blocks, threads>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // GHT_Ballard_Pos
+
+        // Position-only Ballard voting: each thread takes one list entry, looks up the R-table row
+        // for its quantized theta and casts one vote per offset stored in that row.
+        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
+                                                 const PtrStep<short2> r_table, const int* r_sizes,
+                                                 PtrStepSzi hist,
+                                                 const float idp, const float thetaScale)
+        {
+            const int pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
+            if (pointIdx >= pointsCount)
+                return;
+
+            // The coordinate word packs x into its low 16 bits and y into its high 16 bits.
+            const unsigned int packed = coordList[pointIdx];
+            short2 pt;
+            pt.x = (packed & 0xFFFF);
+            pt.y = (packed >> 16) & 0xFFFF;
+
+            const int row = __float2int_rn(thetaList[pointIdx] * thetaScale);
+            const short2* entries = r_table.ptr(row);
+            const int entryCount = r_sizes[row];
+
+            for (int k = 0; k < entryCount; ++k)
+            {
+                short2 centre = pt - entries[k];
+                centre.x = __float2int_rn(centre.x * idp);
+                centre.y = __float2int_rn(centre.y * idp);
+                // Votes land at (+1, +1) inside a (cols - 2) x (rows - 2) interior; the outermost cells stay untouched.
+                const bool inside = centre.x >= 0 && centre.x < hist.cols - 2 && centre.y >= 0 && centre.y < hist.rows - 2;
+                if (inside)
+                    ::atomicAdd(hist.ptr(centre.y + 1) + centre.x + 1, 1);
+            }
+        }
+
+        // Host wrapper: runs GHT_Ballard_Pos_calcHist over all list entries (dp is passed as 1 / dp) and waits.
+        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                          PtrStepSz<short2> r_table, const int* r_sizes,
+                                          PtrStepSzi hist,
+                                          float dp, int levels)
+        {
+            const float invDp = 1.0f / dp;
+            const float angleToRow = levels / (2.0f * CV_PI_F);
+
+            const dim3 threads(256);
+            const dim3 blocks(divUp(pointsCount, threads.x));
+
+            GHT_Ballard_Pos_calcHist<<<blocks, threads>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, invDp, angleToRow);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Scans the accumulator interior for cells above threshold that dominate their four neighbours and appends them to out/votes.
+        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (col >= hist.cols - 2 || row >= hist.rows - 2)
+                return;
+
+            // Strict against the left/upper neighbour, non-strict against the right/lower one.
+            const int centre = hist(row + 1, col + 1);
+            const bool isPeak = centre > threshold &&
+                                centre >  hist(row + 1, col) &&
+                                centre >= hist(row + 1, col + 2) &&
+                                centre >  hist(row, col + 1) &&
+                                centre >= hist(row + 2, col + 1);
+            if (!isPeak)
+                return;
+            // g_counter counts every peak; only the first maxSize of them are stored.
+            const int slot = ::atomicAdd(&g_counter, 1);
+            if (slot < maxSize)
+            {
+                out[slot] = make_float4(col * dp, row * dp, 1.0f, 0.0f);
+                votes[slot] = make_int3(centre, 0, 0);
+            }
+        }
+
+        // Host wrapper: zeroes g_counter, runs the peak search over the accumulator interior and
+        // returns the number of stored detections (never more than maxSize).
+        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
+        {
+            // Reset the device-side detection counter before the launch.
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(hist.cols - 2, threads.x), divUp(hist.rows - 2, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );
+
+            GHT_Ballard_Pos_findPosInHist<<<blocks, threads>>>(hist, out, votes, maxSize, dp, threshold);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // The kernel counts every peak, including those it had no room to store.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return ::min(found, maxSize);
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // GHT_Ballard_PosScale
+
+        // Position + scale Ballard voting. Block blockIdx.x handles list entry blockIdx.x; its threads
+        // share out the scaleRange scale levels and vote into the stacked per-scale accumulator.
+        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
+                                                      PtrStep<short2> r_table, const int* r_sizes,
+                                                      PtrStepi hist, const int rows, const int cols,
+                                                      const float minScale, const float scaleStep, const int scaleRange,
+                                                      const float idp, const float thetaScale)
+        {
+            const unsigned int coord = coordList[blockIdx.x];
+            float2 p;
+            p.x = (coord & 0xFFFF); // low 16 bits of the packed coordinate
+            p.y = (coord >> 16) & 0xFFFF; // high 16 bits of the packed coordinate
+
+            const float theta = thetaList[blockIdx.x];
+            const int n = __float2int_rn(theta * thetaScale); // R-table row selected by the quantized theta
+            const short2* r_row = r_table.ptr(n);
+            const int r_row_size = r_sizes[n];
+
+            for (int j = 0; j < r_row_size; ++j)
+            {
+                const float2 d = saturate_cast<float2>(r_row[j]);
+                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
+                {
+                    const float scale = minScale + s * scaleStep;
+                    // Candidate centre: the point minus the R-table offset stretched by this scale.
+                    float2 c = p - scale * d;
+                    // Convert to accumulator cells.
+                    c.x *= idp;
+                    c.y *= idp;
+                    // Scale slice s starts at accumulator row (s + 1) * (rows + 2); votes land at (+1, +1) inside it.
+                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
+                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
+                }
+            }
+        }
+
+        // Host wrapper: launches GHT_Ballard_PosScale_calcHist with one 256-thread block per list
+        // entry (dp is passed as 1 / dp) and waits for it to finish.
+        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                               PtrStepSz<short2> r_table, const int* r_sizes,
+                                               PtrStepi hist, int rows, int cols,
+                                               float minScale, float scaleStep, int scaleRange,
+                                               float dp, int levels)
+        {
+            const float invDp = 1.0f / dp;
+            const float angleToRow = levels / (2.0f * CV_PI_F);
+
+            const dim3 threads(256);
+            const dim3 blocks(pointsCount);
+
+            GHT_Ballard_PosScale_calcHist<<<blocks, threads>>>(coordList, thetaList, r_table, r_sizes,
+                                                               hist, rows, cols,
+                                                               minScale, scaleStep, scaleRange,
+                                                               invDp, angleToRow);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Peak search over the stacked (scale, y, x) accumulator: each thread owns one (x, y) cell and walks
+        // all scale slices, reporting cells above threshold that dominate their six axis neighbours.
+        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
+                                                           float4* out, int3* votes, const int maxSize,
+                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (col >= cols || row >= rows)
+                return;
+
+            const int sliceRows = rows + 2;
+            for (int level = 0; level < scaleRange; ++level)
+            {
+                // Scale level `level` occupies slice level + 1; slices level and level + 2 are its neighbours.
+                const int below = level * sliceRows;
+                const int here = (level + 1) * sliceRows;
+                const int above = (level + 2) * sliceRows;
+
+                const int centre = hist(here + row + 1, col + 1);
+                const bool isPeak = centre > threshold &&
+                                    centre >  hist(here + row + 1, col) &&
+                                    centre >= hist(here + row + 1, col + 2) &&
+                                    centre >  hist(here + row, col + 1) &&
+                                    centre >= hist(here + row + 2, col + 1) &&
+                                    centre >  hist(below + row + 1, col + 1) &&
+                                    centre >= hist(above + row + 1, col + 1);
+                if (!isPeak)
+                    continue;
+
+                const int slot = ::atomicAdd(&g_counter, 1);
+                if (slot < maxSize)
+                {
+                    out[slot] = make_float4(col * dp, row * dp, minScale + level * scaleStep, 0.0f);
+                    votes[slot] = make_int3(centre, centre, 0);
+                }
+            }
+        }
+
+        // Host wrapper: clears g_counter, runs the position/scale peak search and returns how many
+        // detections were stored (at most maxSize).
+        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
+                                                   float minScale, float scaleStep, float dp, int threshold)
+        {
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );
+
+            GHT_Ballard_PosScale_findPosInHist<<<blocks, threads>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // The device counter may exceed maxSize because overflowing peaks are still counted.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return ::min(found, maxSize);
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // GHT_Ballard_PosRotation
+
+        // Position + rotation Ballard voting. Block blockIdx.x handles list entry blockIdx.x; its threads
+        // share out the angleRange candidate rotations and vote into the stacked per-angle accumulator.
+        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
+                                                         PtrStep<short2> r_table, const int* r_sizes,
+                                                         PtrStepi hist, const int rows, const int cols,
+                                                         const float minAngle, const float angleStep, const int angleRange,
+                                                         const float idp, const float thetaScale)
+        {
+            const unsigned int coord = coordList[blockIdx.x];
+            float2 p;
+            p.x = (coord & 0xFFFF); // low 16 bits of the packed coordinate
+            p.y = (coord >> 16) & 0xFFFF; // high 16 bits of the packed coordinate
+
+            const float thetaVal = thetaList[blockIdx.x];
+
+            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
+            {
+                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f); // candidate rotation, degrees converted to radians
+                float sinA, cosA;
+                sincosf(angle, &sinA, &cosA);
+                // Remove the candidate rotation from theta before the R-table lookup.
+                float theta = thetaVal - angle;
+                if (theta < 0)
+                    theta += 2.0f * CV_PI_F;
+                // NOTE(review): theta is wrapped only when negative; a negative candidate angle could push n past the last R-table row - confirm callers keep minAngle >= 0.
+                const int n = __float2int_rn(theta * thetaScale);
+
+                const short2* r_row = r_table.ptr(n);
+                const int r_row_size = r_sizes[n];
+
+                for (int j = 0; j < r_row_size; ++j)
+                {
+                    const float2 d = saturate_cast<float2>(r_row[j]);
+                    // Rotate the stored offset by the candidate angle.
+                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);
+                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
+                    c.x *= idp;
+                    c.y *= idp;
+                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
+                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
+                }
+            }
+        }
+
+        // Host wrapper: launches GHT_Ballard_PosRotation_calcHist with one 256-thread block per list
+        // entry (dp is passed as 1 / dp) and waits for it to finish.
+        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                  PtrStepSz<short2> r_table, const int* r_sizes,
+                                                  PtrStepi hist, int rows, int cols,
+                                                  float minAngle, float angleStep, int angleRange,
+                                                  float dp, int levels)
+        {
+            const float invDp = 1.0f / dp;
+            const float angleToRow = levels / (2.0f * CV_PI_F);
+
+            const dim3 threads(256);
+            const dim3 blocks(pointsCount);
+
+            GHT_Ballard_PosRotation_calcHist<<<blocks, threads>>>(coordList, thetaList, r_table, r_sizes,
+                                                                  hist, rows, cols,
+                                                                  minAngle, angleStep, angleRange,
+                                                                  invDp, angleToRow);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Peak search over the stacked (angle, y, x) accumulator: each thread owns one (x, y) cell and walks
+        // all angle slices, reporting cells above threshold that dominate their six axis neighbours.
+        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
+                                                              float4* out, int3* votes, const int maxSize,
+                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (col >= cols || row >= rows)
+                return;
+
+            const int sliceRows = rows + 2;
+            for (int step = 0; step < angleRange; ++step)
+            {
+                // Angle step `step` occupies slice step + 1; slices step and step + 2 are its neighbours.
+                const int prevSlice = step * sliceRows;
+                const int thisSlice = (step + 1) * sliceRows;
+                const int nextSlice = (step + 2) * sliceRows;
+
+                const int centre = hist(thisSlice + row + 1, col + 1);
+                const bool isPeak = centre > threshold &&
+                                    centre >  hist(thisSlice + row + 1, col) &&
+                                    centre >= hist(thisSlice + row + 1, col + 2) &&
+                                    centre >  hist(thisSlice + row, col + 1) &&
+                                    centre >= hist(thisSlice + row + 2, col + 1) &&
+                                    centre >  hist(prevSlice + row + 1, col + 1) &&
+                                    centre >= hist(nextSlice + row + 1, col + 1);
+                if (!isPeak)
+                    continue;
+
+                const int slot = ::atomicAdd(&g_counter, 1);
+                if (slot < maxSize)
+                {
+                    out[slot] = make_float4(col * dp, row * dp, 1.0f, minAngle + step * angleStep);
+                    votes[slot] = make_int3(centre, 0, centre);
+                }
+            }
+        }
+
+        // Host wrapper: clears g_counter, runs the position/rotation peak search and returns how many
+        // detections were stored (at most maxSize).
+        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
+                                                      float minAngle, float angleStep, float dp, int threshold)
+        {
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
+
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );
+
+            GHT_Ballard_PosRotation_findPosInHist<<<blocks, threads>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            // The device counter may exceed maxSize because overflowing peaks are still counted.
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            return ::min(found, maxSize);
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // GHT_Guil_Full
+
+        struct FeatureTable // (base pointer, row step) pairs for six feature buffers; filled by the set*Features functions
+        {
+            uchar* p1_pos_data; // rows are read as float2 by the accessor structs
+            size_t p1_pos_step; // byte offset between consecutive rows (same meaning for every *_step below)
+
+            uchar* p1_theta_data; // rows are read as float
+            size_t p1_theta_step;
+
+            uchar* p2_pos_data; // rows are read as float2
+            size_t p2_pos_step;
+
+            uchar* d12_data; // rows are read as float
+            size_t d12_step;
+
+            uchar* r1_data; // rows are read as float2
+            size_t r1_step;
+
+            uchar* r2_data; // rows are read as float2
+            size_t r2_step;
+        };
+
+        __constant__ FeatureTable c_templFeatures; // written by GHT_Guil_Full_setTemplFeatures, read through TemplFeatureTable
+        __constant__ FeatureTable c_imageFeatures; // written by GHT_Guil_Full_setImageFeatures, read through ImageFeatureTable
+
+        // Publishes the template feature buffers (base pointer and row step of each argument) to the
+        // c_templFeatures constant so device code can reach them through TemplFeatureTable.
+        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
+        {
+            FeatureTable table;
+
+            // Base pointers of the six feature buffers.
+            table.p1_pos_data   = p1_pos.data;
+            table.p1_theta_data = p1_theta.data;
+            table.p2_pos_data   = p2_pos.data;
+            table.d12_data      = d12.data;
+            table.r1_data       = r1.data;
+            table.r2_data       = r2.data;
+
+            // Matching row steps.
+            table.p1_pos_step   = p1_pos.step;
+            table.p1_theta_step = p1_theta.step;
+            table.p2_pos_step   = p2_pos.step;
+            table.d12_step      = d12.step;
+            table.r1_step       = r1.step;
+            table.r2_step       = r2.step;
+
+            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &table, sizeof(FeatureTable)) );
+        }
+        // Publishes the image feature buffers (base pointer and row step of each argument) to the
+        // c_imageFeatures constant so device code can reach them through ImageFeatureTable.
+        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
+        {
+            FeatureTable table;
+
+            // Base pointers of the six feature buffers.
+            table.p1_pos_data   = p1_pos.data;
+            table.p1_theta_data = p1_theta.data;
+            table.p2_pos_data   = p2_pos.data;
+            table.d12_data      = d12.data;
+            table.r1_data       = r1.data;
+            table.r2_data       = r2.data;
+
+            // Matching row steps.
+            table.p1_pos_step   = p1_pos.step;
+            table.p1_theta_step = p1_theta.step;
+            table.p2_pos_step   = p2_pos.step;
+            table.d12_step      = d12.step;
+            table.r1_step       = r1.step;
+            table.r2_step       = r2.step;
+
+            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &table, sizeof(FeatureTable)) );
+        }
+
+        // Device-side accessors for the template feature table held in c_templFeatures.
+        // Each accessor returns a typed pointer to the start of row n of the named buffer.
+        struct TemplFeatureTable
+        {
+            static __device__ float2* p1_pos(int n)
+            {
+                return reinterpret_cast<float2*>(&c_templFeatures.p1_pos_data[n * c_templFeatures.p1_pos_step]);
+            }
+            static __device__ float* p1_theta(int n)
+            {
+                return reinterpret_cast<float*>(&c_templFeatures.p1_theta_data[n * c_templFeatures.p1_theta_step]);
+            }
+            static __device__ float2* p2_pos(int n)
+            {
+                return reinterpret_cast<float2*>(&c_templFeatures.p2_pos_data[n * c_templFeatures.p2_pos_step]);
+            }
+            static __device__ float* d12(int n)
+            {
+                return reinterpret_cast<float*>(&c_templFeatures.d12_data[n * c_templFeatures.d12_step]);
+            }
+            static __device__ float2* r1(int n)
+            {
+                return reinterpret_cast<float2*>(&c_templFeatures.r1_data[n * c_templFeatures.r1_step]);
+            }
+            static __device__ float2* r2(int n)
+            {
+                return reinterpret_cast<float2*>(&c_templFeatures.r2_data[n * c_templFeatures.r2_step]);
+            }
+        };
+        // Device-side accessors for the image feature table held in c_imageFeatures.
+        // Each accessor returns a typed pointer to the start of row n of the named buffer.
+        struct ImageFeatureTable
+        {
+            static __device__ float2* p1_pos(int n)
+            {
+                return reinterpret_cast<float2*>(&c_imageFeatures.p1_pos_data[n * c_imageFeatures.p1_pos_step]);
+            }
+            static __device__ float* p1_theta(int n)
+            {
+                return reinterpret_cast<float*>(&c_imageFeatures.p1_theta_data[n * c_imageFeatures.p1_theta_step]);
+            }
+            static __device__ float2* p2_pos(int n)
+            {
+                return reinterpret_cast<float2*>(&c_imageFeatures.p2_pos_data[n * c_imageFeatures.p2_pos_step]);
+            }
+            static __device__ float* d12(int n)
+            {
+                return reinterpret_cast<float*>(&c_imageFeatures.d12_data[n * c_imageFeatures.d12_step]);
+            }
+            static __device__ float2* r1(int n)
+            {
+                return reinterpret_cast<float2*>(&c_imageFeatures.r1_data[n * c_imageFeatures.r1_step]);
+            }
+            static __device__ float2* r2(int n)
+            {
+                return reinterpret_cast<float2*>(&c_imageFeatures.r2_data[n * c_imageFeatures.r2_step]);
+            }
+        };
+
+        // Brings an angle in radians into [0, 2*pi] by repeatedly removing or adding a full turn.
+        // A value of exactly 2*pi is returned unchanged: only values strictly above it are reduced.
+        __device__ float clampAngle(float a)
+        {
+            while (a > 2.0f * CV_PI_F)
+                a -= 2.0f * CV_PI_F;
+            while (a < 0.0f)
+                a += 2.0f * CV_PI_F;
+
+            return a;
+        }
+
+        __device__ bool angleEq(float a, float b, float eps) // true when clampAngle(a - b) does not exceed eps
+        {
+            return (::fabs(clampAngle(a - b)) <= eps); // NOTE(review): a slightly below b wraps to just under 2*pi and compares unequal - confirm the one-sided tolerance is intended
+        }
+
+        // Builds the Guil feature list. Block blockIdx.x fixes list entry blockIdx.x as the first point
+        // and its threads stride over every list entry as the second point. Pairs that pass the xi
+        // direction test and the maxDist test are appended to the level chosen by alpha12. The image
+        // table (isTempl == false) stores both positions, the template table stores r1 and r2 instead.
+        template <class FT, bool isTempl>
+        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
+                                                       int* sizes, const int maxSize,
+                                                       const float xi, const float angleEpsilon, const float alphaScale,
+                                                       const float2 center, const float maxDist)
+        {
+            const float firstTheta = thetaList[blockIdx.x];
+            const unsigned int firstPacked = coordList[blockIdx.x];
+            float2 first;
+            first.x = (firstPacked & 0xFFFF);
+            first.y = (firstPacked >> 16) & 0xFFFF;
+
+            for (int j = threadIdx.x; j < pointsCount; j += blockDim.x)
+            {
+                const float secondTheta = thetaList[j];
+                const unsigned int secondPacked = coordList[j];
+                float2 second;
+                second.x = (secondPacked & 0xFFFF);
+                second.y = (secondPacked >> 16) & 0xFFFF;
+
+                // Direction test: the theta difference has to match xi within angleEpsilon.
+                if (!angleEq(firstTheta - secondTheta, xi, angleEpsilon))
+                    continue;
+
+                // Pair geometry: direction of the connecting segment relative to firstTheta, and its length.
+                const float2 delta = first - second;
+                const float alpha = clampAngle(::atan2(delta.y, delta.x) - firstTheta);
+                const float dist = ::sqrtf(delta.x * delta.x + delta.y * delta.y);
+                if (dist > maxDist)
+                    continue;
+
+                // sizes[level] counts every accepted pair; pairs past maxSize are counted but not stored.
+                const int level = __float2int_rn(alpha * alphaScale);
+                const int slot = ::atomicAdd(sizes + level, 1);
+                if (slot >= maxSize)
+                    continue;
+
+                FT::p1_theta(level)[slot] = firstTheta;
+                FT::d12(level)[slot] = dist;
+
+                if (isTempl)
+                {
+                    // Offsets of both points from center.
+                    FT::r1(level)[slot] = first - center;
+                    FT::r2(level)[slot] = second - center;
+                }
+                else
+                {
+                    // Absolute positions of both points.
+                    FT::p1_pos(level)[slot] = first;
+                    FT::p2_pos(level)[slot] = second;
+                }
+            }
+        }
+
+        // Shared host driver for the template and image feature lists: one 256-thread block per list
+        // entry builds the lists (xi and angleEpsilon are converted from degrees to radians first).
+        template <class FT, bool isTempl>
+        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                   int* sizes, int maxSize,
+                                                   float xi, float angleEpsilon, int levels,
+                                                   float2 center, float maxDist)
+        {
+            const float xiRad = xi * (CV_PI_F / 180.0f);
+            const float epsRad = angleEpsilon * (CV_PI_F / 180.0f);
+            const float alphaScale = levels / (2.0f * CV_PI_F);
+
+            const dim3 threads(256);
+            const dim3 blocks(pointsCount);
+            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<blocks, threads>>>(coordList, thetaList, pointsCount, sizes, maxSize,
+                                                                             xiRad, epsRad, alphaScale, center, maxDist);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+            // The kernel's counters keep growing past maxSize; cap all levels + 1 of them.
+            thrust::device_ptr<int> firstSize(sizes);
+            thrust::transform(firstSize, firstSize + levels + 1, firstSize, cudev::bind2nd(cudev::minimum<int>(), maxSize));
+        }
+
+        // Builds the template feature list: forwards to the shared driver with TemplFeatureTable and
+        // isTempl = true, so the kernel stores p1_theta, d12, r1 and r2.
+        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                     int* sizes, int maxSize,
+                                                     float xi, float angleEpsilon, int levels,
+                                                     float2 center, float maxDist)
+        {
+            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount, sizes, maxSize,
+                                                                           xi, angleEpsilon, levels, center, maxDist);
+        }
+        // Builds the image feature list: forwards to the shared driver with ImageFeatureTable and
+        // isTempl = false, so the kernel stores p1_pos, p2_pos, p1_theta and d12.
+        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                     int* sizes, int maxSize,
+                                                     float xi, float angleEpsilon, int levels,
+                                                     float2 center, float maxDist)
+        {
+            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount, sizes, maxSize,
+                                                                            xi, angleEpsilon, levels, center, maxDist);
+        }
+
+        // Rotation histogram: block (blockIdx.x, blockIdx.y) takes template feature blockIdx.x of level
+        // blockIdx.y, compares it with every image feature of that level and bins the theta difference.
+        __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
+                                                const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
+        {
+            extern __shared__ int s_OHist[]; // block-local histogram; bins 0..angleRange are used
+            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
+                s_OHist[i] = 0;
+            __syncthreads();
+
+            const int tIdx = blockIdx.x;
+            const int level = blockIdx.y;
+
+            const int tSize = templSizes[level];
+            // Blocks beyond the level's template count skip the work but still reach both barriers.
+            if (tIdx < tSize)
+            {
+                const int imSize = imageSizes[level];
+                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
+
+                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
+                {
+                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
+                    const float angle = clampAngle(im_p1_theta - t_p1_theta);
+                    // Only rotations inside [minAngle, maxAngle] are binned.
+                    if (angle >= minAngle && angle <= maxAngle)
+                    {
+                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
+                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
+                    }
+                }
+            }
+            __syncthreads();
+            // Merge the block-local bins into the global histogram.
+            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
+                ::atomicAdd(OHist + i, s_OHist[i]);
+        }
+
+        // Host wrapper for GHT_Guil_Full_calcOHist: angles arrive in degrees, the grid is tMaxSize x (levels + 1) blocks.
+        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
+                                         float minAngle, float maxAngle, float angleStep, int angleRange,
+                                         int levels, int tMaxSize)
+        {
+            const dim3 block(256);
+            const dim3 grid(tMaxSize, levels + 1);
+
+            minAngle *= (CV_PI_F / 180.0f);
+            maxAngle *= (CV_PI_F / 180.0f);
+            angleStep *= (CV_PI_F / 180.0f);
+            // The kernel's dynamic shared array holds angleRange + 1 int bins, so size it in ints.
+            const size_t smemSize = (angleRange + 1) * sizeof(int);
+
+            GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
+                                                               minAngle, maxAngle, 1.0f / angleStep, angleRange);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Scale histogram for a fixed rotation: block (blockIdx.x, blockIdx.y) takes template feature blockIdx.x of
+        // level blockIdx.y, rotates its theta by angle and bins im_d12 / t_d12 for every image feature with a matching theta.
+        __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
+                                                const float angle, const float angleEpsilon,
+                                                const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
+        {
+            extern __shared__ int s_SHist[]; // block-local histogram; bins 0..scaleRange are used
+            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
+                s_SHist[i] = 0;
+            __syncthreads();
+
+            const int tIdx = blockIdx.x;
+            const int level = blockIdx.y;
+            const int tSize = templSizes[level];
+
+            if (tIdx < tSize)
+            {
+                const int imSize = imageSizes[level];
+                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
+                // d12 is a point-pair distance; it must not be offset by the rotation angle.
+                const float t_d12 = TemplFeatureTable::d12(level)[tIdx];
+
+                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
+                {
+                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
+                    const float im_d12 = ImageFeatureTable::d12(level)[i];
+
+                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
+                    {
+                        const float scale = im_d12 / t_d12;
+                        if (scale >= minScale && scale <= maxScale)
+                        {
+                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
+                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+            // Merge the block-local bins into the global histogram.
+            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
+                ::atomicAdd(SHist + i, s_SHist[i]);
+        }
+
+        // Host wrapper for GHT_Guil_Full_calcSHist: angle and angleEpsilon arrive in degrees, iScaleStep is forwarded unchanged.
+        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
+                                         float angle, float angleEpsilon,
+                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
+                                         int levels, int tMaxSize)
+        {
+            const dim3 block(256);
+            const dim3 grid(tMaxSize, levels + 1);
+
+            angle *= (CV_PI_F / 180.0f);
+            angleEpsilon *= (CV_PI_F / 180.0f);
+            // The kernel's dynamic shared array holds scaleRange + 1 int bins, so size it in ints.
+            const size_t smemSize = (scaleRange + 1) * sizeof(int);
+
+            GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
+                                                               angle, angleEpsilon,
+                                                               minScale, maxScale, iScaleStep, scaleRange);
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Accumulates position votes into PHist for one fixed angle/scale hypothesis.
+        //
+        // blockIdx.x selects a template feature and blockIdx.y a level; the threads of a block
+        // stride over the image features of that level. The template displacement vectors are
+        // scaled by `scale` and rotated with sinVal/cosVal, subtracted from the image feature
+        // positions and multiplied by idp to obtain histogram coordinates.
+        __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
+                                                const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
+                                                const float idp)
+        {
+            const int templIdx = blockIdx.x;
+            const int level = blockIdx.y;
+
+            // Blocks past the number of template features of this level have no work.
+            if (templIdx >= templSizes[level])
+                return;
+
+            const int imageCount = imageSizes[level];
+
+            const float templTheta = TemplFeatureTable::p1_theta(level)[templIdx] + angle;
+
+            // Scale first, then rotate, both template displacement vectors.
+            const float2 s1 = TemplFeatureTable::r1(level)[templIdx] * scale;
+            const float2 s2 = TemplFeatureTable::r2(level)[templIdx] * scale;
+
+            const float2 rot1 = make_float2(cosVal * s1.x - sinVal * s1.y, sinVal * s1.x + cosVal * s1.y);
+            const float2 rot2 = make_float2(cosVal * s2.x - sinVal * s2.y, sinVal * s2.x + cosVal * s2.y);
+
+            for (int imgIdx = threadIdx.x; imgIdx < imageCount; imgIdx += blockDim.x)
+            {
+                const float imageTheta = ImageFeatureTable::p1_theta(level)[imgIdx];
+
+                const float2 pos1 = ImageFeatureTable::p1_pos(level)[imgIdx];
+                const float2 pos2 = ImageFeatureTable::p2_pos(level)[imgIdx];
+
+                if (!angleEq(imageTheta, templTheta, angleEpsilon))
+                    continue;
+
+                const float2 c1 = (pos1 - rot1) * idp;
+                const float2 c2 = (pos2 - rot2) * idp;
+
+                // Both feature points must agree on the candidate centre to within one cell.
+                if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
+                    continue;
+
+                // Votes land in the interior of PHist (the +1 skips the one-cell border).
+                const bool inside = c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2;
+                if (inside)
+                    ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
+            }
+        }
+
+        // Host-side launcher for the GHT_Guil_Full_calcPHist kernel.
+        //
+        // angle and angleEpsilon are multiplied by pi/180 here; the sine and cosine of the
+        // converted angle are computed once on the host, and the kernel receives 1/dp.
+        // The call blocks until the kernel has completed.
+        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
+                                         float angle, float angleEpsilon, float scale,
+                                         float dp,
+                                         int levels, int tMaxSize)
+        {
+            const float angleRad = angle * (CV_PI_F / 180.0f);
+            const float angleEpsilonRad = angleEpsilon * (CV_PI_F / 180.0f);
+
+            const float sinVal = ::sinf(angleRad);
+            const float cosVal = ::cosf(angleRad);
+
+            // One block per (template feature, level) pair.
+            const dim3 block(256);
+            const dim3 grid(tMaxSize, levels + 1);
+
+            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );
+
+            GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
+                                                     angleRad, sinVal, cosVal, angleEpsilonRad, scale,
+                                                     1.0f / dp);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Scans the interior of the position histogram for local maxima above `threshold`.
+        //
+        // Each thread examines one interior cell (the histogram carries a one-cell border).
+        // For every peak found, a slot is reserved by atomically incrementing g_counter; if the
+        // slot index is below maxSize the position/scale/angle and the vote counts are stored.
+        __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
+                                                    const float angle, const int angleVotes, const float scale, const int scaleVotes,
+                                                    const float dp, const int threshold)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= hist.cols - 2 || row >= hist.rows - 2)
+                return;
+
+            const int centre = hist(row + 1, col + 1);
+
+            if (centre <= threshold)
+                return;
+
+            // Strictly greater than the left neighbour, not smaller than the right one.
+            if (centre <= hist(row + 1, col) || centre < hist(row + 1, col + 2))
+                return;
+
+            // Strictly greater than the upper neighbour, not smaller than the lower one.
+            if (centre <= hist(row, col + 1) || centre < hist(row + 2, col + 1))
+                return;
+
+            const int slot = ::atomicAdd(&g_counter, 1);
+
+            // The counter keeps growing past maxSize; only in-range slots are written.
+            if (slot >= maxSize)
+                return;
+
+            out[slot] = make_float4(col * dp, row * dp, scale, angle);
+            votes[slot] = make_int3(centre, scaleVotes, angleVotes);
+        }
+
+        // Host-side launcher for the GHT_Guil_Full_findPosInHist kernel.
+        //
+        // The device counter g_counter is initialised with curSize, the kernel appends the
+        // peaks it finds to out/votes, and the counter is read back afterwards.
+        // Returns the counter value clamped to maxSize.
+        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
+                                             float angle, int angleVotes, float scale, int scaleVotes,
+                                             float dp, int threshold)
+        {
+            void* counterDevPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterDevPtr, g_counter) );
+
+            // Seed the device-side counter with the number of entries already stored.
+            cudaSafeCall( cudaMemcpy(counterDevPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
+
+            // One thread per interior histogram cell.
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
+
+            GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
+                                                         angle, angleVotes, scale, scaleVotes,
+                                                         dp, threshold);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            int foundCount;
+            cudaSafeCall( cudaMemcpy(&foundCount, counterDevPtr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            // The kernel may have counted more peaks than it had room to store.
+            return ::min(foundCount, maxSize);
+        }
+    }
+}}}
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/imgproc.cu b/modules/gpuimgproc/src/cuda/imgproc.cu
new file mode 100644
index 0000000000..c6dfbb417b
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/imgproc.cu
@@ -0,0 +1,753 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
+
+        // Texture reference through which the mean-shift kernels read the 4-channel 8-bit source.
+        // It is bound to the source image by the host launchers below.
+        texture<uchar4, 2> tex_meanshift;
+
+        // Runs the mean-shift iteration for the pixel handled by the calling thread.
+        //
+        // x0, y0        : starting window centre.
+        // out, out_step : destination image base pointer and row pitch in bytes.
+        // cols, rows    : not used inside this function.
+        // sp            : half-size of the square spatial window.
+        // sr            : colour radius; a texel is accepted when its squared colour
+        //                 distance to the current colour is <= sr*sr.
+        // maxIter, eps  : iteration limit and convergence threshold.
+        //
+        // The converged colour is written to `out` at the calling thread's own pixel position
+        // (derived from the block/thread indices, not from the shifted centre). The final
+        // window centre is returned as a short2.
+        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
+                                        size_t out_step, int cols, int rows,
+                                        int sp, int sr, int maxIter, float eps)
+        {
+            int isr2 = sr*sr;
+            // Start from the colour at the initial centre; the .w channel is never modified below.
+            uchar4 c = tex2D(tex_meanshift, x0, y0 );
+
+            // iterate meanshift procedure
+            for( int iter = 0; iter < maxIter; iter++ )
+            {
+                int count = 0;
+                int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
+                float icount;
+
+                // mean shift: process pixels in the window [x0-sp, x0+sp] x [y0-sp, y0+sp]
+                // NOTE(review): window texels outside the image are fetched through the texture;
+                // what they return depends on the texture's addressing mode, which is not set here.
+                int minx = x0-sp;
+                int miny = y0-sp;
+                int maxx = x0+sp;
+                int maxy = y0+sp;
+
+                for( int y = miny; y <= maxy; y++)
+                {
+                    int rowCount = 0;
+                    for( int x = minx; x <= maxx; x++ )
+                    {
+                        uchar4 t = tex2D( tex_meanshift, x, y );
+
+                        // Squared colour distance over the first three channels.
+                        int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
+                        if( norm2 <= isr2 )
+                        {
+                            s0 += t.x; s1 += t.y; s2 += t.z;
+                            sx += x; rowCount++;
+                        }
+                    }
+                    count += rowCount;
+                    // Every accepted texel of this row contributes the same y.
+                    sy += y*rowCount;
+                }
+
+                // No texel in the window was accepted: keep the current centre and colour.
+                if( count == 0 )
+                    break;
+
+                // New centre and colour are the means over the accepted texels,
+                // truncated toward zero.
+                icount = 1.f/count;
+                int x1 = __float2int_rz(sx*icount);
+                int y1 = __float2int_rz(sy*icount);
+                s0 = __float2int_rz(s0*icount);
+                s1 = __float2int_rz(s1*icount);
+                s2 = __float2int_rz(s2*icount);
+
+                // Squared colour change between the previous colour and the new mean.
+                int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
+
+                // Stop when the centre did not move, or when the L1 spatial shift plus the
+                // squared colour change is within eps.
+                bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
+
+                // The update is applied even on the final iteration, before leaving the loop.
+                x0 = x1; y0 = y1;
+                c.x = s0; c.y = s1; c.z = s2;
+
+                if( stopFlag )
+                    break;
+            }
+
+            // Byte offset of the calling thread's pixel in the 4-byte-per-pixel output.
+            int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
+            *(uchar4*)(out + base) = c;
+
+            return make_short2((short)x0, (short)y0);
+        }
+
+        // One thread per pixel: runs do_mean_shift for every pixel inside cols x rows.
+        // The returned window centre is discarded; only the filtered colour is stored.
+        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
+        {
+            const int px = blockIdx.x * blockDim.x + threadIdx.x;
+            const int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+            // Threads of partially filled edge blocks fall outside the image.
+            if (px >= cols || py >= rows)
+                return;
+
+            do_mean_shift(px, py, out, out_step, cols, rows, sp, sr, maxIter, eps);
+        }
+
+        // One thread per pixel: runs do_mean_shift and stores both results.
+        // The filtered colour goes to outr (written inside do_mean_shift) and the final
+        // window centre is stored as a short2 in outsp at the same pixel position.
+        // outrstep / outspstep are the row pitches of the two outputs in bytes.
+        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
+                                             unsigned char* outsp, size_t outspstep,
+                                             int cols, int rows,
+                                             int sp, int sr, int maxIter, float eps)
+        {
+            const int px = blockIdx.x * blockDim.x + threadIdx.x;
+            const int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (px >= cols || py >= rows)
+                return;
+
+            // Byte offset of this pixel inside the short2 output.
+            const int spOffset = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
+
+            const short2 centre = do_mean_shift(px, py, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
+            *(short2*)(outsp + spOffset) = centre;
+        }
+
+        // Host-side launcher for meanshift_kernel.
+        //
+        // Binds src to tex_meanshift as a uchar4 2D texture and launches one thread per pixel
+        // on `stream`. On the default stream (stream == 0) the call waits for completion.
+        // The texture is not unbound here; the unbind call was commented out in the original code.
+        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
+
+            const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar4>();
+            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, channelDesc, src.cols, src.rows, src.step ) );
+
+            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Host-side launcher for meanshiftproc_kernel.
+        //
+        // Binds src to tex_meanshift as a uchar4 2D texture and launches one thread per pixel
+        // on `stream`; dstr receives the filtered colours and dstsp the final window centres.
+        // On the default stream (stream == 0) the call waits for completion.
+        // The texture is not unbound here; the unbind call was commented out in the original code.
+        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
+
+            const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar4>();
+            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, channelDesc, src.cols, src.rows, src.step ) );
+
+            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
+
+        // Maps a disparity value to a packed 32-bit colour via an HSV-to-RGB conversion.
+        //
+        // The hue is ((ndisp - d) * 240) / ndisp; S and V are the saturation and value
+        // (both default to 1). The result is packed as (alpha << 24) | (r << 16) | (g << 8) | b
+        // with alpha fixed at 255.
+        template <typename T>
+        __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
+        {
+            const unsigned int hue = ((ndisp-d) * 240)/ndisp;
+
+            const unsigned int sector = (hue/60) % 6;
+            const float frac = hue/60.f - hue/60;
+            const float p = V * (1 - S);
+            const float q = V * (1 - frac * S);
+            const float t = V * (1 - (1 - frac) * S);
+
+            float red, green, blue;
+
+            // sector is always in [0, 5] because of the modulo above.
+            switch (sector)
+            {
+            case 0:
+                red = V; green = t; blue = p;
+                break;
+            case 1:
+                red = q; green = V; blue = p;
+                break;
+            case 2:
+                red = p; green = V; blue = t;
+                break;
+            case 3:
+                red = p; green = q; blue = V;
+                break;
+            case 4:
+                red = t; green = p; blue = V;
+                break;
+            default: // sector == 5
+                red = V; green = p; blue = q;
+                break;
+            }
+
+            // Clamp each channel to [0, 1] and scale to 0..255.
+            const unsigned int b = (unsigned int)(::max(0.f, ::min(blue, 1.f)) * 255.f);
+            const unsigned int g = (unsigned int)(::max(0.f, ::min(green, 1.f)) * 255.f);
+            const unsigned int r = (unsigned int)(::max(0.f, ::min(red, 1.f)) * 255.f);
+            const unsigned int a = 255U;
+
+            // Every channel fits in 8 bits, so the fields do not overlap.
+            return (a << 24) | (r << 16) | (g << 8) | b;
+        }
+
+        // Colourises an 8-bit disparity image, four pixels per thread.
+        //
+        // Each thread loads four consecutive disparities as one uchar4, converts each with
+        // cvtPixel and stores the four packed colours as one uint4. disp_step and out_step
+        // are row pitches in bytes.
+        // NOTE(review): when width is not a multiple of 4 the last thread of a row still reads
+        // and writes a full group of four - presumably the rows are padded; confirm with callers.
+        __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+        {
+            const int group = blockIdx.x * blockDim.x + threadIdx.x;
+            const int col = group << 2;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= width || row >= height)
+                return;
+
+            const uchar4 disparities = *(uchar4*)(disp + row * disp_step + col);
+
+            uint4 colours;
+            colours.x = cvtPixel(disparities.x, ndisp);
+            colours.y = cvtPixel(disparities.y, ndisp);
+            colours.z = cvtPixel(disparities.z, ndisp);
+            colours.w = cvtPixel(disparities.w, ndisp);
+
+            uint4* outRow = (uint4*)(out_image + row * out_step);
+            outRow[group] = colours;
+        }
+
+        // Colourises a 16-bit disparity image, two pixels per thread.
+        //
+        // Each thread loads two consecutive disparities as one short2, converts each with
+        // cvtPixel and stores the two packed colours as one uint2. disp_step is applied to a
+        // short pointer (i.e. counted in elements), out_step is a pitch in bytes.
+        // NOTE(review): when width is odd the last thread of a row still reads and writes a
+        // full pair - presumably the rows are padded; confirm with callers.
+        __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+        {
+            const int pairIdx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int col = pairIdx << 1;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= width || row >= height)
+                return;
+
+            const short2 disparities = *(short2*)(disp + row * disp_step + col);
+
+            uint2 colours;
+            colours.x = cvtPixel(disparities.x, ndisp);
+            colours.y = cvtPixel(disparities.y, ndisp);
+
+            uint2* outRow = (uint2*)(out_image + row * out_step);
+            outRow[pairIdx] = colours;
+        }
+
+
+        // Host-side launcher for the 8-bit drawColorDisp kernel.
+        // Every thread handles four pixels, so the grid width is computed from four times
+        // the block width. On the default stream (stream == 0) the call waits for completion.
+        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+        {
+            const dim3 threads(16, 16, 1);
+            const dim3 grid(divUp(src.cols, threads.x << 2), divUp(src.rows, threads.y), 1);
+
+            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Host-side launcher for the 16-bit drawColorDisp kernel.
+        // Every thread handles two pixels, so the grid width is computed from twice the block
+        // width. The source pitch is passed in elements (step / sizeof(short)).
+        // On the default stream (stream == 0) the call waits for completion.
+        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(src.cols, threads.x << 1), divUp(src.rows, threads.y), 1);
+
+            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
+
+        // 4x4 reprojection matrix in row-major order, uploaded by reprojectImageTo3D_gpu.
+        __constant__ float cq[16];
+
+        // Reprojects a disparity image to 3D points, one thread per pixel.
+        //
+        // For pixel (x, y) with disparity d the homogeneous vector cq * (x, y, d, 1) is
+        // formed and its first three components are divided by the fourth. D is the output
+        // vector type; components other than x, y, z keep the value 1.0f.
+        template <typename T, typename D>
+        __global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const bool outside = (y >= disp.rows || x >= disp.cols);
+            if (outside)
+                return;
+
+            // Contribution of the pixel coordinates and the constant column of the matrix.
+            const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3];
+            const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7];
+            const float qz = x * cq[ 8] + y * cq[ 9] + cq[11];
+            const float qw = x * cq[12] + y * cq[13] + cq[15];
+
+            const T d = disp(y, x);
+
+            // Reciprocal of the homogeneous coordinate.
+            const float iW = 1.f / (qw + cq[14] * d);
+
+            D point = VecTraits<D>::all(1.0f);
+            point.x = (qx + cq[2] * d) * iW;
+            point.y = (qy + cq[6] * d) * iW;
+            point.z = (qz + cq[10] * d) * iW;
+
+            xyz(y, x) = point;
+        }
+
+        // Host-side launcher for the reprojectImageTo3D kernel.
+        //
+        // Copies the 16 floats of q into the constant array cq, reinterprets the byte-typed
+        // disp/xyz descriptors as PtrStepSz<T> / PtrStepSz<D> and launches one thread per
+        // disparity pixel on `stream`. On the default stream (stream == 0) the call waits
+        // for completion.
+        template <typename T, typename D>
+        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
+
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
+
+            reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations: 8-bit and 16-bit disparities, 3- and 4-component outputs.
+        template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+
+        /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
+
+        // Texture references for the x and y derivative images used by the Harris kernels
+        // (point filtering, clamp addressing).
+        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+        // Harris corner response, one thread per output pixel.
+        //
+        // Sums dx*dx, dx*dy and dy*dy over a block_size x block_size window starting at
+        // (x - block_size/2, y - block_size/2) and writes a*c - b*b - k*(a + c)^2 to dst.
+        // Window coordinates are passed to the textures unmodified, so out-of-range reads
+        // are handled by the textures' clamp addressing.
+        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const int top = row - (block_size / 2);
+            const int left = col - (block_size / 2);
+            const int bottom = top + block_size;
+            const int right = left + block_size;
+
+            float a = 0.f;
+            float b = 0.f;
+            float c = 0.f;
+
+            for (int wy = top; wy < bottom; ++wy)
+            {
+                for (int wx = left; wx < right; ++wx)
+                {
+                    const float gx = tex2D(harrisDxTex, wx, wy);
+                    const float gy = tex2D(harrisDyTex, wx, wy);
+
+                    a += gx * gx;
+                    b += gx * gy;
+                    c += gy * gy;
+                }
+            }
+
+            dst(row, col) = a * c - b * b - k * (a + c) * (a + c);
+        }
+
+        // Harris corner response with explicit border handling, one thread per output pixel.
+        //
+        // Same computation as the non-template overload, except that every window coordinate
+        // is first remapped through border_col.idx_row / border_row.idx_col before the
+        // derivative textures are read.
+        template <typename BR, typename BC>
+        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const int top = row - (block_size / 2);
+            const int left = col - (block_size / 2);
+            const int bottom = top + block_size;
+            const int right = left + block_size;
+
+            float a = 0.f;
+            float b = 0.f;
+            float c = 0.f;
+
+            for (int wy = top; wy < bottom; ++wy)
+            {
+                // Row index after border extrapolation.
+                const int srcRow = border_col.idx_row(wy);
+
+                for (int wx = left; wx < right; ++wx)
+                {
+                    // Column index after border extrapolation.
+                    const int srcCol = border_row.idx_col(wx);
+
+                    const float gx = tex2D(harrisDxTex, srcCol, srcRow);
+                    const float gy = tex2D(harrisDyTex, srcCol, srcRow);
+
+                    a += gx * gx;
+                    b += gx * gy;
+                    c += gy * gy;
+                }
+            }
+
+            dst(row, col) = a * c - b * b - k * (a + c) * (a + c);
+        }
+
+        // Host-side launcher for the Harris corner kernels.
+        //
+        // Binds Dx and Dy to the Harris textures and launches the kernel variant that matches
+        // border_type on `stream`. For any other border_type no kernel is launched.
+        // On the default stream (stream == 0) the call waits for completion.
+        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+
+            bindTexture(&harrisDxTex, Dx);
+            bindTexture(&harrisDyTex, Dy);
+
+            if (border_type == BORDER_REFLECT101_GPU)
+            {
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            }
+            else if (border_type == BORDER_REFLECT_GPU)
+            {
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            }
+            else if (border_type == BORDER_REPLICATE_GPU)
+            {
+                // Replication is provided by the textures' clamp addressing.
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
+
+        // Texture references for the x and y derivative images used by the minimum-eigenvalue
+        // kernels (point filtering, clamp addressing).
+        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+        // Minimum-eigenvalue corner response, one thread per output pixel.
+        //
+        // Sums dx*dx, dx*dy and dy*dy over a block_size x block_size window starting at
+        // (x - block_size/2, y - block_size/2), halves the two diagonal sums and writes
+        // (a + c) - sqrt((a - c)^2 + b^2) to dst. Window coordinates are passed to the
+        // textures unmodified, so out-of-range reads are handled by clamp addressing.
+        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const int top = row - (block_size / 2);
+            const int left = col - (block_size / 2);
+            const int bottom = top + block_size;
+            const int right = left + block_size;
+
+            float a = 0.f;
+            float b = 0.f;
+            float c = 0.f;
+
+            for (int wy = top; wy < bottom; ++wy)
+            {
+                for (int wx = left; wx < right; ++wx)
+                {
+                    const float gx = tex2D(minEigenValDxTex, wx, wy);
+                    const float gy = tex2D(minEigenValDyTex, wx, wy);
+
+                    a += gx * gx;
+                    b += gx * gy;
+                    c += gy * gy;
+                }
+            }
+
+            a *= 0.5f;
+            c *= 0.5f;
+
+            dst(row, col) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
+        }
+
+
+        // Minimum-eigenvalue corner response with explicit border handling, one thread per
+        // output pixel.
+        //
+        // Same computation as the non-template overload, except that every window coordinate
+        // is first remapped through border_col.idx_row / border_row.idx_col before the
+        // derivative textures are read.
+        template <typename BR, typename BC>
+        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const int top = row - (block_size / 2);
+            const int left = col - (block_size / 2);
+            const int bottom = top + block_size;
+            const int right = left + block_size;
+
+            float a = 0.f;
+            float b = 0.f;
+            float c = 0.f;
+
+            for (int wy = top; wy < bottom; ++wy)
+            {
+                // Row index after border extrapolation.
+                const int srcRow = border_col.idx_row(wy);
+
+                for (int wx = left; wx < right; ++wx)
+                {
+                    // Column index after border extrapolation.
+                    const int srcCol = border_row.idx_col(wx);
+
+                    const float gx = tex2D(minEigenValDxTex, srcCol, srcRow);
+                    const float gy = tex2D(minEigenValDyTex, srcCol, srcRow);
+
+                    a += gx * gx;
+                    b += gx * gy;
+                    c += gy * gy;
+                }
+            }
+
+            a *= 0.5f;
+            c *= 0.5f;
+
+            dst(row, col) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
+        }
+
+        // Host-side launcher for the minimum-eigenvalue corner kernels.
+        //
+        // Binds Dx and Dy to the min-eigenvalue textures and launches the kernel variant that
+        // matches border_type on `stream`. For any other border_type no kernel is launched.
+        // On the default stream (stream == 0) the call waits for completion.
+        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+
+            bindTexture(&minEigenValDxTex, Dx);
+            bindTexture(&minEigenValDyTex, Dy);
+
+            if (border_type == BORDER_REFLECT101_GPU)
+            {
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            }
+            else if (border_type == BORDER_REFLECT_GPU)
+            {
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            }
+            else if (border_type == BORDER_REPLICATE_GPU)
+            {
+                // Replication is provided by the textures' clamp addressing.
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            const bool onDefaultStream = (stream == 0);
+            if (onDefaultStream)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        // buildWarpMaps
+
+        // TODO use intrinsics like __sinf and so on
+
+        // Constant-memory parameters shared by the warp-map mappers below.
+        // They are filled with cudaMemcpyToSymbol by the host launchers before a kernel runs.
+        namespace build_warp_maps
+        {
+
+            // 3x3 matrix, row-major; applied by every mapper's mapBackward.
+            __constant__ float ck_rinv[9];
+            // 3x3 matrix, row-major; uploaded by the launchers but not read by the mappers
+            // visible in this part of the file.
+            __constant__ float cr_kinv[9];
+            // 3-element translation; read by PlaneMapper only.
+            __constant__ float ct[3];
+            // Scale by which the (u, v) map coordinates are divided.
+            __constant__ float cscale;
+        }
+
+
+        // Backward mapping for the plane projection.
+        class PlaneMapper
+        {
+        public:
+            // Maps destination coordinates (u, v) to source image coordinates (x, y):
+            // the point (u / cscale - ct[0], v / cscale - ct[1], 1 - ct[2]) is multiplied by
+            // ck_rinv and the first two components are divided by the third.
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                const float px = u / cscale - ct[0];
+                const float py = v / cscale - ct[1];
+                const float pz = 1 - ct[2];
+
+                x = ck_rinv[0] * px + ck_rinv[1] * py + ck_rinv[2] * pz;
+                y = ck_rinv[3] * px + ck_rinv[4] * py + ck_rinv[5] * pz;
+                const float w = ck_rinv[6] * px + ck_rinv[7] * py + ck_rinv[8] * pz;
+
+                // Perspective division (no check for w == 0 or w < 0 here).
+                x /= w;
+                y /= w;
+            }
+        };
+
+
+        // Backward mapping for the cylindrical projection.
+        class CylindricalMapper
+        {
+        public:
+            // Maps destination coordinates (u, v) to source image coordinates (x, y):
+            // the point (sin(u / cscale), v / cscale, cos(u / cscale)) is multiplied by ck_rinv
+            // and divided by its third component. When that component is not positive,
+            // both outputs are set to -1.
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                const float theta = u / cscale;
+                const float px = ::sinf(theta);
+                const float py = v / cscale;
+                const float pz = ::cosf(theta);
+
+                x = ck_rinv[0] * px + ck_rinv[1] * py + ck_rinv[2] * pz;
+                y = ck_rinv[3] * px + ck_rinv[4] * py + ck_rinv[5] * pz;
+                const float w = ck_rinv[6] * px + ck_rinv[7] * py + ck_rinv[8] * pz;
+
+                if (w > 0)
+                {
+                    x /= w;
+                    y /= w;
+                }
+                else
+                {
+                    x = -1;
+                    y = -1;
+                }
+            }
+        };
+
+
+        // Backward mapping for the spherical projection.
+        class SphericalMapper
+        {
+        public:
+            // Maps destination coordinates (u, v) to source image coordinates (x, y):
+            // with a = u / cscale and b = v / cscale, the point
+            // (sin(b) * sin(a), -cos(b), sin(b) * cos(a)) is multiplied by ck_rinv and divided
+            // by its third component. When that component is not positive, both outputs
+            // are set to -1.
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                const float b = v / cscale;
+                const float a = u / cscale;
+
+                const float sinB = ::sinf(b);
+                const float px = sinB * ::sinf(a);
+                const float py = -::cosf(b);
+                const float pz = sinB * ::cosf(a);
+
+                x = ck_rinv[0] * px + ck_rinv[1] * py + ck_rinv[2] * pz;
+                y = ck_rinv[3] * px + ck_rinv[4] * py + ck_rinv[5] * pz;
+                const float w = ck_rinv[6] * px + ck_rinv[7] * py + ck_rinv[8] * pz;
+
+                if (w > 0)
+                {
+                    x /= w;
+                    y /= w;
+                }
+                else
+                {
+                    x = -1;
+                    y = -1;
+                }
+            }
+        };
+
+
+        // One thread per map element: evaluates Mapper::mapBackward at (tl_u + column, tl_v + row) and stores x / y.
+        template <typename Mapper>
+        __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
+                                            PtrStepf map_x, PtrStepf map_y)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            // Threads that fall outside the cols x rows area have nothing to write.
+            if (col >= cols || row >= rows)
+                return;
+
+            float src_x, src_y;
+            Mapper::mapBackward((float)(tl_u + col), (float)(tl_v + row), src_x, src_y);
+            map_x.ptr(row)[col] = src_x;
+            map_y.ptr(row)[col] = src_y;
+        }
+
+
+        // Uploads the projection parameters to constant memory and fills map_x / map_y through PlaneMapper.
+        // All work is issued on `stream`; the call blocks until completion only when stream == 0.
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                const float k_rinv[9], const float r_kinv[9], const float t[3],
+                                float scale, cudaStream_t stream)
+        {
+            // Stream-ordered uploads keep the constants in sync with kernels queued on `stream`; host sources are assumed pageable.
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::ct, t, 3*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cscale, &scale, sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+            buildWarpMapsKernel<PlaneMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+
+        // Uploads the projection parameters to constant memory and fills map_x / map_y through CylindricalMapper.
+        // All work is issued on `stream`; the call blocks until completion only when stream == 0.
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                      const float k_rinv[9], const float r_kinv[9], float scale,
+                                      cudaStream_t stream)
+        {
+            // Stream-ordered uploads keep the constants in sync with kernels queued on `stream`; host sources are assumed pageable.
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cscale, &scale, sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+            buildWarpMapsKernel<CylindricalMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+
+        // Uploads the projection parameters to constant memory and fills map_x / map_y through SphericalMapper.
+        // All work is issued on `stream`; the call blocks until completion only when stream == 0.
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                    const float k_rinv[9], const float r_kinv[9], float scale,
+                                    cudaStream_t stream)
+        {
+            // Stream-ordered uploads keep the constants in sync with kernels queued on `stream`; host sources are assumed pageable.
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(build_warp_maps::cscale, &scale, sizeof(float), 0, cudaMemcpyHostToDevice, stream));
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+            buildWarpMapsKernel<SphericalMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev {
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/match_template.cu b/modules/gpuimgproc/src/cuda/match_template.cu
new file mode 100644
index 0000000000..6670639290
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/match_template.cu
@@ -0,0 +1,916 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace match_template
+    {
+        __device__ __forceinline__ float sum(float v) { return v; } // sum(): adds up the components of a 1-4 component float value
+        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
+        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
+        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
+        // first(): returns the first component (x) of a 1-4 component float value.
+        __device__ __forceinline__ float first(float v) { return v; }
+        __device__ __forceinline__ float first(float2 v) { return v.x; }
+        __device__ __forceinline__ float first(float3 v) { return v.x; }
+        __device__ __forceinline__ float first(float4 v) { return v.x; }
+        // mul(), float inputs: component-wise product.
+        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
+        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+        // mul(), uchar inputs: component-wise product returned as float (the product itself is formed in int).
+        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
+        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+        // sub(), float inputs: component-wise difference a - b.
+        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
+        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+        // sub(), uchar inputs: component-wise difference a - b returned as float (formed in int, so it may be negative).
+        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
+        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+
+        //////////////////////////////////////////////////////////////////////
+        // Naive_CCORR
+
+        // Naive cross-correlation: each thread accumulates image * templ over the w x h window anchored at its
+        // result element (x, y), then stores the per-channel accumulators summed into a single float.
+        template <typename T, int cn>
+        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
+        {
+            typedef typename TypeVec<T, cn>::vec_type SrcVec;
+            typedef typename TypeVec<float, cn>::vec_type AccVec;
+
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+
+            AccVec acc = VecTraits<AccVec>::all(0);
+            for (int dy = 0; dy < h; ++dy)
+            {
+                // Rows are addressed as bytes; reinterpret them as cn-channel pixels of type T.
+                const SrcVec* image_row = (const SrcVec*)image.ptr(y + dy) + x;
+                const SrcVec* templ_row = (const SrcVec*)templ.ptr(dy);
+                for (int dx = 0; dx < w; ++dx)
+                    acc = acc + mul(image_row[dx], templ_row[dx]);
+            }
+            result.ptr(y)[x] = sum(acc);
+        }
+
+        // Host launcher: one thread per result element in 32x8 blocks; synchronizes the device only when stream == 0.
+        template <typename T, int cn>
+        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block_dim(32, 8);
+            const dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            // The template size is handed to the kernel as (w, h).
+            matchTemplateNaiveKernel_CCORR<T, cn><<<grid_dim, block_dim, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Selects the float instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
+            };
+            const launcher_t launch = table[cn];
+            launch(image, templ, result, stream);
+        }
+
+
+        // Selects the uchar instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
+            };
+            const launcher_t launch = table[cn];
+            launch(image, templ, result, stream);
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // Naive_SQDIFF
+
+        // Naive squared difference: each thread accumulates (image - templ)^2 over the w x h window anchored at
+        // its result element (x, y), then stores the per-channel accumulators summed into a single float.
+        template <typename T, int cn>
+        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
+        {
+            typedef typename TypeVec<T, cn>::vec_type SrcVec;
+            typedef typename TypeVec<float, cn>::vec_type AccVec;
+
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+
+            AccVec acc = VecTraits<AccVec>::all(0);
+            for (int dy = 0; dy < h; ++dy)
+            {
+                const SrcVec* image_row = (const SrcVec*)image.ptr(y + dy) + x;
+                const SrcVec* templ_row = (const SrcVec*)templ.ptr(dy);
+                for (int dx = 0; dx < w; ++dx)
+                {
+                    const AccVec diff = sub(image_row[dx], templ_row[dx]);
+                    acc = acc + diff * diff;
+                }
+            }
+
+            // Collapse the per-channel sums into the single-channel result.
+            result.ptr(y)[x] = sum(acc);
+        }
+
+        // Host launcher: one thread per result element in 32x8 blocks; synchronizes the device only when stream == 0.
+        template <typename T, int cn>
+        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block_dim(32, 8);
+            const dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            // The template size is handed to the kernel as (w, h).
+            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid_dim, block_dim, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Selects the float instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
+            };
+            const launcher_t launch = table[cn];
+            launch(image, templ, result, stream);
+        }
+
+        // Selects the uchar instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
+            };
+            const launcher_t launch = table[cn];
+            launch(image, templ, result, stream);
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // Prepared_SQDIFF
+
+        // Rewrites the CCORR value already stored in result as SQDIFF: window_sqsum - 2 * ccorr + templ_sqsum.
+        template <int cn>
+        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            // Four-corner lookup of the w x h window; image_sqsum is presumably an integral image of squares (columns strided by cn).
+            const unsigned long long* top = image_sqsum.ptr(y);
+            const unsigned long long* bottom = image_sqsum.ptr(y + h);
+            const float window_sqsum = (float)((bottom[(x + w) * cn] - top[(x + w) * cn]) - (bottom[x * cn] - top[x * cn]));
+            const float ccorr = result.ptr(y)[x];
+            result.ptr(y)[x] = window_sqsum - 2.f * ccorr + templ_sqsum;
+        }
+
+        // Host launcher for the prepared SQDIFF kernel: one thread per result element in 32x8 blocks.
+        template <int cn>
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block_dim(32, 8);
+            const dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid_dim, block_dim, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+            cudaSafeCall( cudaGetLastError() );
+            // Only the default stream is synchronized here.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Selects the instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
+                                             cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
+            };
+            const launcher_t launch = table[cn];
+            launch(w, h, image_sqsum, templ_sqsum, result, stream);
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // Prepared_SQDIFF_NORMED
+
+        // The normAcc* functions are accurate normalization routines that keep the GPU matchTemplate
+        // results consistent with the CPU implementation.
+
+        // Returns num / denum when |num| < denum, +1 or -1 when |num| < 1.125 * denum, and 0 otherwise.
+        __device__ float normAcc(float num, float denum)
+        {
+            const float magnitude = ::fabs(num);
+            if (magnitude < denum) return num / denum;
+            if (magnitude < denum * 1.125f) return num > 0 ? 1 : -1;
+            return 0;
+        }
+
+
+        // Returns num / denum when |num| < denum, +1 or -1 when |num| < 1.125 * denum, and 1 otherwise.
+        __device__ float normAcc_SQDIFF(float num, float denum)
+        {
+            const float magnitude = ::fabs(num);
+            if (magnitude < denum) return num / denum;
+            if (magnitude < denum * 1.125f) return num > 0 ? 1 : -1;
+            return 1;
+        }
+
+
+        // Rewrites the CCORR value already stored in result as normalized SQDIFF.
+        template <int cn>
+        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
+                int w, int h, const PtrStep<unsigned long long> image_sqsum,
+                unsigned long long templ_sqsum, PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            // Four-corner lookup of the w x h window; image_sqsum is presumably an integral image of squares (columns strided by cn).
+            const unsigned long long* top = image_sqsum.ptr(y);
+            const unsigned long long* bottom = image_sqsum.ptr(y + h);
+            const float window_sqsum = (float)((bottom[(x + w) * cn] - top[(x + w) * cn]) - (bottom[x * cn] - top[x * cn]));
+            const float ccorr = result.ptr(y)[x];
+            // Numerator is the plain SQDIFF value; denominator is sqrt(window_sqsum * templ_sqsum).
+            result.ptr(y)[x] = normAcc_SQDIFF(window_sqsum - 2.f * ccorr + templ_sqsum, sqrtf(window_sqsum * templ_sqsum));
+        }
+
+        // Host launcher for the prepared SQDIFF_NORMED kernel: one thread per result element in 32x8 blocks.
+        template <int cn>
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+                                                    PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block_dim(32, 8);
+            const dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid_dim, block_dim, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+            cudaSafeCall( cudaGetLastError() );
+            // Only the default stream is synchronized here.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+        // Selects the instantiation for cn channels; cn must be 1..4 (slot 0 is a null placeholder, no range check).
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+                                                    PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            typedef void (*launcher_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
+            static const launcher_t table[] =
+            {
+                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
+            };
+            table[cn](w, h, image_sqsum, templ_sqsum, result, stream);
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // Prepared_CCOFF
+
+        // Single-channel CCOFF correction: subtracts window_sum * templ_sum_scale from the ccorr value stored in result.
+        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            // Four-corner lookup of the w x h window; image_sum is presumably an integral image.
+            const unsigned int* top = image_sum.ptr(y);
+            const unsigned int* bottom = image_sum.ptr(y + h);
+            const float window_sum = (float)((bottom[x + w] - top[x + w]) - (bottom[x] - top[x]));
+            const float ccorr = result.ptr(y)[x];
+            result.ptr(y)[x] = ccorr - window_sum * templ_sum_scale;
+        }
+
+        // Host launcher: passes templ_sum / (w * h) as the scale; synchronizes the device only when stream == 0.
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
+        {
+            dim3 block_dim(32, 8);
+            dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            const float templ_sum_scale = (float)templ_sum / (w * h);
+            matchTemplatePreparedKernel_CCOFF_8U<<<grid_dim, block_dim, 0, stream>>>(w, h, templ_sum_scale, image_sum, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+
+        // Two-channel CCOFF correction: subtracts, per channel, the w x h window sum of the image scaled by the
+        // matching templ_sum_scale_* from the ccorr value already stored in result.
+        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
+                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
+                const PtrStep<unsigned int> image_sum_r,
+                const PtrStep<unsigned int> image_sum_g,
+                PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            const unsigned int* r_top = image_sum_r.ptr(y);
+            const unsigned int* r_bottom = image_sum_r.ptr(y + h);
+            const float window_sum_r = (float)((r_bottom[x + w] - r_top[x + w]) - (r_bottom[x] - r_top[x]));
+            const unsigned int* g_top = image_sum_g.ptr(y);
+            const unsigned int* g_bottom = image_sum_g.ptr(y + h);
+            const float window_sum_g = (float)((g_bottom[x + w] - g_top[x + w]) - (g_bottom[x] - g_top[x]));
+            const float ccorr = result.ptr(y)[x];
+            result.ptr(y)[x] = ccorr - window_sum_r * templ_sum_scale_r
+                                     - window_sum_g * templ_sum_scale_g;
+        }
+
+        // Host launcher: passes templ_sum_* / (w * h) as the per-channel scales; synchronizes only when stream == 0.
+        void matchTemplatePrepared_CCOFF_8UC2(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                unsigned int templ_sum_r, unsigned int templ_sum_g,
+                PtrStepSzf result, cudaStream_t stream)
+        {
+            dim3 block_dim(32, 8);
+            dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            const float area = (float)(w * h);
+            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid_dim, block_dim, 0, stream>>>(
+                    w, h, (float)templ_sum_r / area, (float)templ_sum_g / area,
+                    image_sum_r, image_sum_g, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+
+        // Three-channel CCOFF correction: subtracts, per channel, the w x h window sum of the image scaled by the
+        // matching templ_sum_scale_* from the ccorr value already stored in result.
+        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
+                int w, int h,
+                float templ_sum_scale_r,
+                float templ_sum_scale_g,
+                float templ_sum_scale_b,
+                const PtrStep<unsigned int> image_sum_r,
+                const PtrStep<unsigned int> image_sum_g,
+                const PtrStep<unsigned int> image_sum_b,
+                PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            const unsigned int* r_top = image_sum_r.ptr(y);
+            const unsigned int* r_bottom = image_sum_r.ptr(y + h);
+            const float window_sum_r = (float)((r_bottom[x + w] - r_top[x + w]) - (r_bottom[x] - r_top[x]));
+            const unsigned int* g_top = image_sum_g.ptr(y);
+            const unsigned int* g_bottom = image_sum_g.ptr(y + h);
+            const float window_sum_g = (float)((g_bottom[x + w] - g_top[x + w]) - (g_bottom[x] - g_top[x]));
+            const unsigned int* b_top = image_sum_b.ptr(y);
+            const unsigned int* b_bottom = image_sum_b.ptr(y + h);
+            const float window_sum_b = (float)((b_bottom[x + w] - b_top[x + w]) - (b_bottom[x] - b_top[x]));
+            const float ccorr = result.ptr(y)[x];
+            result.ptr(y)[x] = ccorr - window_sum_r * templ_sum_scale_r
+                                     - window_sum_g * templ_sum_scale_g
+                                     - window_sum_b * templ_sum_scale_b;
+        }
+
+        // Host launcher: passes templ_sum_* / (w * h) as the per-channel scales; synchronizes only when stream == 0.
+        void matchTemplatePrepared_CCOFF_8UC3(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                PtrStepSzf result, cudaStream_t stream)
+        {
+            dim3 block_dim(32, 8);
+            dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            const float area = (float)(w * h);
+            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid_dim, block_dim, 0, stream>>>(
+                    w, h,
+                    (float)templ_sum_r / area,
+                    (float)templ_sum_g / area,
+                    (float)templ_sum_b / area,
+                    image_sum_r, image_sum_g, image_sum_b, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+
+        // Four-channel CCOFF correction: subtracts, per channel, the w x h window sum of the image scaled by the
+        // matching templ_sum_scale_* from the ccorr value already stored in result.
+        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
+                int w, int h,
+                float templ_sum_scale_r,
+                float templ_sum_scale_g,
+                float templ_sum_scale_b,
+                float templ_sum_scale_a,
+                const PtrStep<unsigned int> image_sum_r,
+                const PtrStep<unsigned int> image_sum_g,
+                const PtrStep<unsigned int> image_sum_b,
+                const PtrStep<unsigned int> image_sum_a,
+                PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            const unsigned int* r_top = image_sum_r.ptr(y);
+            const unsigned int* r_bottom = image_sum_r.ptr(y + h);
+            const float window_sum_r = (float)((r_bottom[x + w] - r_top[x + w]) - (r_bottom[x] - r_top[x]));
+            const unsigned int* g_top = image_sum_g.ptr(y);
+            const unsigned int* g_bottom = image_sum_g.ptr(y + h);
+            const float window_sum_g = (float)((g_bottom[x + w] - g_top[x + w]) - (g_bottom[x] - g_top[x]));
+            const unsigned int* b_top = image_sum_b.ptr(y);
+            const unsigned int* b_bottom = image_sum_b.ptr(y + h);
+            const float window_sum_b = (float)((b_bottom[x + w] - b_top[x + w]) - (b_bottom[x] - b_top[x]));
+            const unsigned int* a_top = image_sum_a.ptr(y);
+            const unsigned int* a_bottom = image_sum_a.ptr(y + h);
+            const float window_sum_a = (float)((a_bottom[x + w] - a_top[x + w]) - (a_bottom[x] - a_top[x]));
+            const float ccorr = result.ptr(y)[x];
+            result.ptr(y)[x] = ccorr - window_sum_r * templ_sum_scale_r
+                                     - window_sum_g * templ_sum_scale_g
+                                     - window_sum_b * templ_sum_scale_b
+                                     - window_sum_a * templ_sum_scale_a;
+        }
+
+        // Host launcher: passes templ_sum_* / (w * h) as the per-channel scales; synchronizes only when stream == 0.
+        void matchTemplatePrepared_CCOFF_8UC4(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_a,
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                unsigned int templ_sum_a,
+                PtrStepSzf result, cudaStream_t stream)
+        {
+            dim3 block_dim(32, 8);
+            dim3 grid_dim(divUp(result.cols, block_dim.x), divUp(result.rows, block_dim.y));
+            const float area = (float)(w * h);
+            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid_dim, block_dim, 0, stream>>>(
+                    w, h,
+                    (float)templ_sum_r / area,
+                    (float)templ_sum_g / area,
+                    (float)templ_sum_b / area,
+                    (float)templ_sum_a / area,
+                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
+                    result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // Prepared_CCOFF_NORMED
+
+        // Single-channel normalized CCOFF: rewrites the ccorr value stored in result through normAcc.
+        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
+                int w, int h, float weight,
+                float templ_sum_scale, float templ_sqsum_scale,
+                const PtrStep<unsigned int> image_sum,
+                const PtrStep<unsigned long long> image_sqsum,
+                PtrStepSzf result)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            if (x >= result.cols || y >= result.rows)
+                return;
+            const float ccorr = result.ptr(y)[x];
+            const unsigned int* sum_top = image_sum.ptr(y);
+            const unsigned int* sum_bottom = image_sum.ptr(y + h);
+            const float window_sum = (float)((sum_bottom[x + w] - sum_top[x + w]) - (sum_bottom[x] - sum_top[x]));
+            const unsigned long long* sq_top = image_sqsum.ptr(y);
+            const unsigned long long* sq_bottom = image_sqsum.ptr(y + h);
+            const float window_sqsum = (float)((sq_bottom[x + w] - sq_top[x + w]) - (sq_bottom[x] - sq_top[x]));
+            // Numerator: ccorr minus the scaled window sum; denominator: sqrt(templ_sqsum_scale * (sqsum - weight * sum^2)).
+            result.ptr(y)[x] = normAcc(ccorr - window_sum * templ_sum_scale,
+                                       sqrtf(templ_sqsum_scale * (window_sqsum - weight * window_sum * window_sum)));
+        }
+
+        // Host launcher for the single-channel CCOFF_NORMED kernel; synchronizes the device only when stream == 0.
+        // templ_sqsum_scale is reduced in double: in float, templ_sqsum and templ_sum^2 / (w * h) are large and
+        // nearly equal for low-contrast templates, so their difference would lose most of its precision.
+        void matchTemplatePrepared_CCOFF_NORMED_8U(
+                    int w, int h, const PtrStepSz<unsigned int> image_sum,
+                    const PtrStepSz<unsigned long long> image_sqsum,
+                    unsigned int templ_sum, unsigned long long templ_sqsum,
+                    PtrStepSzf result, cudaStream_t stream)
+        {
+            dim3 threads(32, 8);
+            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+            float weight = 1.f / (w * h);
+            float templ_sum_scale = templ_sum * weight;
+            float templ_sqsum_scale = (float)((double)templ_sqsum - (double)templ_sum * templ_sum / ((double)w * h));
+            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
+                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
+                    image_sum, image_sqsum, result);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+
+        // Two-channel CCOFF_NORMED finalization, one thread per result element.
+        // For each channel the window sum and window squared sum are taken from the
+        // four corners of the w x h window in the per-channel tables; the value already
+        // in result(y, x) is then offset by the scaled sums and normalized via normAcc().
+        // NOTE(review): presumably the image_sum_* / image_sqsum_* tables are integral
+        // images - confirm against the caller.
+        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
+                int w, int h, float weight,
+                float templ_sum_scale_r, float templ_sum_scale_g,
+                float templ_sqsum_scale,
+                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
+                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
+                PtrStepSzf result)
+        {
+            const int x0 = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y0 = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x0 >= result.cols || y0 >= result.rows)
+                return;
+
+            // Far corner of the window anchored at (y0, x0).
+            const int x1 = x0 + w;
+            const int y1 = y0 + h;
+
+            const float sum_r = static_cast<float>(
+                    (image_sum_r.ptr(y1)[x1] - image_sum_r.ptr(y0)[x1]) -
+                    (image_sum_r.ptr(y1)[x0] - image_sum_r.ptr(y0)[x0]));
+            const float sqsum_r = static_cast<float>(
+                    (image_sqsum_r.ptr(y1)[x1] - image_sqsum_r.ptr(y0)[x1]) -
+                    (image_sqsum_r.ptr(y1)[x0] - image_sqsum_r.ptr(y0)[x0]));
+            const float sum_g = static_cast<float>(
+                    (image_sum_g.ptr(y1)[x1] - image_sum_g.ptr(y0)[x1]) -
+                    (image_sum_g.ptr(y1)[x0] - image_sum_g.ptr(y0)[x0]));
+            const float sqsum_g = static_cast<float>(
+                    (image_sqsum_g.ptr(y1)[x1] - image_sqsum_g.ptr(y0)[x1]) -
+                    (image_sqsum_g.ptr(y1)[x0] - image_sqsum_g.ptr(y0)[x0]));
+
+            const float num = result.ptr(y0)[x0] - sum_r * templ_sum_scale_r
+                                                 - sum_g * templ_sum_scale_g;
+            const float denum = sqrtf(templ_sqsum_scale * (sqsum_r - weight * sum_r * sum_r
+                                                           + sqsum_g - weight * sum_g * sum_g));
+            result.ptr(y0)[x0] = normAcc(num, denum);
+        }
+
+        // Host launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC2.
+        // Computes the per-channel template sum scales and the combined squared-sum
+        // scale, launches one thread per result element on `stream`, and waits for the
+        // device only when the default stream (0) is used.
+        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
+                    int w, int h,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                    PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y));
+
+            // Reciprocal of the window area.
+            const float weight = 1.f / (w * h);
+            const float templ_sum_scale_r = templ_sum_r * weight;
+            const float templ_sum_scale_g = templ_sum_g * weight;
+            const float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+                                             + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
+
+            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, block, 0, stream>>>(
+                    w, h, weight,
+                    templ_sum_scale_r, templ_sum_scale_g,
+                    templ_sqsum_scale,
+                    image_sum_r, image_sqsum_r,
+                    image_sum_g, image_sqsum_g,
+                    result);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+
+
+        // Three-channel CCOFF_NORMED finalization, one thread per result element.
+        // For each channel the window sum and window squared sum are taken from the
+        // four corners of the w x h window in the per-channel tables; the value already
+        // in result(y, x) is then offset by the scaled sums and normalized via normAcc().
+        // NOTE(review): presumably the image_sum_* / image_sqsum_* tables are integral
+        // images - confirm against the caller.
+        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
+                int w, int h, float weight,
+                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
+                float templ_sqsum_scale,
+                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
+                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
+                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
+                PtrStepSzf result)
+        {
+            const int x0 = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y0 = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x0 >= result.cols || y0 >= result.rows)
+                return;
+
+            // Far corner of the window anchored at (y0, x0).
+            const int x1 = x0 + w;
+            const int y1 = y0 + h;
+
+            const float sum_r = static_cast<float>(
+                    (image_sum_r.ptr(y1)[x1] - image_sum_r.ptr(y0)[x1]) -
+                    (image_sum_r.ptr(y1)[x0] - image_sum_r.ptr(y0)[x0]));
+            const float sqsum_r = static_cast<float>(
+                    (image_sqsum_r.ptr(y1)[x1] - image_sqsum_r.ptr(y0)[x1]) -
+                    (image_sqsum_r.ptr(y1)[x0] - image_sqsum_r.ptr(y0)[x0]));
+            const float sum_g = static_cast<float>(
+                    (image_sum_g.ptr(y1)[x1] - image_sum_g.ptr(y0)[x1]) -
+                    (image_sum_g.ptr(y1)[x0] - image_sum_g.ptr(y0)[x0]));
+            const float sqsum_g = static_cast<float>(
+                    (image_sqsum_g.ptr(y1)[x1] - image_sqsum_g.ptr(y0)[x1]) -
+                    (image_sqsum_g.ptr(y1)[x0] - image_sqsum_g.ptr(y0)[x0]));
+            const float sum_b = static_cast<float>(
+                    (image_sum_b.ptr(y1)[x1] - image_sum_b.ptr(y0)[x1]) -
+                    (image_sum_b.ptr(y1)[x0] - image_sum_b.ptr(y0)[x0]));
+            const float sqsum_b = static_cast<float>(
+                    (image_sqsum_b.ptr(y1)[x1] - image_sqsum_b.ptr(y0)[x1]) -
+                    (image_sqsum_b.ptr(y1)[x0] - image_sqsum_b.ptr(y0)[x0]));
+
+            const float num = result.ptr(y0)[x0] - sum_r * templ_sum_scale_r
+                                                 - sum_g * templ_sum_scale_g
+                                                 - sum_b * templ_sum_scale_b;
+            const float denum = sqrtf(templ_sqsum_scale * (sqsum_r - weight * sum_r * sum_r
+                                                           + sqsum_g - weight * sum_g * sum_g
+                                                           + sqsum_b - weight * sum_b * sum_b));
+            result.ptr(y0)[x0] = normAcc(num, denum);
+        }
+
+        // Host launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC3.
+        // Computes the per-channel template sum scales and the combined squared-sum
+        // scale, launches one thread per result element on `stream`, and waits for the
+        // device only when the default stream (0) is used.
+        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
+                    int w, int h,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                    PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y));
+
+            // Reciprocal of the window area.
+            const float weight = 1.f / (w * h);
+            const float templ_sum_scale_r = templ_sum_r * weight;
+            const float templ_sum_scale_g = templ_sum_g * weight;
+            const float templ_sum_scale_b = templ_sum_b * weight;
+            const float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+                                            + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+                                            + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
+
+            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, block, 0, stream>>>(
+                    w, h, weight,
+                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
+                    templ_sqsum_scale,
+                    image_sum_r, image_sqsum_r,
+                    image_sum_g, image_sqsum_g,
+                    image_sum_b, image_sqsum_b,
+                    result);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+
+
+        // Four-channel CCOFF_NORMED finalization, one thread per result element.
+        // For each channel the window sum and window squared sum are taken from the
+        // four corners of the w x h window in the per-channel tables; the value already
+        // in result(y, x) is then offset by the scaled sums and normalized via normAcc().
+        // NOTE(review): presumably the image_sum_* / image_sqsum_* tables are integral
+        // images - confirm against the caller.
+        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
+                int w, int h, float weight,
+                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
+                float templ_sum_scale_a, float templ_sqsum_scale,
+                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
+                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
+                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
+                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
+                PtrStepSzf result)
+        {
+            const int x0 = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y0 = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x0 >= result.cols || y0 >= result.rows)
+                return;
+
+            // Far corner of the window anchored at (y0, x0).
+            const int x1 = x0 + w;
+            const int y1 = y0 + h;
+
+            const float sum_r = static_cast<float>(
+                    (image_sum_r.ptr(y1)[x1] - image_sum_r.ptr(y0)[x1]) -
+                    (image_sum_r.ptr(y1)[x0] - image_sum_r.ptr(y0)[x0]));
+            const float sqsum_r = static_cast<float>(
+                    (image_sqsum_r.ptr(y1)[x1] - image_sqsum_r.ptr(y0)[x1]) -
+                    (image_sqsum_r.ptr(y1)[x0] - image_sqsum_r.ptr(y0)[x0]));
+            const float sum_g = static_cast<float>(
+                    (image_sum_g.ptr(y1)[x1] - image_sum_g.ptr(y0)[x1]) -
+                    (image_sum_g.ptr(y1)[x0] - image_sum_g.ptr(y0)[x0]));
+            const float sqsum_g = static_cast<float>(
+                    (image_sqsum_g.ptr(y1)[x1] - image_sqsum_g.ptr(y0)[x1]) -
+                    (image_sqsum_g.ptr(y1)[x0] - image_sqsum_g.ptr(y0)[x0]));
+            const float sum_b = static_cast<float>(
+                    (image_sum_b.ptr(y1)[x1] - image_sum_b.ptr(y0)[x1]) -
+                    (image_sum_b.ptr(y1)[x0] - image_sum_b.ptr(y0)[x0]));
+            const float sqsum_b = static_cast<float>(
+                    (image_sqsum_b.ptr(y1)[x1] - image_sqsum_b.ptr(y0)[x1]) -
+                    (image_sqsum_b.ptr(y1)[x0] - image_sqsum_b.ptr(y0)[x0]));
+            const float sum_a = static_cast<float>(
+                    (image_sum_a.ptr(y1)[x1] - image_sum_a.ptr(y0)[x1]) -
+                    (image_sum_a.ptr(y1)[x0] - image_sum_a.ptr(y0)[x0]));
+            const float sqsum_a = static_cast<float>(
+                    (image_sqsum_a.ptr(y1)[x1] - image_sqsum_a.ptr(y0)[x1]) -
+                    (image_sqsum_a.ptr(y1)[x0] - image_sqsum_a.ptr(y0)[x0]));
+
+            const float num = result.ptr(y0)[x0] - sum_r * templ_sum_scale_r - sum_g * templ_sum_scale_g
+                                                 - sum_b * templ_sum_scale_b - sum_a * templ_sum_scale_a;
+            const float denum = sqrtf(templ_sqsum_scale * (sqsum_r - weight * sum_r * sum_r
+                                                           + sqsum_g - weight * sum_g * sum_g
+                                                           + sqsum_b - weight * sum_b * sum_b
+                                                           + sqsum_a - weight * sum_a * sum_a));
+            result.ptr(y0)[x0] = normAcc(num, denum);
+        }
+
+        // Host launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC4.
+        // Computes the per-channel template sum scales and the combined squared-sum
+        // scale, launches one thread per result element on `stream`, and waits for the
+        // device only when the default stream (0) is used.
+        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
+                    int w, int h,
+                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
+                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
+                    PtrStepSzf result, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y));
+
+            // Reciprocal of the window area.
+            const float weight = 1.f / (w * h);
+            const float templ_sum_scale_r = templ_sum_r * weight;
+            const float templ_sum_scale_g = templ_sum_g * weight;
+            const float templ_sum_scale_b = templ_sum_b * weight;
+            const float templ_sum_scale_a = templ_sum_a * weight;
+            const float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+                                            + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+                                            + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+                                            + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
+
+            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, block, 0, stream>>>(
+                    w, h, weight,
+                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
+                    templ_sqsum_scale,
+                    image_sum_r, image_sqsum_r,
+                    image_sum_g, image_sqsum_g,
+                    image_sum_b, image_sqsum_b,
+                    image_sum_a, image_sqsum_a,
+                    result);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // normalize
+
+        // Normalizes result(y, x) in place by sqrt(window squared sum * templ_sqsum)
+        // using normAcc(). The squared-sum table is indexed with a column stride of
+        // cn, i.e. at columns x * cn and (x + w) * cn.
+        // NOTE(review): presumably image_sqsum is an integral image over interleaved
+        // channels - confirm against the caller.
+        template <int cn>
+        __global__ void normalizeKernel_8U(
+                int w, int h, const PtrStep<unsigned long long> image_sqsum,
+                unsigned long long templ_sqsum, PtrStepSzf result)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (col >= result.cols || row >= result.rows)
+                return;
+
+            // Left/right table columns and top/bottom table rows of the window.
+            const int left = col * cn;
+            const int right = (col + w) * cn;
+            const unsigned long long* top = image_sqsum.ptr(row);
+            const unsigned long long* bottom = image_sqsum.ptr(row + h);
+
+            const float win_sqsum = static_cast<float>(
+                    (bottom[right] - top[right]) -
+                    (bottom[left] - top[left]));
+
+            float* out = result.ptr(row);
+            out[col] = normAcc(out[col], sqrtf(win_sqsum * templ_sqsum));
+        }
+
+        // Host launcher for normalizeKernel_8U<cn>.
+        // Selects the kernel instantiation matching cn (1..4); any other cn launches
+        // nothing. Runs on `stream` and waits for the device only when the default
+        // stream (0) is used.
+        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
+                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y));
+
+            if (cn == 1)
+                normalizeKernel_8U<1><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+            else if (cn == 2)
+                normalizeKernel_8U<2><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+            else if (cn == 3)
+                normalizeKernel_8U<3><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+            else if (cn == 4)
+                normalizeKernel_8U<4><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // extractFirstChannel
+
+        // Copies the first component of each cn-channel float pixel of `image` into
+        // the single-channel `result`, one thread per result element.
+        template <int cn>
+        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
+        {
+            typedef typename TypeVec<float, cn>::vec_type pixel_t;
+
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (col >= result.cols || row >= result.rows)
+                return;
+
+            // The byte row is reinterpreted as a row of cn-component float vectors.
+            const pixel_t* src_row = reinterpret_cast<const pixel_t*>(image.ptr(row));
+            pixel_t pixel = src_row[col];
+            result.ptr(row)[col] = first(pixel);
+        }
+
+        // Host launcher for the extractFirstChannel_32F<cn> kernel.
+        // Selects the instantiation matching cn (1..4); any other cn launches nothing.
+        // Runs on `stream` and waits for the device only when the default stream (0)
+        // is used.
+        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y));
+
+            if (cn == 1)
+                extractFirstChannel_32F<1><<<grid, block, 0, stream>>>(image, result);
+            else if (cn == 2)
+                extractFirstChannel_32F<2><<<grid, block, 0, stream>>>(image, result);
+            else if (cn == 3)
+                extractFirstChannel_32F<3><<<grid, block, 0, stream>>>(image, result);
+            else if (cn == 4)
+                extractFirstChannel_32F<4><<<grid, block, 0, stream>>>(image, result);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+    } //namespace match_template
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/nlm.cu b/modules/gpuimgproc/src/cuda/nlm.cu
new file mode 100644
index 0000000000..92bfccf37c
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/nlm.cu
@@ -0,0 +1,569 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/reduce.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+
+using namespace cv::gpu;
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+//////////////////////////////////////////////////////////////////////////////////
+//// Non Local Means Denoising
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        // Squared Euclidean norm (sum of squared components) for 1- to 4-component
+        // float values. nlm_kernel uses these to accumulate patch distances.
+        __device__ __forceinline__ float norm2(const float& v) { return v*v; }
+        __device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
+        __device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
+        __device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z  + v.w*v.w; }
+
+        // Brute-force non-local-means kernel, one thread per dst pixel (i = row, j = column).
+        //
+        // For every offset (y, x) in the (2 * search_radius + 1)^2 search window the
+        // thread accumulates dist2, the sum of norm2() differences between the patch
+        // around (i, j) and the patch around (i + y, j + x), both of side
+        // 2 * block_radius + 1. The offset's weight is
+        //     w = exp(dist2 * noise_mult + (x^2 + y^2) * minus_search_window2_inv)
+        // and dst(i, j) is the weighted average sum1 / sum2 of the window's pixels.
+        //
+        //   src           - input image, read directly when the whole neighbourhood is in range
+        //   dst           - output image; its cols/rows bound the thread range
+        //   b             - border functor; b.at(row, col, src) is used near the image edge
+        //   search_radius - half-size of the search window
+        //   block_radius  - half-size of the comparison patch
+        //   noise_mult    - multiplier applied to dist2 (nlm_caller passes a negative value
+        //                   derived from h, the channel count and the patch area)
+        //
+        // NOTE(review): the loop counters are floats and are used directly as
+        // coordinates; presumably they are converted to int by src() / b.at() - confirm.
+        template<typename T, typename B>
+        __global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)
+        {
+            // Float vector type with the same channel count as T, used for accumulation.
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
+
+            const int i = blockDim.y * blockIdx.y + threadIdx.y;
+            const int j = blockDim.x * blockIdx.x + threadIdx.x;
+
+            if (j >= dst.cols || i >= dst.rows)
+                return;
+
+            // Largest offset from (i, j) touched by any patch read below.
+            int bsize = search_radius + block_radius;
+            int search_window = 2 * search_radius + 1;
+            // Negative factor for the spatial (x^2 + y^2) term of the weight.
+            float minus_search_window2_inv = -1.f/(search_window * search_window);
+
+            value_type sum1 = VecTraits<value_type>::all(0);
+            float sum2 = 0.f;
+
+            // Interior case: every read stays inside the image, so src is indexed directly.
+            if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
+            {
+                for(float y = -search_radius; y <= search_radius; ++y)
+                    for(float x = -search_radius; x <= search_radius; ++x)
+                    {
+                        float dist2 = 0;
+                        for(float ty = -block_radius; ty <= block_radius; ++ty)
+                            for(float tx = -block_radius; tx <= block_radius; ++tx)
+                            {
+                                value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
+                                value_type av = saturate_cast<value_type>(src(i +     ty, j +     tx));
+
+                                dist2 += norm2(av - bv);
+                            }
+
+                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
+
+                        /*if (i == 255 && j == 255)
+                            printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/
+
+                        sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
+                        sum2 += w;
+                    }
+            }
+            else
+            {
+                // Border case: same computation, but every read goes through the border functor.
+                for(float y = -search_radius; y <= search_radius; ++y)
+                    for(float x = -search_radius; x <= search_radius; ++x)
+                    {
+                        float dist2 = 0;
+                        for(float ty = -block_radius; ty <= block_radius; ++ty)
+                            for(float tx = -block_radius; tx <= block_radius; ++tx)
+                            {
+                                value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
+                                value_type av = saturate_cast<value_type>(b.at(i +     ty, j +     tx, src));
+                                dist2 += norm2(av - bv);
+                            }
+
+                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
+
+                        sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
+                        sum2 += w;
+                    }
+
+            }
+
+            // Weighted average, converted back to the pixel type with saturation.
+            dst(i, j) = saturate_cast<T>(sum1 / sum2);
+
+        }
+
+        // Host launcher for nlm_kernel<T, B<T> >.
+        //
+        //   src, dst      - byte views of the input/output images, reinterpreted as PtrStepSz<T>
+        //   search_radius - half-size of the search window
+        //   block_radius  - half-size of the comparison patch
+        //   h             - filter strength; enters the weights as -1 / (h^2 * channels * patch area)
+        //   stream        - CUDA stream the kernel is enqueued on
+        //
+        // The kernel is launched on `stream`, like the other launchers in this module;
+        // the previous launch omitted the stream argument and therefore always ran on
+        // the default stream. The device is synchronized only when the default
+        // stream (0) is used.
+        template<typename T, template <typename> class B>
+        void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)
+        {
+            dim3 block (32, 8);
+            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
+
+            // Border functor constructed from the image dimensions.
+            B<T> b(src.rows, src.cols);
+
+            int block_window = 2 * block_radius + 1;
+            float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
+            // Scale applied to the accumulated patch distance inside the kernel.
+            float noise_mult = minus_h2_inv/(block_window * block_window);
+
+            cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
+            nlm_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
+            cudaSafeCall ( cudaGetLastError () );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Entry point for brute-force non-local means on pixels of type T.
+        // Picks the nlm_caller instantiation for the requested border handling and
+        // forwards all arguments to it. borderMode indexes the table below directly
+        // and is not range-checked here.
+        // NOTE(review): the table order presumably mirrors the module's GPU border-mode
+        // enumeration - confirm against the host-side caller.
+        template<typename T>
+        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)
+        {
+            typedef void (*caller_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
+
+            static const caller_t callers[] =
+            {
+                nlm_caller<T, BrdReflect101>,
+                nlm_caller<T, BrdReplicate>,
+                nlm_caller<T, BrdConstant>,
+                nlm_caller<T, BrdReflect>,
+                nlm_caller<T, BrdWrap>,
+            };
+
+            const caller_t caller = callers[borderMode];
+            caller(src, dst, search_radius, block_radius, h, stream);
+        }
+
+        // Explicit instantiations for uchar, uchar2 and uchar3 pixels, so the template
+        // is emitted in this translation unit for those types.
+        template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
+        template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
+        template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
+    }
+}}}
+
+//////////////////////////////////////////////////////////////////////////////////
+//// Non Local Means Denoising (fast approximate version)
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+
+        // Helper that bundles one scalar plus a cn-component value into thrust
+        // tuples (buffer pointers, value references, operators). The primary
+        // template is only declared; each channel count has its own specialization.
+        template <int cn> struct Unroll;
+
+        // cn == 1: two floats.
+        template <> struct Unroll<1>
+        {
+            // Tuple of two buffer slices, each BLOCK_SIZE floats apart.
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                float* const slice0 = smem;
+                float* const slice1 = smem + BLOCK_SIZE;
+                return cv::gpu::cudev::smem_tuple(slice0, slice1);
+            }
+
+            // Tuple of references to both values.
+            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
+            {
+                return thrust::tie(val1, val2);
+            }
+
+            // One plus<float> functor per tuple element.
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
+            {
+                return thrust::make_tuple(plus<float>(), plus<float>());
+            }
+        };
+        // cn == 2: one scalar plus the two components of a float2 (three floats).
+        template <> struct Unroll<2>
+        {
+            // Tuple of three buffer slices, each BLOCK_SIZE floats apart.
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                float* const slice0 = smem;
+                float* const slice1 = smem + BLOCK_SIZE;
+                float* const slice2 = smem + 2 * BLOCK_SIZE;
+                return cv::gpu::cudev::smem_tuple(slice0, slice1, slice2);
+            }
+
+            // Tuple of references to the scalar and to each component of val2.
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y);
+            }
+
+            // One plus<float> functor per tuple element.
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
+            {
+                return thrust::make_tuple(plus<float>(), plus<float>(), plus<float>());
+            }
+        };
+        // cn == 3: one scalar plus the three components of a float3 (four floats).
+        template <> struct Unroll<3>
+        {
+            // Tuple of four buffer slices, each BLOCK_SIZE floats apart.
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                float* const slice0 = smem;
+                float* const slice1 = smem + BLOCK_SIZE;
+                float* const slice2 = smem + 2 * BLOCK_SIZE;
+                float* const slice3 = smem + 3 * BLOCK_SIZE;
+                return cv::gpu::cudev::smem_tuple(slice0, slice1, slice2, slice3);
+            }
+
+            // Tuple of references to the scalar and to each component of val2.
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z);
+            }
+
+            // One plus<float> functor per tuple element.
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                return thrust::make_tuple(plus<float>(), plus<float>(), plus<float>(), plus<float>());
+            }
+        };
+        // cn == 4: one scalar plus the four components of a float4 (five floats).
+        template <> struct Unroll<4>
+        {
+            // Tuple of five buffer slices, each BLOCK_SIZE floats apart.
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                float* const slice0 = smem;
+                float* const slice1 = smem + BLOCK_SIZE;
+                float* const slice2 = smem + 2 * BLOCK_SIZE;
+                float* const slice3 = smem + 3 * BLOCK_SIZE;
+                float* const slice4 = smem + 4 * BLOCK_SIZE;
+                return cv::gpu::cudev::smem_tuple(slice0, slice1, slice2, slice3, slice4);
+            }
+
+            // Tuple of references to the scalar and to each component of val2.
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
+            }
+
+            // One plus<float> functor per tuple element.
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                return thrust::make_tuple(plus<float>(), plus<float>(), plus<float>(), plus<float>(), plus<float>());
+            }
+        };
+
+        // Squared distance between two 8-bit pixels, summed over channels.
+        // Components are promoted to int before subtraction, so differences may be negative.
+        __device__ __forceinline__ int calcDist(const uchar&  a, const uchar&  b)
+        {
+            const int d = a - b;
+            return d * d;
+        }
+        __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b)
+        {
+            const int dx = a.x - b.x;
+            const int dy = a.y - b.y;
+            return dx * dx + dy * dy;
+        }
+        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b)
+        {
+            const int dx = a.x - b.x;
+            const int dy = a.y - b.y;
+            const int dz = a.z - b.z;
+            return dx * dx + dy * dy + dz * dz;
+        }
+
+        template <class T> struct FastNonLocalMenas
+        { // Device functor behind fast_nlm_kernel; each thread block handles one TILE_COLS x TILE_ROWS tile of dst.
+            enum
+            {
+                CTA_SIZE = 128, // threads per block (nlm_fast_gpu launches with block.x == CTA_SIZE)
+
+                TILE_COLS = 128, // tile width in pixels
+                TILE_ROWS = 32,  // tile height in pixels
+
+                STRIDE = CTA_SIZE // step of each thread's loop over the search-window positions
+            };
+
+            struct plus // NOTE(review): not referenced by any member shown in this struct -- possibly a leftover
+            {
+                __device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
+            };
+
+            int search_radius; // search_window / 2
+            int block_radius;  // block_window / 2
+
+            int search_window; // side length of the square search window, in pixels
+            int block_window;  // side length of the square comparison block, in pixels
+            float minus_h2_inv; // -1 / (h * h * channel count); multiplies the mean block distance inside __expf
+
+            FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
+                search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
+
+            PtrStep<T> src; // input image; assigned by the host launcher before the kernel call
+            mutable PtrStepi buffer; // scratch memory that operator() partitions into col_sums and up_col_sums
+            // Builds every sum for pixel (i, j) from scratch: per search-window offset, the total calcDist between the block around (i, j) and the block around the offset pixel.
+            __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+            {
+                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+                {
+                    dist_sums[index] = 0;
+
+                    for(int tx = 0; tx < block_window; ++tx)
+                        col_sums(tx, index) = 0;
+
+                    int y = index / search_window;     // row of this entry inside the search window
+                    int x = index - y * search_window; // column of this entry inside the search window
+
+                    int ay = i; // centre of the reference block
+                    int ax = j;
+
+                    int by = i + y - search_radius; // centre of the candidate block
+                    int bx = j + x - search_radius;
+
#if 1
+                    for (int tx = -block_radius; tx <= block_radius; ++tx)
+                    {
+                        int col_sum = 0;
+                        for (int ty = -block_radius; ty <= block_radius; ++ty)
+                        {
+                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx)); // NOTE(review): no bounds checks on src -- presumably the caller pads the image; confirm
+
+                            dist_sums[index] += dist;
+                            col_sum += dist;
+                        }
+                        col_sums(tx + block_radius, index) = col_sum; // one partial sum per block column
+                    }
#else
+                    for (int ty = -block_radius; ty <= block_radius; ++ty)
+                        for (int tx = -block_radius; tx <= block_radius; ++tx)
+                        {
+                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
+
+                            dist_sums[index] += dist;
+                            col_sums(tx + block_radius, index) += dist;
+                        }
#endif
+
+                    up_col_sums(j, index) = col_sums(block_window - 1, index); // keep the right-most column sum; shiftRight_UpSums reads it for the next row
+                }
+            }
+            // Moves the block one pixel right within the tile's first row: the entering column (at j + block_radius) is summed directly and replaces slot `first` of col_sums.
+            __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+            {
+                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+                {
+                    int y = index / search_window;
+                    int x = index - y * search_window;
+
+                    int ay = i;
+                    int ax = j + block_radius; // column entering the reference block
+
+                    int by = i + y - search_radius;
+                    int bx = j + x - search_radius + block_radius; // column entering the candidate block
+
+                    int col_sum = 0;
+
+                    for (int ty = -block_radius; ty <= block_radius; ++ty)
+                        col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
+
+                    dist_sums[index] += col_sum - col_sums(first, index); // add the entering column, drop the one stored in slot `first`
+
+                    col_sums(first, index) = col_sum;
+                    up_col_sums(j, index) = col_sum;
+                }
+            }
+            // Same shift for later rows: the entering column sum is derived from the one stored for the row above (up_col_sums) by adding the new bottom pixel pair and removing the old top pair.
+            __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+            {
+                int ay = i;
+                int ax = j + block_radius;
+
+                T a_up   = src(ay - block_radius - 1, ax); // pixel that left the column when moving down one row
+                T a_down = src(ay + block_radius, ax);     // pixel that entered the column
+
+                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+                {
+                    int y = index / search_window;
+                    int x = index - y * search_window;
+
+                    int by = i + y - search_radius;
+                    int bx = j + x - search_radius + block_radius;
+
+                    T b_up   = src(by - block_radius - 1, bx);
+                    T b_down = src(by + block_radius, bx);
+
+                    int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
+
+                    dist_sums[index] += col_sum  - col_sums(first, index);
+                    col_sums(first, index) = col_sum;
+                    up_col_sums(j, index) = col_sum;
+                }
+            }
+            // Turns the block distances into weights, accumulates the weighted search-window pixels, and lets thread 0 store the normalised result in dst.
+            __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
+
+                float weights_sum = 0;
+                sum_type sum = VecTraits<sum_type>::all(0);
+
+                float bw2_inv = 1.f/(block_window * block_window); // 1 / number of pixels in a block
+
+                int sx = j - search_radius; // top-left corner of the search window
+                int sy = i - search_radius;
+
+                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+                {
+                    int y = index / search_window;
+                    int x = index - y * search_window;
+
+                    float avg_dist = dist_sums[index] * bw2_inv;
+                    float weight = __expf(avg_dist * minus_h2_inv);
+                    weights_sum += weight;
+
+                    sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
+                }
+
+                __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)]; // per thread: one float for the weight sum plus one per channel
+                // NOTE(review): reduce<> is defined elsewhere; presumably it sums the per-thread partials over the block so thread 0 holds the totals -- confirm
+                reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
+                                 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
+                                 threadIdx.x,
+                                 Unroll<VecTraits<T>::cn>::op());
+
+                if (threadIdx.x == 0)
+                    dst = saturate_cast<T>(sum / weights_sum);
+            }
+            // Kernel body: walks the block's tile row by row, left to right, updating the sums incrementally and writing one dst pixel per step.
+            __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
+            {
+                int tbx = blockIdx.x * TILE_COLS; // tile origin
+                int tby = blockIdx.y * TILE_ROWS;
+
+                int tex = ::min(tbx + TILE_COLS, dst.cols); // exclusive tile end, clipped to the image
+                int tey = ::min(tby + TILE_ROWS, dst.rows);
+
+                PtrStepi col_sums; // block_window buffer rows per blockIdx.x, placed after the first dst.cols rows
+                col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
+                col_sums.step = buffer.step;
+
+                PtrStepi up_col_sums; // starts at buffer row 0 and is indexed by image column j; blockIdx.y selects the column slice
+                up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
+                up_col_sums.step = buffer.step;
+
+                extern __shared__ int dist_sums[]; //search_window * search_window
+
+                int first = 0; // col_sums slot that the next right shift overwrites
+
+                for (int i = tby; i < tey; ++i)
+                    for (int j = tbx; j < tex; ++j)
+                    {
+                        __syncthreads(); // barrier between finishing the previous pixel and updating the sums
+
+                        if (j == tbx)
+                        {
+                            initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
+                            first = 0;
+                        }
+                        else
+                        {
+                            if (i == tby)
+                              shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
+                            else
+                              shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
+
+                            first = (first + 1) % block_window;
+                        }
+
+                        __syncthreads(); // barrier before the sums are consumed
+
+                        convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
+                    }
+            }
+
+        };
+
+        template<typename T>
+        __global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); } // kernel entry point: forwards to the functor's operator()
+
+        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
+        {
+            typedef FastNonLocalMenas<uchar> Tiling; // only the tile constants are needed, they do not depend on the pixel type
+            const dim3 tiles(divUp(src.cols, Tiling::TILE_COLS), divUp(src.rows, Tiling::TILE_ROWS)); // same grid as nlm_fast_gpu launches
+            const int window_area = search_window * search_window;
+            buffer_cols = window_area * tiles.y;             // one search-window-sized slice per tile row
+            buffer_rows = src.cols + block_window * tiles.x; // a row per image column plus block_window rows per tile column
+        }
+
+        template<typename T>
+        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
+                          int search_window, int block_window, float h, cudaStream_t stream)
+        {
+            // Host launcher for fast_nlm_kernel: one block of CTA_SIZE threads per TILE_COLS x TILE_ROWS tile of src.
+            typedef FastNonLocalMenas<T> FNLM;
+            FNLM fnlm(search_window, block_window, h);
+            fnlm.src = (PtrStepSz<T>)src;
+            fnlm.buffer = buffer; // scratch for the running sums; presumably sized via nln_fast_get_buffer_size -- confirm at the call site
+
+            dim3 block(FNLM::CTA_SIZE, 1);
+            dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
+            int smem = search_window * search_window * sizeof(int); // dynamic shared memory: one int per search-window position
+            // Launch on the caller's stream (it used to be ignored, so the kernel always ran on the default stream);
+            // as before, only the default stream (stream == 0) is synchronized here.
+            fast_nlm_kernel<<<grid, block, smem, stream>>>(fnlm, (PtrStepSz<T>)dst);
+            cudaSafeCall ( cudaGetLastError () );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float,  cudaStream_t); // explicit instantiations for uchar, uchar2 and uchar3 pixels
+        template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
+        template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
+
+
+
+        __global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
+        {
+            // One thread per pixel: channel x of lab goes to l, channels y and z go to ab.
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (col >= lab.cols || row >= lab.rows)
+                return; // this thread lies outside the image
+
+            const uchar3 px = lab(row, col);
+            ab(row, col) = make_uchar2(px.y, px.z);
+            l(row, col) = px.x;
+        }
+
+        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
+        {
+            // Splits the 3-channel lab image into l (first channel) and ab (remaining two), one thread per pixel.
+            dim3 b(32, 8);
+            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
+            fnlm_split_kernel<<<g, b, 0, stream>>>(lab, l, ab); // run on the caller's stream (previously ignored: always the default stream)
+            cudaSafeCall ( cudaGetLastError () );
+            if (stream == 0) // only the default stream is synchronized here
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        __global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)
+        {
+            // One thread per pixel: lab(row, col) = (l, ab.x, ab.y).
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+            if (col >= lab.cols || row >= lab.rows)
+                return; // this thread lies outside the image
+
+            const uchar2 ab_px = ab(row, col);
+            lab(row, col) = make_uchar3(l(row, col), ab_px.x, ab_px.y);
+        }
+
+        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)
+        {
+            // Recombines l and the 2-channel ab image into the 3-channel lab image, one thread per pixel.
+            dim3 b(32, 8);
+            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
+            fnlm_merge_kernel<<<g, b, 0, stream>>>(l, ab, lab); // run on the caller's stream (previously ignored: always the default stream)
+            cudaSafeCall ( cudaGetLastError () );
+            if (stream == 0) // only the default stream is synchronized here
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+}}}
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/pyr_down.cu b/modules/gpuimgproc/src/cuda/pyr_down.cu
new file mode 100644
index 0000000000..904f549bad
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/pyr_down.cu
@@ -0,0 +1,228 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
+        { // 5-tap (1/16, 1/4, 3/8, 1/4, 1/16) filter applied vertically, then horizontally, keeping every second row and column.
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
+            // Vertically filtered source row: one entry per thread plus a 2-column halo on each side (hard-wired to 256 threads per block).
+            __shared__ work_t smem[256 + 4];
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x; // source column handled by this thread
+            const int y = blockIdx.y;                            // destination row
+
+            const int src_y = 2 * y; // source row at the centre of the vertical filter
+            // Interior case: every tap, including the halo columns x - 2 and x + 2, lies inside src, so it is read directly.
+            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
+            {
+                {
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, x);
+                    sum = sum + 0.25f   * src(src_y - 1, x);
+                    sum = sum + 0.375f  * src(src_y    , x);
+                    sum = sum + 0.25f   * src(src_y + 1, x);
+                    sum = sum + 0.0625f * src(src_y + 2, x);
+
+                    smem[2 + threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x < 2) // the first two threads also fill the left halo
+                {
+                    const int left_x = x - 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, left_x);
+                    sum = sum + 0.25f   * src(src_y - 1, left_x);
+                    sum = sum + 0.375f  * src(src_y    , left_x);
+                    sum = sum + 0.25f   * src(src_y + 1, left_x);
+                    sum = sum + 0.0625f * src(src_y + 2, left_x);
+
+                    smem[threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x > 253) // the last two threads (254, 255) also fill the right halo
+                {
+                    const int right_x = x + 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, right_x);
+                    sum = sum + 0.25f   * src(src_y - 1, right_x);
+                    sum = sum + 0.375f  * src(src_y    , right_x);
+                    sum = sum + 0.25f   * src(src_y + 1, right_x);
+                    sum = sum + 0.0625f * src(src_y + 2, right_x);
+
+                    smem[4 + threadIdx.x] = sum;
+                }
+            }
+            else // near an image edge: row/column indices are remapped through the border object b
+            {
+                {
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
+
+                    smem[2 + threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x < 2)
+                {
+                    const int left_x = x - 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
+
+                    smem[threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x > 253)
+                {
+                    const int right_x = x + 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
+
+                    smem[4 + threadIdx.x] = sum;
+                }
+            }
+
+            __syncthreads(); // the horizontal pass below reads entries written by other threads
+
+            if (threadIdx.x < 128) // horizontal pass with decimation: thread t filters around column 2 * t of this block's span
+            {
+                const int tid2 = threadIdx.x * 2;
+
+                work_t sum;
+
+                sum =       0.0625f * smem[2 + tid2 - 2];
+                sum = sum + 0.25f   * smem[2 + tid2 - 1];
+                sum = sum + 0.375f  * smem[2 + tid2    ];
+                sum = sum + 0.25f   * smem[2 + tid2 + 1];
+                sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2; // destination column for source column 2 * t
+
+                if (dst_x < dst_cols)
+                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
+            }
+        }
+
+        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+        {
+            // The pyrDown kernel's shared-memory row and halo indices are written for exactly 256 threads per block.
+            const dim3 threads(256);
+            // One block per 256 source columns (x) and per destination row (y).
+            const dim3 blocks(divUp(src.cols, threads.x), dst.rows);
+            B<T> border(src.rows, src.cols);
+
+            pyrDown<T><<<blocks, threads, 0, stream>>>(src, dst, border, dst.cols);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0) // default stream: wait here until the kernel has finished
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+        { // Typed entry point: casts the byte-typed src/dst to PtrStepSz<T> and calls pyrDown_caller with the BrdReflect101 border policy.
+            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
+        }
+
+        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations; the commented-out element types below are not built
+        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/pyr_up.cu b/modules/gpuimgproc/src/cuda/pyr_up.cu
new file mode 100644
index 0000000000..36a72274cf
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/pyr_up.cu
@@ -0,0 +1,196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
+        { // 2x upsampling: loads a source patch, filters it horizontally and then vertically in shared memory, and writes 4 * result to dst.
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x; // destination column
+            const int y = blockIdx.y * blockDim.y + threadIdx.y; // destination row
+
+            __shared__ sum_t s_srcPatch[10][10]; // source pixels for this block (8x8 when launched with 16x16 threads) plus a 1-pixel border
+            __shared__ sum_t s_dstPatch[20][16]; // horizontally filtered rows: 16 block rows plus 2 halo rows above and below
+
+            if (threadIdx.x < 10 && threadIdx.y < 10) // the first 10x10 threads load the source patch
+            {
+                int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
+                int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
+                // Border handling: negative indices are mirrored, indices past the last row/column are clamped.
+                srcx = ::abs(srcx);
+                srcx = ::min(src.cols - 1, srcx);
+
+                srcy = ::abs(srcy);
+                srcy = ::min(src.rows - 1, srcy);
+
+                s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
+            }
+
+            __syncthreads();
+
+            sum_t sum = VecTraits<sum_t>::all(0);
+
+            const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0); // 1 for even columns: enables the 0.0625 / 0.375 / 0.0625 taps
+            const int oddFlag  = static_cast<int>((threadIdx.x & 1) != 0); // 1 for odd columns: enables the two 0.25 taps
+            const bool eveny = ((threadIdx.y & 1) == 0);
+            const int tidx = threadIdx.x;
+
+            if (eveny) // odd rows get no contribution in the horizontal pass (sum stays 0)
+            {
+                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
+                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
+                sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx    ) >> 1)];
+                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
+                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
+            }
+
+            s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
+
+            if (threadIdx.y < 2) // thread rows 0 and 1 also fill the two top halo rows, from source patch row 0
+            {
+                sum = VecTraits<sum_t>::all(0);
+
+                if (eveny)
+                {
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+                }
+
+                s_dstPatch[threadIdx.y][threadIdx.x] = sum;
+            }
+
+            if (threadIdx.y > 13) // thread rows 14 and 15 also fill the two bottom halo rows, from source patch row 9
+            {
+                sum = VecTraits<sum_t>::all(0);
+
+                if (eveny)
+                {
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+                }
+
+                s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
+            }
+
+            __syncthreads(); // the vertical pass below reads rows written by other threads
+
+            sum = VecTraits<sum_t>::all(0);
+
+            const int tidy = threadIdx.y;
+            // Vertical 5-tap pass over the horizontally filtered rows.
+            sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
+            sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][threadIdx.x];
+            sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][threadIdx.x];
+            sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][threadIdx.x];
+            sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
+
+            if (x < dst.cols && y < dst.rows)
+                dst(y, x) = saturate_cast<T>(4.0f * sum); // NOTE(review): the factor 4 presumably compensates for the zero-valued samples -- confirm
+        }
+
+        template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+        {
+            // One 16x16 thread block per 16x16 patch of dst; the pyrUp kernel's shared-memory patches are sized for this shape.
+            const dim3 threads(16, 16);
+            const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+            pyrUp<<<blocks, threads, 0, stream>>>(src, dst);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0) // default stream: wait here until the kernel has finished
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+        { // Typed entry point: casts the byte-typed src/dst to PtrStepSz<T> and forwards to pyrUp_caller.
+            pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
+        }
+
+        template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations; the commented-out element types below are not built
+        //template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/remap.cu b/modules/gpuimgproc/src/cuda/remap.cu
new file mode 100644
index 0000000000..dd2c669159
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/remap.cu
@@ -0,0 +1,274 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        // Remap kernel: each thread fills one dst pixel with the src sample taken at the coordinates stored for it in mapx/mapy.
+        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return; // this thread lies outside the destination image
+
+            const float srcX = mapx.ptr(row)[col];
+            const float srcY = mapy.ptr(row)[col];
+            dst.ptr(row)[col] = saturate_cast<T>(src(srcY, srcX));
+        }
+
+        // Launcher for an explicit stream: reads src through PtrStep (no texture) and enqueues the kernel without synchronizing.
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+                typedef BorderReader< PtrStep<T>, B<work_type> > bordered_src_t;
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+                // Source access chain: raw pixels -> border extrapolation -> interpolation filter.
+                B<work_type> border(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                bordered_src_t borderedSrc(src, border);
+                Filter<bordered_src_t> sampler(borderedSrc);
+                remap<<<blocks, threads, 0, stream>>>(sampler, mapx, mapy, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        // Generic default-stream launcher (types without a texture specialization): reads src through PtrStep and waits for the kernel.
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
+            {
+                // Only the texture-backed specializations consume these; the casts silence unused-parameter warnings.
+                (void)srcWhole; (void)xoff; (void)yoff;
+
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+                typedef BorderReader< PtrStep<T>, B<work_type> > bordered_src_t;
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                B<work_type> border(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                bordered_src_t borderedSrc(src, border);
+                Filter<bordered_src_t> sampler(borderedSrc);
+
+                remap<<<blocks, threads>>>(sampler, mapx, mapy, dst);
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() ); // block until the kernel has finished
+            }
+        };
+
+        #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) /* per pixel type: a texture reference, a reader functor over it, and texture-backed RemapDispatcherNonStream specializations */ \
+            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_remap_ ## type ## _reader /* functor: reads the texture at (x + xoff, y + yoff) */ \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                int xoff, yoff; \
+                tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> /* any border mode B: texture reader wrapped in a BorderReader */ \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float* borderValue, bool cc20) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc20 ? 8 : 4); /* block height is 8 when cc20 is set, 4 otherwise */ \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , srcWhole); /* texture is bound to srcWhole; the reader adds xoff/yoff to every lookup */ \
+                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
+                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); /* block until the kernel has finished */ \
+                } \
+            }; \
+            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> /* BrdReplicate: more specialized variant of the dispatcher above */ \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float*, bool) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , srcWhole); \
+                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) /* src spans all of srcWhole: no BorderReader; presumably the texture's cudaAddressModeClamp already replicates the edges */ \
+                    { \
+                        Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
+                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    } \
+                    else /* src is smaller than srcWhole: replicate explicitly at the borders of src */ \
+                    { \
+                        BrdReplicate<type> brd(src.rows, src.cols); \
+                        BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
+                        Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
+                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); /* block until the kernel has finished */ \
+                } \
+            };
+        // Texture-backed dispatchers are generated only for 1- and 4-channel uchar, ushort, short and float; the commented-out types stay on the generic RemapDispatcherNonStream.
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
+
+        // Chooses the launcher by stream: any explicit stream takes the asynchronous path, the default stream (0) the synchronizing one.
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
+            {
+                if (stream != 0)
+                    return RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
+
+                RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
+            }
+        };
+
+        // Host entry point: casts the byte views to T and forwards to the dispatcher at [interpolation][borderMode]; the indices are not range-checked here.
+        template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
+            PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            typedef void (*caller_t)(PtrStepSz<T>, PtrStepSz<T>, int, int, PtrStepSzf, PtrStepSzf, PtrStepSz<T>, const float*, cudaStream_t, bool);
+
+            static const caller_t dispatch[3][5] =
+            {
+                {   // interpolation == 0: PointFilter
+                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
+                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
+                    RemapDispatcher<PointFilter, BrdWrap, T>::call
+                },
+                {   // interpolation == 1: LinearFilter
+                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
+                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
+                },
+                {   // interpolation == 2: CubicFilter
+                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
+                }
+            };
+
+            const caller_t launch = dispatch[interpolation][borderMode];
+            launch(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap, static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
+        }
+        // Explicit instantiations of remap_gpu: 1-, 3- and 4-channel uchar, ushort, short and float; the commented-out element types are not instantiated.
+        template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/resize.cu b/modules/gpuimgproc/src/cuda/resize.cu
new file mode 100644
index 0000000000..04c1fb2ac4
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/resize.cu
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <cfloat>
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+#include "opencv2/core/cuda/scan.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        // Resize kernel: each thread writes one dst pixel, sampled from src at the dst coordinates multiplied by (fx, fy).
+        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return; // this thread lies outside the destination image
+
+            const float srcX = col * fx;
+            const float srcY = row * fy;
+            dst(row, col) = saturate_cast<T>(src(srcY, srcX));
+        }
+
+        // Area-resize kernel: each thread writes one dst pixel by calling src with that pixel's integer coordinates.
+        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // fx/fy are accepted but not used in this body; the callers build the area filter passed as src with fx/fy.
+            if (col < dst.cols && row < dst.rows)
+                dst(row, col) = saturate_cast<T>(src(row, col));
+        }
+
+        // Launcher for an explicit stream: replicate-border src read through PtrStep; enqueues the kernel without synchronizing.
+        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                BrdReplicate<T> border(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdReplicate<T> > borderedSrc(src, border);
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > sampler(borderedSrc, fx, fy);
+                resize<<<blocks, threads, 0, stream>>>(sampler, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        // AreaFilter launcher: constant-border src (border value left at BrdConstant's default); synchronizes only on the default stream.
+        template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+                BrdConstant<T> border(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdConstant<T> > borderedSrc(src, border);
+                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > sampler(borderedSrc, fx, fy);
+                resize_area<<<blocks, threads, 0, stream>>>(sampler, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+                if (stream == 0) // default stream: block until the kernel has finished
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T> // IntegerAreaFilter launcher; same launch sequence as the AreaFilter one
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+                BrdConstant<T> border(src.rows, src.cols); // border value left at BrdConstant's default
+                BorderReader< PtrStep<T>, BrdConstant<T> > borderedSrc(src, border);
+                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > sampler(borderedSrc, fx, fy);
+                resize_area<<<blocks, threads, 0, stream>>>(sampler, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+                if (stream == 0) // default stream: block until the kernel has finished
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        // Generic default-stream launcher (types without a texture specialization): replicate-border src via PtrStep, then waits for the kernel.
+        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
+            {
+                // Only the texture-backed specializations consume these; the casts silence unused-parameter warnings.
+                (void)srcWhole; (void)xoff; (void)yoff;
+
+                typedef BorderReader< PtrStep<T>, BrdReplicate<T> > bordered_src_t;
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                BrdReplicate<T> border(src.rows, src.cols);
+                bordered_src_t borderedSrc(src, border);
+                Filter<bordered_src_t> sampler(borderedSrc);
+
+                resize<<<blocks, threads>>>(sampler, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() ); // block until the kernel has finished
+            }
+        };
+
+        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) /* per pixel type: a texture reference, a reader functor over it, and a texture-backed ResizeDispatcherNonStream specialization */ \
+            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_resize_ ## type ## _reader /* functor: reads the texture at (x + xoff, y + yoff) */ \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                const int xoff; \
+                const int yoff; \
+                __host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
+                } \
+            }; \
+            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > /* default-stream dispatcher that samples src through the texture */ \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_resize_ ## type, srcWhole); /* texture is bound to srcWhole; the reader adds xoff/yoff to every lookup */ \
+                    tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) /* src spans all of srcWhole: no BorderReader; presumably the texture's cudaAddressModeClamp already replicates the edges */ \
+                    { \
+                        Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    else /* src is smaller than srcWhole: replicate explicitly at the borders of src */ \
+                    { \
+                        BrdReplicate< type > brd(src.rows, src.cols); \
+                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
+                        Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); /* block until the kernel has finished */ \
+                } \
+            };
+        // Texture-backed dispatchers are generated only for 1- and 4-channel uchar, ushort, short and float; the commented-out types stay on the generic ResizeDispatcherNonStream.
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
+
+        // Chooses the launcher by stream: any explicit stream takes the asynchronous path, the default stream (0) the synchronizing one.
+        template <template <typename> class Filter, typename T> struct ResizeDispatcher
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                if (stream != 0)
+                    return ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
+                ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
+            }
+        };
+
+        // Area interpolation: always goes through ResizeDispatcherStream (whatever the stream) and picks the integer or general area filter.
+        template <typename T> struct ResizeDispatcher<AreaFilter, T>
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                (void)srcWhole; (void)xoff; (void)yoff; // not used on this path
+                // Scale factors within FLT_MIN of their rounded value are treated as integers.
+                const int iscale_x = static_cast<int>(round(fx));
+                const int iscale_y = static_cast<int>(round(fy));
+                const bool integerScale = std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN;
+                if (integerScale)
+                    ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
+                else
+                    ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
+            }
+        };
+
+        // Host entry point: casts the byte views to T and forwards to the dispatcher selected by `interpolation` (not range-checked here).
+        template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream)
+        {
+            typedef void (*caller_t)(PtrStepSz<T>, PtrStepSz<T>, int, int, float, float, PtrStepSz<T>, cudaStream_t);
+
+            // Indexed by interpolation: 0 point, 1 linear, 2 cubic, 3 area.
+            static const caller_t dispatch[4] =
+            {
+                ResizeDispatcher<PointFilter, T>::call,
+                ResizeDispatcher<LinearFilter, T>::call,
+                ResizeDispatcher<CubicFilter, T>::call,
+                ResizeDispatcher<AreaFilter, T>::call
+            };
+
+            // Area interpolation (3) is kept only when both factors exceed 1; otherwise linear (1) is used instead.
+            const bool areaNotApplicable = interpolation == 3 && (fx <= 1.f || fy <= 1.f);
+            const int method = areaNotApplicable ? 1 : interpolation;
+            dispatch[method](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy, static_cast< PtrStepSz<T> >(dst), stream);
+        }
+        // Explicit instantiations of resize_gpu: 1-, 3- and 4-channel uchar, ushort, short and float; the commented-out element types are not instantiated.
+        template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        //template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        //template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        // Trait exposing a scan_line_type per element type; empty by default, float for uchar. Appears unused in the rest of this file.
+        template <typename T> struct scan_traits {};
+        template <> struct scan_traits<uchar>
+        {
+            typedef float scan_line_type;
+        };
+
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cuda/warp.cu b/modules/gpuimgproc/src/cuda/warp.cu
new file mode 100644
index 0000000000..8c5a067d36
--- /dev/null
+++ b/modules/gpuimgproc/src/cuda/warp.cu
@@ -0,0 +1,389 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        __constant__ float c_warpMat[3 * 3]; // 3x3 coefficients stored row by row; the host wrappers below upload 6 (affine) or 9 (perspective) floats before launching
+
+        // Affine mapping: evaluates the first two rows (six coefficients) of c_warpMat at (x, y).
+        struct AffineTransform
+        {
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                const float mappedX = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]; // row 0
+                const float mappedY = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]; // row 1
+                return make_float2(mappedX, mappedY);
+            }
+        };
+
+        // Projective mapping: row 2 of c_warpMat gives the homogeneous divisor applied to rows 0 and 1.
+        struct PerspectiveTransform
+        {
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                // Reciprocal of the homogeneous term; a zero denominator is not guarded against.
+                const float invW = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
+                const float mappedX = invW * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
+                const float mappedY = invW * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
+                return make_float2(mappedX, mappedY);
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////
+        // Build Maps
+
+        // Kernel: one thread per map element; stores Transform::calcCoord(x, y) into xmap and ymap.
+        template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
+        {
+            const int col = threadIdx.x + blockIdx.x * blockDim.x;
+            const int row = threadIdx.y + blockIdx.y * blockDim.y;
+            // Bounds come from xmap; threads past its extent have nothing to write.
+            if (col >= xmap.cols || row >= xmap.rows)
+                return;
+
+            const float2 mapped = Transform::calcCoord(col, row);
+            xmap(row, col) = mapped.x;
+            ymap(row, col) = mapped.y;
+        }
+
+        // Launches buildWarpMaps<Transform> over the extent of xmap on `stream`; blocks only for stream 0.
+        template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(xmap.cols, threads.x), divUp(xmap.rows, threads.y));
+
+            buildWarpMaps<Transform><<<blocks, threads, 0, stream>>>(xmap, ymap);
+            cudaSafeCall( cudaGetLastError() ); // surfaces an error left by the launch
+            if (stream == 0) // default stream: wait for the kernel before returning
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Uploads the 2x3 affine coefficients into c_warpMat, then fills xmap/ymap via buildWarpMaps_caller.
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, sizeof(float) * 2 * 3) );
+            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
+        }
+
+        // Uploads the 3x3 perspective coefficients into c_warpMat, then fills xmap/ymap via buildWarpMaps_caller.
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, sizeof(float) * 3 * 3) );
+            buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////////
+        // Warp
+
+        // Kernel: each in-range thread writes one dst pixel sampled from src at Transform::calcCoord(x, y).
+        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
+        {
+            const int col = threadIdx.x + blockIdx.x * blockDim.x;
+            const int row = threadIdx.y + blockIdx.y * blockDim.y;
+
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const float2 srcPos = Transform::calcCoord(col, row);
+            dst.ptr(row)[col] = saturate_cast<T>(src(srcPos.y, srcPos.x)); // src is addressed as (y, x)
+        }
+
+        // Stream path: reads src through PtrStep<T> (no texture), launches on `stream` and does not synchronize.
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; // float vector with T's channel count
+                typedef BorderReader< PtrStep<T>, B<work_type> > bordered_src_t;
+
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+                B<work_type> border(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                bordered_src_t borderedSrc(src, border);
+                Filter<bordered_src_t> sampler(borderedSrc); // interpolation layered over the border reader
+                warp<Transform><<<blocks, threads, 0, stream>>>(sampler, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        // Default-stream path (generic T): reads src through PtrStep<T>, launches without a stream, then blocks.
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
+            {
+                (void)srcWhole; // the parent image and offsets are used only by the texture-backed specializations
+                (void)xoff;
+                (void)yoff;
+
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+                typedef BorderReader< PtrStep<T>, B<work_type> > bordered_src_t;
+
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                B<work_type> border(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                bordered_src_t borderedSrc(src, border);
+                Filter<bordered_src_t> sampler(borderedSrc);
+                warp<Transform><<<blocks, threads>>>(sampler, dst);
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() ); // this path always waits for the kernel
+            }
+        };
+
+        #define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) /* emits a texture reference, a reader functor and two WarpDispatcherNonStream specializations for type */ \
+            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); /* point filtering, clamp addressing */ \
+            struct tex_warp_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                int xoff, yoff; \
+                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); /* shift by the stored offsets: the texture is bound to srcWhole, not src */ \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> /* any border mode B */ \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc20 ? 8 : 4); /* 32x8 threads when cc20 is true, otherwise 32x4 */ \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); /* NOTE(review): no matching unbind appears in this macro */ \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
+                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> /* replicate border: borderValue and cc20 are ignored */ \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) /* src has the size of the bound image: no BorderReader, the texture clamp addressing is used instead */ \
+                    { \
+                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    else /* src is smaller than the bound image: replicate explicitly at the bounds of src */ \
+                    { \
+                        BrdReplicate<type> brd(src.rows, src.cols); \
+                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
+                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar) // texture path is generated only for 1- and 4-channel uchar/ushort/short/float; any other T uses the generic WarpDispatcherNonStream
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_WARP_TEX // the generator macro is not needed past this point
+
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher // stream 0 takes the blocking path, any other stream the asynchronous one
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
+            {
+                if (stream != 0)
+                    WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20); // srcWhole/xoff/yoff are not forwarded
+                else
+                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
+            }
+        };
+
+        // Host-side dispatch: casts the byte-typed images to PtrStepSz<T> and calls the WarpDispatcher chosen by (interpolation, borderMode).
+        template <class Transform, typename T>
+        void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            typedef void (*dispatch_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
+            // table[interpolation][borderMode]; neither index is range-checked in this function.
+            static const dispatch_t table[3][5] =
+            {
+                { // row 0: PointFilter
+                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
+                },
+                { // row 1: LinearFilter
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
+                },
+                { // row 2: CubicFilter
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
+                }
+            };
+
+            table[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
+                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
+        }
+
+        // Copies the six affine coefficients into c_warpMat, then dispatches through warp_caller<AffineTransform, T>.
+        template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, sizeof(float) * 2 * 3) );
+
+            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
+        }
+
+        template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); // explicit instantiations: 1/3/4-channel uchar, ushort, short, float; 2-channel, schar and int variants are commented out
+        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        // Copies the nine perspective coefficients into c_warpMat, then dispatches through warp_caller<PerspectiveTransform, T>.
+        template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, sizeof(float) * 3 * 3) );
+
+            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
+        }
+
+        template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); // explicit instantiations: 1/3/4-channel uchar, ushort, short, float; 2-channel, schar and int variants are commented out
+        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuimgproc/src/cvt_color_internal.h b/modules/gpuimgproc/src/cvt_color_internal.h
new file mode 100644
index 0000000000..010d832a25
--- /dev/null
+++ b/modules/gpuimgproc/src/cvt_color_internal.h
@@ -0,0 +1,274 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __cvt_color_internal_h__
+#define __cvt_color_internal_h__
+
+namespace cv { namespace gpu { namespace cudev
+{   // Declarations only: every macro use below expands to prototypes of the form given in _ONE.
+#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
+    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    // _ALL(name): declares the name_8u, name_16u and name_32f variants.
+#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+    // _8U32F(name): declares only the name_8u and name_32f variants.
+#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name)    \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)   \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+    // _8U32F_FULL(name): as _8U32F, plus the name_full_8u and name_full_32f variants.
+#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name)    \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)        \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)       \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u)   \
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
+    // ---- bgr(a) -> rgb(a) / bgr(a) channel layout conversions
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
+    // ---- bgr(a) / rgb(a) -> bgr555 / bgr565 (a single function each, no depth suffix)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
+    // ---- bgr555 / bgr565 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
+    // ---- gray -> bgr / bgra
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
+    // ---- gray -> bgr555 / bgr565
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
+    // ---- bgr555 / bgr565 -> gray
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
+    // ---- rgb(a) / bgr(a) -> gray
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
+    // ---- rgb(a) / bgr(a) -> yuv / yuv4 (the "4" suffix presumably marks a 4-channel image; TODO confirm)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
+    // ---- yuv / yuv4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
+    // ---- rgb(a) / bgr(a) -> YCrCb / YCrCb4
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
+    // ---- YCrCb / YCrCb4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
+    // ---- rgb(a) / bgr(a) -> xyz / xyz4
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
+    // ---- xyz / xyz4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
+    // ---- rgb(a) / bgr(a) -> hsv / hsv4: 8u and 32f only, each also as a _full variant (presumably full-range hue; TODO confirm)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
+    // ---- hsv / hsv4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
+    // ---- rgb(a) / bgr(a) -> hls / hls4
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
+
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
+    // ---- hls / hls4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
+    // ---- rgb(a) / bgr(a) -> lab / lab4 (8u and 32f only)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
+    // ---- lrgb(a) / lbgr(a) -> lab / lab4 (the "l" prefix presumably means linear RGB; TODO confirm)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
+    // ---- lab / lab4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
+    // ---- lab / lab4 -> lrgb(a) / lbgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
+    // ---- rgb(a) / bgr(a) -> luv / luv4
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
+    // ---- lrgb(a) / lbgr(a) -> luv / luv4
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
+    // ---- luv / luv4 -> rgb(a) / bgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
+    // ---- luv / luv4 -> lrgb(a) / lbgr(a)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
+    // The declaration macros are undefined again so they do not leak out of this header.
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
+}}}
+
+#endif
diff --git a/modules/gpuimgproc/src/denoising.cpp b/modules/gpuimgproc/src/denoising.cpp
new file mode 100644
index 0000000000..1687f8e3cc
--- /dev/null
+++ b/modules/gpuimgproc/src/denoising.cpp
@@ -0,0 +1,198 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Build without CUDA (or with CUDA_DISABLER): each public entry point of this file only calls throw_no_cuda().
+void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
+
+void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
+
+
+#else
+
+//////////////////////////////////////////////////////////////////////////////////
+//// Bilateral filter and Non Local Means Denoising (brute force)
+// Typed CUDA entry points called by the two wrappers below; only declared here, not defined in this file.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        template<typename T>
+        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
+        // Takes radii: the nonLocalMeans wrapper passes search_window/2 and block_window/2.
+        template<typename T>
+        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
+{
+    // Applies the CUDA bilateral filter that matches the depth and channel count of src.
+    using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
+    typedef void (*func_t)(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
+    // Dispatch table: row = depth, column = channels - 1; 0 marks an unsupported combination.
+    static const func_t kernels[6][4] =
+    {
+        {bilateral_filter_gpu<uchar>      , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>      , bilateral_filter_gpu<uchar4>      },
+        {0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
+        {bilateral_filter_gpu<ushort>     , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3>     , bilateral_filter_gpu<ushort4>     },
+        {bilateral_filter_gpu<short>      , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3>      , bilateral_filter_gpu<short4>      },
+        {0 /*bilateral_filter_gpu<int>*/  , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/  , 0 /*bilateral_filter_gpu<int4>*/  },
+        {bilateral_filter_gpu<float>      , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>      , bilateral_filter_gpu<float4>      }
+    };
+
+    // Non-positive sigmas fall back to 1.
+    if (sigma_color <= 0) sigma_color = 1;
+    if (sigma_spatial <= 0) sigma_spatial = 1;
+
+    // A non-positive kernel_size is derived from sigma_spatial; the final size is odd and at least 3.
+    const int half_width = (kernel_size > 0) ? kernel_size / 2 : cvRound(sigma_spatial * 1.5);
+    kernel_size = 2 * std::max(half_width, 1) + 1;
+
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    const func_t func = kernels[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+    dst.create(src.size(), src.type());
+    // The kernel expects sigma_spatial before sigma_color, the reverse of this function's parameter order.
+    func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s));
+}
+
+void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
+{
+    // Brute-force non-local means for CV_8U images with 1, 2 or 3 channels; the table is indexed by channels - 1.
+    using cv::gpu::cudev::imgproc::nlm_bruteforce_gpu;
+    typedef void (*func_t)(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
+    static const func_t kernels[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0 };
+
+    CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
+    const func_t func = kernels[src.channels() - 1];
+    CV_Assert(func != 0);
+    int b = borderMode;
+    CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+    // The kernel takes radii, i.e. half of each window size (integer division).
+    const int search_radius = search_window / 2;
+    const int block_radius = block_window / 2;
+    dst.create(src.size(), src.type());
+    func(src, dst, search_radius, block_radius, h, gpuBorderType, StreamAccessor::getStream(s));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////
+//// Non Local Means Denoising (fast approximate)
+
+// CUDA helpers for FastNonLocalMeansDenoising; only declared here, not defined in this file.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
+        // NOTE(review): the function above is spelled "nln_", unlike the "nlm_" names; it has to match its definition, so it is left as is.
+        template<typename T>
+        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
+                          int search_window, int block_window, float h, cudaStream_t stream);
+        // Used by labMethod to split a uchar3 image into a uchar plane plus a uchar2 plane, and to merge them back.
+        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
+        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
+     }
+}}}
+
+void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
+{
+    // Fast non-local means for CV_8U images with fewer than four channels.
+    CV_Assert(src.depth() == CV_8U && src.channels() < 4);
+
+    // src is padded on every side by half the search window plus half the block window.
+    const int pad = search_window/2 + block_window/2;
+    const Size padded_size(src.cols + 2 * pad, src.rows + 2 * pad);
+
+    // extended_src wraps the memory of extended_src_buffer (pointer and step) rather than owning its own.
+    cv::gpu::ensureSizeIsEnough(padded_size, CV_8UC3, extended_src_buffer);
+    GpuMat extended_src(padded_size, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
+    cv::gpu::copyMakeBorder(src, extended_src, pad, pad, pad, pad, cv::BORDER_DEFAULT, Scalar(), s);
+    GpuMat src_hdr = extended_src(Rect(pad, pad, src.cols, src.rows));
+
+    int bcols, brows;
+    cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
+    buffer.create(brows, bcols, CV_32S);
+
+    typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
+    static const nlm_fast_t kernels[] = { cudev::imgproc::nlm_fast_gpu<uchar>, cudev::imgproc::nlm_fast_gpu<uchar2>, cudev::imgproc::nlm_fast_gpu<uchar3>, 0 };
+    dst.create(src.size(), src.type());
+    kernels[src.channels() - 1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
+}
+
+void cv::gpu::FastNonLocalMeansDenoising::labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
+{
+    // Denoises a CV_8UC3 BGR image in Lab space: h_luminance is used for the L plane, h_color for the a/b plane.
+    CV_Assert(src.type() == CV_8UC3);
+    const cudaStream_t stream = StreamAccessor::getStream(s);
+    // BGR -> Lab, then split into a one-channel plane (l) and a two-channel plane (ab).
+    lab.create(src.size(), src.type());
+    cv::gpu::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
+    l.create(src.size(), CV_8U);
+    ab.create(src.size(), CV_8UC2);
+    cudev::imgproc::fnlm_split_channels(lab, l, ab, stream);
+
+    simpleMethod(l, l, h_luminance, search_window, block_window, s);
+    simpleMethod(ab, ab, h_color, search_window, block_window, s);
+    cudev::imgproc::fnlm_merge_channels(l, ab, lab, stream);
+    cv::gpu::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
+}
+
+
+#endif
+
+
diff --git a/modules/gpuimgproc/src/gftt.cpp b/modules/gpuimgproc/src/gftt.cpp
new file mode 100644
index 0000000000..18a729bc17
--- /dev/null
+++ b/modules/gpuimgproc/src/gftt.cpp
@@ -0,0 +1,169 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Build without CUDA (or with CUDA_DISABLER): the detector only calls throw_no_cuda().
+void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
+
+#else /* !defined (HAVE_CUDA) */
+// CUDA helpers used by the detector below; only declared here, not defined in this file.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace gfft
+    {
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count); // the caller treats the result as the number of corners stored
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count); // presumably orders by the response in eig; TODO confirm
+    }
+}}}
+
+void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask)
+{
+    // Detects corners in `image` and stores them in `corners` as a single-row CV_32FC2 matrix, in the
+    // order produced by sortCorners_gpu. A non-empty `mask` must be CV_8UC1 and of the image size.
+    // `corners` is released when findCorners_gpu reports no candidates.
+    using namespace cv::gpu::cudev::gfft;
+
+    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+
+    // Corner response map: Harris or minimal eigenvalue, selected by useHarrisDetector.
+    ensureSizeIsEnough(image.size(), CV_32F, eig_);
+    if (useHarrisDetector)
+        cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK);
+    else
+        cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3);
+
+    double maxVal = 0;
+    minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_);
+
+    // The candidate buffer has room for max(1000, 5% of the pixel count) corners.
+    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+
+    // maxVal * qualityLevel is passed to findCorners_gpu as its threshold.
+    int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
+
+    if (total == 0)
+    {
+        corners.release();
+        return;
+    }
+
+    sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
+
+    if (minDistance < 1)
+    {
+        // No spacing constraint: keep the first maxCorners sorted candidates (all of them when maxCorners == 0).
+        tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners);
+        return;
+    }
+
+    // The spacing constraint is applied on the host, so the sorted candidates are downloaded.
+    std::vector<Point2f> tmp(total);
+    Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
+    tmpCorners_.colRange(0, total).download(tmpMat);
+
+    std::vector<Point2f> tmp2;
+    tmp2.reserve(total);
+
+    // Accepted corners are bucketed into square cells. The cell side must be at least minDistance so
+    // that every accepted corner closer than minDistance lies in the 3x3 cell neighbourhood scanned
+    // below; rounding to nearest (e.g. 2.4 -> 2) would leave such corners unchecked, hence cvCeil.
+    const int cell_size = cvCeil(minDistance);
+    const int grid_width = (image.cols + cell_size - 1) / cell_size;
+    const int grid_height = (image.rows + cell_size - 1) / cell_size;
+
+    // One bucket of accepted corners per cell, stored row by row.
+    std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+
+    for (int i = 0; i < total; ++i)
+    {
+        const Point2f p = tmp[i];
+
+        const int x_cell = static_cast<int>(p.x / cell_size);
+        const int y_cell = static_cast<int>(p.y / cell_size);
+
+        // 3x3 cell neighbourhood around p, clamped to the grid.
+        const int x1 = std::max(0, x_cell - 1);
+        const int y1 = std::max(0, y_cell - 1);
+        const int x2 = std::min(grid_width - 1, x_cell + 1);
+        const int y2 = std::min(grid_height - 1, y_cell + 1);
+
+        // The `good` tests in the loop conditions end the scan at the first accepted corner that is too close.
+        bool good = true;
+        for (int yy = y1; good && yy <= y2; yy++)
+        {
+            for (int xx = x1; good && xx <= x2; xx++)
+            {
+                const std::vector<Point2f>& m = grid[yy * grid_width + xx];
+
+                for (size_t j = 0; good && j < m.size(); j++)
+                {
+                    const float dx = p.x - m[j].x;
+                    const float dy = p.y - m[j].y;
+
+                    good = !(dx * dx + dy * dy < minDistance * minDistance);
+                }
+            }
+        }
+
+        if (!good)
+            continue;
+
+        // Accept p: register it in its cell and append it to the output list.
+        grid[y_cell * grid_width + x_cell].push_back(p);
+        tmp2.push_back(p);
+
+        // Stop as soon as maxCorners corners are accepted (maxCorners == 0 means no limit).
+        if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
+            break;
+    }
+
+    // tmp2 is never empty here: the first candidate is checked against an empty grid and is always accepted.
+    corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/src/graphcuts.cpp b/modules/gpuimgproc/src/graphcuts.cpp
new file mode 100644
index 0000000000..40ccd04710
--- /dev/null
+++ b/modules/gpuimgproc/src/graphcuts.cpp
@@ -0,0 +1,282 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// No-CUDA build (HAVE_CUDA undefined or CUDA_DISABLER defined): each entry point below only calls throw_no_cuda().
+void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+
+void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
+void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+
+#else /* !defined (HAVE_CUDA) */
+// Forward declarations of the cv::gpu::cudev::ccl routines called below; they are only declared in this file, not defined.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace ccl
+    {
+        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);
+
+        template<typename T>
+        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+    }
+}}}
+
+static float4 scalarToCudaType(const cv::Scalar& in) // casts the four Scalar components to float and packs them into a float4
+{
+    return make_float4(static_cast<float>(in[0]), static_cast<float>(in[1]), static_cast<float>(in[2]), static_cast<float>(in[3]));
+}
+// Runs the cudev::ccl::computeEdges<T> instantiation chosen from the depth and channel count of `image`, with `mask` (CV_8UC1, same size as `image`) passed as its `edges` output.
+void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
+{
+    CV_Assert(!image.empty());
+
+    int ch = image.channels();
+    CV_Assert(ch <= 4);
+
+    int depth = image.depth();
+
+    typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
+    // Dispatch table indexed [depth][channels - 1]; a null entry is an unsupported combination and is rejected by CV_Assert(f) below.
+    static const func_t suppotLookup[8][4] =
+    {   //    1,    2,     3,     4
+        { cudev::ccl::computeEdges<uchar>,  0,  cudev::ccl::computeEdges<uchar3>,  cudev::ccl::computeEdges<uchar4>  },// CV_8U
+        { 0,                                 0,  0,                                  0                                  },// CV_8S
+        { cudev::ccl::computeEdges<ushort>, 0,  cudev::ccl::computeEdges<ushort3>, cudev::ccl::computeEdges<ushort4> },// CV_16U
+        { 0,                                 0,  0,                                  0                                  },// CV_16S
+        { cudev::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
+        { cudev::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
+        { 0,                                 0,  0,                                  0                                  },// CV_64F
+        { 0,                                 0,  0,                                  0                                  } // CV_USRTYPE1
+    };
+
+    func_t f = suppotLookup[depth][ch - 1];
+    CV_Assert(f);
+
+    if (image.size() != mask.size() || mask.type() != CV_8UC1)
+        mask.create(image.size(), CV_8UC1);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
+    f(image, mask, culo, cuhi, stream);
+}
+// Runs cudev::ccl::labelComponents on a non-empty CV_8U mask; `components` is (re)created as CV_32SC1 with the mask's size.
+void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
+{
+    CV_Assert(!mask.empty() && mask.type() == CV_8U);
+
+    // Refuse to run on devices that do not report SHARED_ATOMICS.
+    const bool hasSharedAtomics = deviceSupports(SHARED_ATOMICS);
+    if (!hasSharedAtomics)
+        CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
+
+    components.create(mask.size(), CV_32SC1);
+    cudev::ccl::labelComponents(mask, components, flags, StreamAccessor::getStream(s));
+}
+
+namespace
+{
+    // Common signature of the NPP state allocators handed to the handler (nppiGraphcutInitAlloc, nppiGraphcut8InitAlloc).
+    typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
+
+    // Scoped owner of an NppiGraphcutState: created through `func`, released with nppiGraphcutFree on destruction.
+    class NppiGraphcutStateHandler
+    {
+    public:
+        NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
+        {
+            nppSafeCall( func(sznpp, &pState, pDeviceMem) );
+        }
+
+        // The free status is deliberately ignored: routing it through nppSafeCall would let an error escape from a destructor.
+        ~NppiGraphcutStateHandler() { nppiGraphcutFree(pState); }
+
+        operator NppiGraphcutState*() { return pState; }
+
+    private:
+        // Declared but not defined: a copy would release the same state twice.
+        NppiGraphcutStateHandler(const NppiGraphcutStateHandler&);
+        NppiGraphcutStateHandler& operator=(const NppiGraphcutStateHandler&);
+        NppiGraphcutState* pState;
+    };
+}
+
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
+{
+    // 4-neighbour graph cut through NPP (nppiGraphcut_32s8u / nppiGraphcut_32f8u); the CV_32F path is compiled only for CUDA >= 5.0.
+#if (CUDA_VERSION < 5000)
+    CV_Assert(terminals.type() == CV_32S);
+#else
+    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
+#endif
+
+    Size src_size = terminals.size();
+
+    // leftTransp / rightTransp must have the transposed size (width and height swapped) and the type of terminals.
+    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
+    CV_Assert(leftTransp.type() == terminals.type());
+    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
+    CV_Assert(rightTransp.type() == terminals.type());
+
+    CV_Assert(top.size() == src_size);
+    CV_Assert(top.type() == terminals.type());
+    CV_Assert(bottom.size() == src_size);
+    CV_Assert(bottom.type() == terminals.type());
+
+    labels.create(src_size, CV_8U);
+
+    NppiSize roi;
+    roi.width = src_size.width;
+    roi.height = src_size.height;
+
+    // Query the scratch size NPP asks for and make sure buf can hold it.
+    int scratchBytes;
+    nppSafeCall( nppiGraphcutGetSize(roi, &scratchBytes) );
+    ensureSizeIsEnough(1, scratchBytes, CV_8U, buf);
+
+    cudaStream_t cuStream = StreamAccessor::getStream(s);
+    NppStreamHandler nppStream(cuStream);
+    NppiGraphcutStateHandler gcState(roi, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
+
+    const int srcStep = static_cast<int>(terminals.step);
+    const int transpStep = static_cast<int>(leftTransp.step);
+    const int labelStep = static_cast<int>(labels.step);
+
+#if (CUDA_VERSION >= 5000)
+    if (terminals.type() == CV_32F)
+    {
+        nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
+            srcStep, transpStep, roi, labels.ptr<Npp8u>(), labelStep, gcState) );
+    }
+    else
+#endif
+    {
+        nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
+            srcStep, transpStep, roi, labels.ptr<Npp8u>(), labelStep, gcState) );
+    }
+
+    // A device-wide synchronise is issued only when the call ran on the default (null) stream.
+    if (cuStream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+}
+
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
+              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
+{
+    // 8-neighbour graph cut through NPP (nppiGraphcut8_32s8u / nppiGraphcut8_32f8u); the CV_32F path is compiled only for CUDA >= 5.0.
+#if (CUDA_VERSION < 5000)
+    CV_Assert(terminals.type() == CV_32S);
+#else
+    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
+#endif
+
+    Size src_size = terminals.size();
+
+    // leftTransp / rightTransp must have the transposed size (width and height swapped) and the type of terminals.
+    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
+    CV_Assert(leftTransp.type() == terminals.type());
+    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
+    CV_Assert(rightTransp.type() == terminals.type());
+
+    // Every other input must match terminals in both size and type.
+    CV_Assert(top.size() == src_size);
+    CV_Assert(top.type() == terminals.type());
+
+    CV_Assert(topLeft.size() == src_size);
+    CV_Assert(topLeft.type() == terminals.type());
+
+    CV_Assert(topRight.size() == src_size);
+    CV_Assert(topRight.type() == terminals.type());
+
+    CV_Assert(bottom.size() == src_size);
+    CV_Assert(bottom.type() == terminals.type());
+
+    CV_Assert(bottomLeft.size() == src_size);
+    CV_Assert(bottomLeft.type() == terminals.type());
+
+    CV_Assert(bottomRight.size() == src_size);
+    CV_Assert(bottomRight.type() == terminals.type());
+
+    labels.create(src_size, CV_8U);
+
+    NppiSize roi;
+    roi.width = src_size.width;
+    roi.height = src_size.height;
+
+    // Query the scratch size NPP asks for and make sure buf can hold it.
+    int scratchBytes;
+    nppSafeCall( nppiGraphcut8GetSize(roi, &scratchBytes) );
+    ensureSizeIsEnough(1, scratchBytes, CV_8U, buf);
+
+    cudaStream_t cuStream = StreamAccessor::getStream(s);
+    NppStreamHandler nppStream(cuStream);
+    NppiGraphcutStateHandler gcState(roi, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
+
+    const int srcStep = static_cast<int>(terminals.step);
+    const int transpStep = static_cast<int>(leftTransp.step);
+    const int labelStep = static_cast<int>(labels.step);
+
+#if (CUDA_VERSION >= 5000)
+    if (terminals.type() == CV_32F)
+    {
+        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
+            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
+            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
+            srcStep, transpStep, roi, labels.ptr<Npp8u>(), labelStep, gcState) );
+    }
+    else
+#endif
+    {
+        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
+            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
+            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
+            srcStep, transpStep, roi, labels.ptr<Npp8u>(), labelStep, gcState) );
+    }
+
+    // A device-wide synchronise is issued only when the call ran on the default (null) stream.
+    if (cuStream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/src/hough.cpp b/modules/gpuimgproc/src/hough.cpp
new file mode 100644
index 0000000000..bc0a8a400d
--- /dev/null
+++ b/modules/gpuimgproc/src/hough.cpp
@@ -0,0 +1,1432 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// No-CUDA build (HAVE_CUDA undefined or CUDA_DISABLER defined): every function below calls throw_no_cuda(), except the destructor and release(), which are empty.
+void cv::gpu::HoughLines(const GpuMat&, GpuMat&, float, float, int, bool, int) { throw_no_cuda(); }
+void cv::gpu::HoughLines(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, bool, int) { throw_no_cuda(); }
+void cv::gpu::HoughLinesDownload(const GpuMat&, OutputArray, OutputArray) { throw_no_cuda(); }
+
+void cv::gpu::HoughLinesP(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, int, int) { throw_no_cuda(); }
+
+void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, int, float, float, int, int, int, int, int) { throw_no_cuda(); }
+void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, HoughCirclesBuf&, int, float, float, int, int, int, int, int) { throw_no_cuda(); }
+void cv::gpu::HoughCirclesDownload(const GpuMat&, OutputArray) { throw_no_cuda(); }
+
+Ptr<GeneralizedHough_GPU> cv::gpu::GeneralizedHough_GPU::create(int) { throw_no_cuda(); return Ptr<GeneralizedHough_GPU>(); }
+cv::gpu::GeneralizedHough_GPU::~GeneralizedHough_GPU() {}
+void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat&, int, Point) { throw_no_cuda(); }
+void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat&, const GpuMat&, const GpuMat&, Point) { throw_no_cuda(); }
+void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat&, GpuMat&, int) { throw_no_cuda(); }
+void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::GeneralizedHough_GPU::download(const GpuMat&, OutputArray, OutputArray) { throw_no_cuda(); }
+void cv::gpu::GeneralizedHough_GPU::release() {}
+
+#else /* !defined (HAVE_CUDA) */
+
+#include "opencv2/core/utility.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        int buildPointList_gpu(PtrStepSzb src, unsigned int* list); // shared by HoughLines, HoughLinesP and HoughCircles; the return value is used as the point count
+    }
+}}}
+
+//////////////////////////////////////////////////////////
+// HoughLines
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20); // called by both HoughLines and HoughLinesP
+        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort); // HoughLines stores the return value in lines.cols
+    }
+}}}
+
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    HoughLinesBuf scratch; // temporary buffers, discarded when this overload returns
+    HoughLines(src, lines, scratch, rho, theta, threshold, doSort, maxLines);
+}
+
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    using namespace cv::gpu::cudev::hough;
+
+    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert(src.cols < std::numeric_limits<unsigned short>::max());
+    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
+
+    // Stage 1: build the point list from src; buf.list is sized for one 32-bit entry per pixel.
+    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.list);
+    unsigned int* const pointList = buf.list.ptr<unsigned int>();
+    const int pointTotal = buildPointList_gpu(src, pointList);
+    if (pointTotal == 0)
+    {
+        lines.release();
+        return;
+    }
+
+    // Stage 2: accumulate into a zeroed (numangle + 2) x (numrho + 2) CV_32SC1 buffer.
+    const int numangle = cvRound(CV_PI / theta);
+    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
+    CV_Assert(numangle > 0 && numrho > 0);
+    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, buf.accum);
+    buf.accum.setTo(Scalar::all(0));
+    DeviceInfo device;
+    linesAccum_gpu(pointList, pointTotal, buf.accum, rho, theta, device.sharedMemPerBlock(), device.supports(FEATURE_SET_COMPUTE_20));
+
+    // Stage 3: collect results; row 0 of lines is handed out as float2 data, row 1 as int votes.
+    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
+    const int found = linesGetResult_gpu(buf.accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+
+    if (found <= 0)
+        lines.release();
+    else
+        lines.cols = found;
+}
+
+void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
+{
+    // Downloads a 2-row CV_32FC2 result: row 0 into h_lines_ (CV_32FC2) and, when requested, row 1 into h_votes_ (CV_32SC1).
+    const bool wantVotes = h_votes_.needed();
+    if (d_lines.empty())
+    {
+        h_lines_.release();
+        if (wantVotes)
+            h_votes_.release();
+        return;
+    }
+
+    CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
+    h_lines_.create(1, d_lines.cols, CV_32FC2);
+    Mat hostLines = h_lines_.getMat();
+    d_lines.row(0).download(hostLines);
+    if (!wantVotes)
+        return;
+
+    h_votes_.create(1, d_lines.cols, CV_32SC1);
+    Mat hostVotes = h_votes_.getMat();
+    GpuMat votesView(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1))); // CV_32SC1 header over row 1's pointer
+    votesView.download(hostVotes);
+}
+
+//////////////////////////////////////////////////////////
+// HoughLinesP
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength); // HoughLinesP stores the return value in lines.cols
+    }
+}}}
+
+void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines)
+{
+    using namespace cv::gpu::cudev::hough;
+
+    CV_Assert( src.type() == CV_8UC1 );
+    CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() );
+    CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
+
+    // Stage 1: build the point list from src; buf.list is sized for one 32-bit entry per pixel.
+    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.list);
+    unsigned int* const pointList = buf.list.ptr<unsigned int>();
+    const int pointTotal = buildPointList_gpu(src, pointList);
+    if (pointTotal == 0)
+    {
+        lines.release();
+        return;
+    }
+
+    // Stage 2: accumulate into a zeroed (numangle + 2) x (numrho + 2) CV_32SC1 buffer.
+    const int numangle = cvRound(CV_PI / theta);
+    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
+    CV_Assert( numangle > 0 && numrho > 0 );
+
+    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, buf.accum);
+    buf.accum.setTo(Scalar::all(0));
+    DeviceInfo device;
+    linesAccum_gpu(pointList, pointTotal, buf.accum, rho, theta, device.sharedMemPerBlock(), device.supports(FEATURE_SET_COMPUTE_20));
+
+    // Stage 3: collect up to maxLines int4 entries into a single-row CV_32SC4 matrix.
+    ensureSizeIsEnough(1, maxLines, CV_32SC4, lines);
+    const int found = houghLinesProbabilistic_gpu(src, buf.accum, lines.ptr<int4>(), maxLines, rho, theta, maxLineGap, minLineLength);
+
+    if (found <= 0)
+        lines.release();
+    else
+        lines.cols = found;
+}
+
+//////////////////////////////////////////////////////////
+// HoughCircles
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp); // HoughCircles passes buf.accum as `accum`
+        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold); // HoughCircles uses the return value as the number of centers
+        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
+                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20); // HoughCircles stores the return value in circles.cols
+    }
+}}}
+
+void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
+{
+    HoughCirclesBuf scratch; // temporary buffers, discarded when this overload returns
+    HoughCircles(src, circles, scratch, method, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius, maxCircles);
+}
+// Pipeline: Canny -> buildPointList_gpu -> circlesAccumCenters_gpu -> buildCentersList_gpu -> optional host-side minDist filter -> circlesAccumRadius_gpu. `circles` ends up 1 x N CV_32FC3, or is released when nothing is found.
+void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& buf, int method,
+                           float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
+{
+    using namespace cv::gpu::cudev::hough;
+
+    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert(src.cols < std::numeric_limits<unsigned short>::max());
+    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
+    CV_Assert(method == cv::HOUGH_GRADIENT);
+    CV_Assert(dp > 0);
+    CV_Assert(minRadius > 0 && maxRadius > minRadius);
+    CV_Assert(cannyThreshold > 0);
+    CV_Assert(votesThreshold > 0);
+    CV_Assert(maxCircles > 0);
+
+    const float idp = 1.0f / dp;
+    // Edges go to buf.edges; the low Canny threshold is half the high one, but never below 1.
+    cv::gpu::Canny(src, buf.cannyBuf, buf.edges, std::max(cannyThreshold / 2, 1), cannyThreshold);
+    // buf.list gets two rows with one 32-bit slot per pixel: row 0 = edge point list, row 1 = candidate centers.
+    ensureSizeIsEnough(2, src.size().area(), CV_32SC1, buf.list);
+    unsigned int* srcPoints = buf.list.ptr<unsigned int>(0);
+    unsigned int* centers = buf.list.ptr<unsigned int>(1);
+
+    const int pointsCount = buildPointList_gpu(buf.edges, srcPoints);
+    if (pointsCount == 0)
+    {
+        circles.release();
+        return;
+    }
+    // Center accumulator scaled by idp = 1 / dp, with 2 extra rows and columns, zeroed before accumulation.
+    ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, buf.accum);
+    buf.accum.setTo(Scalar::all(0));
+
+    circlesAccumCenters_gpu(srcPoints, pointsCount, buf.cannyBuf.dx, buf.cannyBuf.dy, buf.accum, minRadius, maxRadius, idp);
+
+    int centersCount = buildCentersList_gpu(buf.accum, centers, votesThreshold);
+    if (centersCount == 0)
+    {
+        circles.release();
+        return;
+    }
+    // Optional thinning on the host: in list order, keep a center only if no already kept center is closer than minDist.
+    if (minDist > 1)
+    {
+        cv::AutoBuffer<ushort2> oldBuf_(centersCount);
+        cv::AutoBuffer<ushort2> newBuf_(centersCount);
+        int newCount = 0;
+
+        ushort2* oldBuf = oldBuf_;
+        ushort2* newBuf = newBuf_;
+        // Each 32-bit list entry is read back as a ushort2 (x, y) pair.
+        cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
+        // Uniform grid with cells of roughly minDist, so only the 3x3 block of cells around a point is searched.
+        const int cellSize = cvRound(minDist);
+        const int gridWidth = (src.cols + cellSize - 1) / cellSize;
+        const int gridHeight = (src.rows + cellSize - 1) / cellSize;
+        // NOTE(review): the grid is sized from src, while the centers come from buf.accum (scaled by idp) - confirm they cannot index past the grid, e.g. for dp < 1.
+        std::vector< std::vector<ushort2> > grid(gridWidth * gridHeight);
+
+        const float minDist2 = minDist * minDist;
+
+        for (int i = 0; i < centersCount; ++i)
+        {
+            ushort2 p = oldBuf[i];
+
+            bool good = true;
+
+            int xCell = static_cast<int>(p.x / cellSize);
+            int yCell = static_cast<int>(p.y / cellSize);
+
+            int x1 = xCell - 1;
+            int y1 = yCell - 1;
+            int x2 = xCell + 1;
+            int y2 = yCell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(gridWidth - 1, x2);
+            y2 = std::min(gridHeight - 1, y2);
+
+            for (int yy = y1; yy <= y2; ++yy)
+            {
+                for (int xx = x1; xx <= x2; ++xx)
+                {
+                    std::vector<ushort2>& m = grid[yy * gridWidth + xx];
+
+                    for(size_t j = 0; j < m.size(); ++j)
+                    {
+                        float dx = (float)(p.x - m[j].x);
+                        float dy = (float)(p.y - m[j].y);
+
+                        if (dx * dx + dy * dy < minDist2)
+                        {
+                            good = false;
+                            goto break_out;
+                        }
+                    }
+                }
+            }
+
+            break_out:
+
+            if(good)
+            {
+                grid[yCell * gridWidth + xCell].push_back(p);
+
+                newBuf[newCount++] = p;
+            }
+        }
+        // Upload the survivors; the element size now matches the ushort2 buffer being copied (same as the download above).
+        cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(ushort2), cudaMemcpyHostToDevice) );
+        centersCount = newCount;
+    }
+
+    ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
+
+    const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
+                                                    dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));
+
+    if (circlesCount > 0)
+        circles.cols = circlesCount;
+    else
+        circles.release();
+}
+
+void cv::gpu::HoughCirclesDownload(const GpuMat& d_circles, cv::OutputArray h_circles_)
+{
+    // A non-empty input must be a single-row CV_32FC3 matrix and is downloaded into h_circles_;
+    // an empty input just releases h_circles_.
+    if (!d_circles.empty())
+    {
+        CV_Assert(d_circles.rows == 1 && d_circles.type() == CV_32FC3);
+        h_circles_.create(1, d_circles.cols, CV_32FC3);
+        Mat hostCircles = h_circles_.getMat();
+        d_circles.download(hostCircles);
+    }
+    else
+        h_circles_.release();
+}
+
+//////////////////////////////////////////////////////////
+// GeneralizedHough
+// Forward declarations of the cv::gpu::cudev::hough routines for the generalized Hough code below; they are only declared here, not defined.
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace hough
+    {
+        template <typename T>
+        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
+        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                             PtrStepSz<short2> r_table, int* r_sizes,
+                             short2 templCenter, int levels);
+        // GHT_Ballard_Pos_* pair: histogram construction and position search.
+        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                          PtrStepSz<short2> r_table, const int* r_sizes,
+                                          PtrStepSzi hist,
+                                          float dp, int levels);
+        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold);
+        // GHT_Ballard_PosScale_* pair: same shape, with additional scale parameters (minScale, scaleStep, scaleRange).
+        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                               PtrStepSz<short2> r_table, const int* r_sizes,
+                                               PtrStepi hist, int rows, int cols,
+                                               float minScale, float scaleStep, int scaleRange,
+                                               float dp, int levels);
+        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
+                                                   float minScale, float scaleStep, float dp, int threshold);
+        // GHT_Ballard_PosRotation_* pair: same shape, with additional angle parameters (minAngle, angleStep, angleRange).
+        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                  PtrStepSz<short2> r_table, const int* r_sizes,
+                                                  PtrStepi hist, int rows, int cols,
+                                                  float minAngle, float angleStep, int angleRange,
+                                                  float dp, int levels);
+        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
+                                                      float minAngle, float angleStep, float dp, int threshold);
+        // GHT_Guil_Full_* family: feature setup, feature-list builders, O/S/P histogram steps and the final position search.
+        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
+        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
+        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                     int* sizes, int maxSize,
+                                                     float xi, float angleEpsilon, int levels,
+                                                     float2 center, float maxDist);
+        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                                     int* sizes, int maxSize,
+                                                     float xi, float angleEpsilon, int levels,
+                                                     float2 center, float maxDist);
+        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
+                                         float minAngle, float maxAngle, float angleStep, int angleRange,
+                                         int levels, int tMaxSize);
+        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
+                                         float angle, float angleEpsilon,
+                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
+                                         int levels, int tMaxSize);
+        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
+                                         float angle, float angleEpsilon, float scale,
+                                         float dp,
+                                         int levels, int tMaxSize);
+        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
+                                             float angle, int angleVotes, float scale, int scaleVotes,
+                                             float dp, int threshold);
+    }
+}}}
+
+namespace
+{
+    /////////////////////////////////////
+    // Common
+
+    // Empties v by swapping it with a default-constructed temporary; the temporary takes v's old storage and is destroyed right away.
+    template <typename T, class A> void releaseVector(std::vector<T, A>& v)
+    {
+        std::vector<T, A>().swap(v);
+    }
+    // Base class derived from GeneralizedHough_GPU: keeps copies of the template and image inputs and leaves processTempl() / processImage() to subclasses.
+    class GHT_Pos : public GeneralizedHough_GPU
+    {
+    public:
+        GHT_Pos();
+
+    protected:
+        void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter);
+        void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions);
+        void releaseImpl();
+
+        virtual void processTempl() = 0; // called at the end of setTemplateImpl()
+        virtual void processImage() = 0; // called by detectImpl() after posCount has been reset to 0
+
+        void buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy);
+        void filterMinDist(); // detectImpl() calls this only when minDist > 1
+        void convertTo(GpuMat& positions);
+
+        int maxSize;    // constructor default: 10000
+        double minDist; // constructor default: 1.0
+
+        Size templSize;    // size of the edge map given to setTemplateImpl()
+        Point templCenter; // center given to setTemplateImpl()
+        GpuMat templEdges; // copies of the setTemplateImpl() inputs
+        GpuMat templDx;
+        GpuMat templDy;
+
+        Size imageSize;    // size of the edge map given to detectImpl()
+        GpuMat imageEdges; // copies of the detectImpl() inputs
+        GpuMat imageDx;
+        GpuMat imageDy;
+
+        GpuMat edgePointList;
+
+        GpuMat outBuf;
+        int posCount; // detectImpl() releases `positions` when this is still 0 after processImage()
+
+        std::vector<float4> oldPosBuf;
+        std::vector<int3> oldVoteBuf;
+        std::vector<float4> newPosBuf;
+        std::vector<int3> newVoteBuf;
+        std::vector<int> indexies; // [sic] spelling kept as declared
+    };
+
+    // Default parameters: room for 10000 entries in the inner buffers and a
+    // minimum distance of 1 between detections.
+    GHT_Pos::GHT_Pos() : maxSize(10000), minDist(1.0)
+    {
+    }
+
+    // Stores copies of the template edge map and gradients together with the template
+    // size and reference point, then lets the subclass build its template data.
+    void GHT_Pos::setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter_)
+    {
+        templCenter = templCenter_;
+        templSize = edges.size();
+
+        // For each map: make sure the destination is large enough, then fill it.
+        ensureSizeIsEnough(templSize, edges.type(), templEdges);
+        edges.copyTo(templEdges);
+
+        ensureSizeIsEnough(templSize, dx.type(), templDx);
+        dx.copyTo(templDx);
+
+        ensureSizeIsEnough(templSize, dy.type(), templDy);
+        dy.copyTo(templDy);
+
+        processTempl();
+    }
+
+    // Runs a detection on the given edge map and gradients.
+    // The inputs are copied, the subclass fills outBuf/posCount in processImage(), and the
+    // result is written to positions (released when nothing was found). Detections are
+    // thinned with filterMinDist() only when minDist is greater than 1.
+    void GHT_Pos::detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions)
+    {
+        imageSize = edges.size();
+
+        // For each map: make sure the destination is large enough, then fill it.
+        ensureSizeIsEnough(imageSize, edges.type(), imageEdges);
+        edges.copyTo(imageEdges);
+
+        ensureSizeIsEnough(imageSize, dx.type(), imageDx);
+        dx.copyTo(imageDx);
+
+        ensureSizeIsEnough(imageSize, dy.type(), imageDy);
+        dy.copyTo(imageDy);
+
+        posCount = 0;
+        processImage();
+
+        if (posCount == 0)
+        {
+            positions.release();
+            return;
+        }
+
+        if (minDist > 1)
+            filterMinDist();
+
+        convertTo(positions);
+    }
+
+    // Drops every buffer held by the base class and resets the cached sizes,
+    // the template centre and the detection count.
+    void GHT_Pos::releaseImpl()
+    {
+        // Template state.
+        templEdges.release();
+        templDx.release();
+        templDy.release();
+        templCenter = Point(-1, -1);
+        templSize = Size();
+
+        // Image state.
+        imageEdges.release();
+        imageDx.release();
+        imageDy.release();
+        imageSize = Size();
+
+        // Intermediate and output buffers on the device.
+        edgePointList.release();
+        outBuf.release();
+        posCount = 0;
+
+        // Host-side scratch buffers of filterMinDist().
+        releaseVector(indexies);
+        releaseVector(newVoteBuf);
+        releaseVector(newPosBuf);
+        releaseVector(oldVoteBuf);
+        releaseVector(oldPosBuf);
+    }
+
+    // Fills edgePointList from an edge map and its gradients.
+    //
+    // edges must be CV_8UC1; dx and dy must have the same type and the size of edges.
+    // Only gradient depths that have a kernel in the table below are accepted
+    // (slots 3, 4 and 5: the short, int and float instantiations); anything else
+    // fails a CV_Assert. Row 0 of edgePointList is handed to the kernel wrapper as the
+    // coordinate list and row 1 as the theta list; on return edgePointList.cols holds
+    // the wrapper's return value, which callers use as the number of points.
+    void GHT_Pos::buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy)
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        typedef int (*func_t)(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
+        static const func_t funcs[] =
+        {
+            0,
+            0,
+            0,
+            buildEdgePointList_gpu<short>,
+            buildEdgePointList_gpu<int>,
+            buildEdgePointList_gpu<float>,
+            0
+        };
+        static const int funcsCount = static_cast<int>(sizeof(funcs) / sizeof(funcs[0]));
+
+        CV_Assert(edges.type() == CV_8UC1);
+        CV_Assert(dx.size() == edges.size());
+        CV_Assert(dy.type() == dx.type() && dy.size() == edges.size());
+
+        // Reject depth codes without a slot in the table instead of reading past its end.
+        const int depth = dx.depth();
+        CV_Assert(depth >= 0 && depth < funcsCount);
+
+        const func_t func = funcs[depth];
+        CV_Assert(func != 0);
+
+        // cols is overwritten with the point count at the end of every call. Restore the
+        // row width from the allocated step first, presumably so that ensureSizeIsEnough()
+        // compares against the real capacity rather than the previous point count.
+        edgePointList.cols = (int) (edgePointList.step / sizeof(int));
+        ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
+
+        edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));
+    }
+
+    // Ordering predicate over indices into an int3 array: an index comes first
+    // when its entry has the larger x component (descending order of x).
+    struct IndexCmp
+    {
+        explicit IndexCmp(const int3* _aux) : aux(_aux) {}
+
+        bool operator ()(int l1, int l2) const
+        {
+            const int lhsKey = aux[l1].x;
+            const int rhsKey = aux[l2].x;
+            return rhsKey < lhsKey;
+        }
+
+        // Array the compared indices refer to; not owned.
+        const int3* aux;
+    };
+
+    // Removes detections that lie closer than minDist to a stronger detection.
+    //
+    // The posCount candidates in outBuf (row 0: float4 positions, row 1: int3 votes) are
+    // copied to the host and visited in order of decreasing votes.x. A candidate is kept
+    // only if no previously kept candidate is nearer than minDist. The survivors are
+    // written back to the beginning of outBuf and posCount is set to their number.
+    //
+    // Expects posCount > 0; detectImpl() only calls this in that case and with minDist > 1.
+    void GHT_Pos::filterMinDist()
+    {
+        oldPosBuf.resize(posCount);
+        oldVoteBuf.resize(posCount);
+
+        cudaSafeCall( cudaMemcpy(&oldPosBuf[0], outBuf.ptr(0), posCount * sizeof(float4), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpy(&oldVoteBuf[0], outBuf.ptr(1), posCount * sizeof(int3), cudaMemcpyDeviceToHost) );
+
+        // Visit the candidates from the most voted to the least voted one.
+        indexies.resize(posCount);
+        for (int i = 0; i < posCount; ++i)
+            indexies[i] = i;
+        std::sort(indexies.begin(), indexies.end(), IndexCmp(&oldVoteBuf[0]));
+
+        newPosBuf.clear();
+        newVoteBuf.clear();
+        newPosBuf.reserve(posCount);
+        newVoteBuf.reserve(posCount);
+
+        // Kept points are binned into square cells. The cell side is rounded UP so that it
+        // is never smaller than minDist: only then is every point closer than minDist
+        // guaranteed to be in the same or a directly neighbouring cell.
+        const int cellSize = cvCeil(minDist);
+        const int gridWidth = (imageSize.width + cellSize - 1) / cellSize;
+        const int gridHeight = (imageSize.height + cellSize - 1) / cellSize;
+
+        std::vector< std::vector<Point2f> > grid(gridWidth * gridHeight);
+
+        const double minDist2 = minDist * minDist;
+
+        for (int i = 0; i < posCount; ++i)
+        {
+            const int ind = indexies[i];
+
+            const Point2f p(oldPosBuf[ind].x, oldPosBuf[ind].y);
+
+            // Nothing here guarantees that a position lies inside the image, so the cell
+            // index is clamped to the grid; this keeps the push_back below in bounds.
+            const int xCell = std::min(std::max(static_cast<int>(p.x / cellSize), 0), gridWidth - 1);
+            const int yCell = std::min(std::max(static_cast<int>(p.y / cellSize), 0), gridHeight - 1);
+
+            // 3x3 neighbourhood of the cell, cut at the grid border.
+            const int x1 = std::max(0, xCell - 1);
+            const int y1 = std::max(0, yCell - 1);
+            const int x2 = std::min(gridWidth - 1, xCell + 1);
+            const int y2 = std::min(gridHeight - 1, yCell + 1);
+
+            // The loops stop as soon as one kept point is found that is too close.
+            bool good = true;
+
+            for (int yy = y1; good && yy <= y2; ++yy)
+            {
+                for (int xx = x1; good && xx <= x2; ++xx)
+                {
+                    const std::vector<Point2f>& m = grid[yy * gridWidth + xx];
+
+                    for (size_t j = 0; j < m.size(); ++j)
+                    {
+                        const Point2f d = p - m[j];
+
+                        if (d.ddot(d) < minDist2)
+                        {
+                            good = false;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (good)
+            {
+                grid[yCell * gridWidth + xCell].push_back(p);
+
+                newPosBuf.push_back(oldPosBuf[ind]);
+                newVoteBuf.push_back(oldVoteBuf[ind]);
+            }
+        }
+
+        // The most voted candidate is always kept, so the new buffers are not empty here.
+        posCount = static_cast<int>(newPosBuf.size());
+        cudaSafeCall( cudaMemcpy(outBuf.ptr(0), &newPosBuf[0], posCount * sizeof(float4), cudaMemcpyHostToDevice) );
+        cudaSafeCall( cudaMemcpy(outBuf.ptr(1), &newVoteBuf[0], posCount * sizeof(int3), cudaMemcpyHostToDevice) );
+    }
+
+    // Copies the first posCount columns of both outBuf rows into positions,
+    // which becomes a 2 x posCount CV_32FC4 matrix.
+    void GHT_Pos::convertTo(GpuMat& positions)
+    {
+        // Header over the existing outBuf memory, restricted to the valid columns.
+        const GpuMat found(2, posCount, CV_32FC4, outBuf.data, outBuf.step);
+
+        ensureSizeIsEnough(2, posCount, CV_32FC4, positions);
+        found.copyTo(positions);
+    }
+
+    /////////////////////////////////////
+    // POSITION Ballard
+
+    // Position-only detector built on an R-table (r_table/r_sizes) and a vote
+    // histogram. calcHist() and findPosInHist() are virtual so that the scale and
+    // rotation variants below can replace them while reusing processTempl().
+    class GHT_Ballard_Pos : public GHT_Pos
+    {
+    public:
+        AlgorithmInfo* info() const;
+
+        GHT_Ballard_Pos();
+
+    protected:
+        void releaseImpl();
+
+        void processTempl();
+        void processImage();
+
+        // The two stages of processImage(): accumulate votes, then extract positions.
+        virtual void calcHist();
+        virtual void findPosInHist();
+
+        // Algorithm parameters: "R-Table levels", the accumulator threshold and the
+        // "Inverse ratio of the accumulator resolution to the image resolution".
+        int levels;
+        int votesThreshold;
+        double dp;
+
+        // R-table with (levels + 1) rows of maxSize CV_16SC2 entries, and the
+        // per-row entry counts (1 x (levels + 1), CV_32SC1).
+        GpuMat r_table;
+        GpuMat r_sizes;
+
+        // Vote accumulator (CV_32SC1).
+        GpuMat hist;
+    };
+
+    // Algorithm description for GHT_Ballard_Pos under the name "GeneralizedHough_GPU.POSITION".
+    // Each addParam call publishes one member under the given parameter name with its help text.
+    CV_INIT_ALGORITHM(GHT_Ballard_Pos, "GeneralizedHough_GPU.POSITION",
+                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
+                                           "Maximal size of inner buffers.");
+                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
+                                           "Minimum distance between the centers of the detected objects.");
+                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
+                                           "R-Table levels.");
+                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
+                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
+                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
+                                           "Inverse ratio of the accumulator resolution to the image resolution."));
+
+    // Default parameters: 360 R-table levels, a vote threshold of 100 and
+    // an accumulator with the resolution of the image (dp = 1).
+    GHT_Ballard_Pos::GHT_Ballard_Pos() : levels(360), votesThreshold(100), dp(1.0)
+    {
+    }
+
+    // Releases the base-class buffers first, then the R-table and the vote accumulator.
+    void GHT_Ballard_Pos::releaseImpl()
+    {
+        GHT_Pos::releaseImpl();
+
+        hist.release();
+
+        r_sizes.release();
+        r_table.release();
+    }
+
+    // Builds the R-table from the stored template.
+    // r_sizes is zeroed first; when the template has edge points the table is filled on
+    // the device and the per-level counts are capped at maxSize.
+    void GHT_Ballard_Pos::processTempl()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(levels > 0);
+
+        const int tableRows = levels + 1;
+
+        buildEdgePointList(templEdges, templDx, templDy);
+
+        ensureSizeIsEnough(tableRows, maxSize, CV_16SC2, r_table);
+        ensureSizeIsEnough(1, tableRows, CV_32SC1, r_sizes);
+        r_sizes.setTo(Scalar::all(0));
+
+        const int pointsCount = edgePointList.cols;
+        if (pointsCount <= 0)
+            return;
+
+        unsigned int* const coordList = edgePointList.ptr<unsigned int>(0);
+        float* const thetaList = edgePointList.ptr<float>(1);
+
+        buildRTable_gpu(coordList, thetaList, pointsCount,
+                        r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
+        min(r_sizes, maxSize, r_sizes);
+    }
+
+    // Detection step: accumulate the votes, then extract the positions from the
+    // accumulator. Both stages are virtual, so subclasses supply their own versions.
+    void GHT_Ballard_Pos::processImage()
+    {
+        calcHist();
+        findPosInHist();
+    }
+
+    // Accumulates position votes for the stored image.
+    // The accumulator has the image size scaled by 1/dp (rounded up) plus 2 in each
+    // dimension. It is cleared first and stays all-zero when the image has no edge points.
+    void GHT_Ballard_Pos::calcHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
+        CV_Assert(dp > 0.0);
+
+        const double idp = 1.0 / dp;
+        const int histRows = cvCeil(imageSize.height * idp) + 2;
+        const int histCols = cvCeil(imageSize.width * idp) + 2;
+
+        buildEdgePointList(imageEdges, imageDx, imageDy);
+
+        ensureSizeIsEnough(histRows, histCols, CV_32SC1, hist);
+        hist.setTo(Scalar::all(0));
+
+        const int pointsCount = edgePointList.cols;
+        if (pointsCount <= 0)
+            return;
+
+        unsigned int* const coordList = edgePointList.ptr<unsigned int>(0);
+        float* const thetaList = edgePointList.ptr<float>(1);
+
+        GHT_Ballard_Pos_calcHist_gpu(coordList, thetaList, pointsCount,
+                                     r_table, r_sizes.ptr<int>(),
+                                     hist,
+                                     (float)dp, levels);
+    }
+
+    // Extracts detections from the accumulator into outBuf (row 0: positions,
+    // row 1: votes) and stores the value returned by the kernel wrapper in posCount.
+    void GHT_Ballard_Pos::findPosInHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(votesThreshold > 0);
+
+        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+
+        float4* const posRow = outBuf.ptr<float4>(0);
+        int3* const voteRow = outBuf.ptr<int3>(1);
+
+        posCount = GHT_Ballard_Pos_findPosInHist_gpu(hist, posRow, voteRow, maxSize, (float)dp, votesThreshold);
+    }
+
+    /////////////////////////////////////
+    // POSITION & SCALE
+
+    // Position + scale detector: reuses the R-table of GHT_Ballard_Pos and replaces
+    // the two histogram stages with versions that also search over a range of scales.
+    class GHT_Ballard_PosScale : public GHT_Ballard_Pos
+    {
+    public:
+        AlgorithmInfo* info() const;
+
+        GHT_Ballard_PosScale();
+
+    protected:
+        void calcHist();
+        void findPosInHist();
+
+        // Scale search range [minScale, maxScale] and its step.
+        double minScale;
+        double maxScale;
+        double scaleStep;
+    };
+
+    // Algorithm description for GHT_Ballard_PosScale under the name "GeneralizedHough_GPU.POSITION_SCALE".
+    // Each addParam call publishes one member under the given parameter name with its help text.
+    CV_INIT_ALGORITHM(GHT_Ballard_PosScale, "GeneralizedHough_GPU.POSITION_SCALE",
+                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
+                                           "Maximal size of inner buffers.");
+                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
+                                           "Minimum distance between the centers of the detected objects.");
+                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
+                                           "R-Table levels.");
+                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
+                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
+                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
+                                           "Inverse ratio of the accumulator resolution to the image resolution.");
+                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
+                                           "Minimal scale to detect.");
+                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
+                                           "Maximal scale to detect.");
+                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
+                                           "Scale step."));
+
+    // Default scale search: from 0.5 to 2.0 in steps of 0.05.
+    GHT_Ballard_PosScale::GHT_Ballard_PosScale() : minScale(0.5), maxScale(2.0), scaleStep(0.05)
+    {
+    }
+
+    // Accumulates votes over position and scale.
+    // The accumulator is allocated as (scaleRange + 2) * (rows + 2) rows by (cols + 2)
+    // columns, where rows/cols are the image dimensions scaled by 1/dp and rounded up.
+    // It is cleared first and stays all-zero when the image has no edge points.
+    void GHT_Ballard_PosScale::calcHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
+        CV_Assert(dp > 0.0);
+        CV_Assert(minScale > 0.0 && minScale < maxScale);
+        CV_Assert(scaleStep > 0.0);
+
+        const double idp = 1.0 / dp;
+        const int rows = cvCeil(imageSize.height * idp);
+        const int cols = cvCeil(imageSize.width * idp);
+        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
+
+        buildEdgePointList(imageEdges, imageDx, imageDy);
+
+        ensureSizeIsEnough((scaleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
+        hist.setTo(Scalar::all(0));
+
+        const int pointsCount = edgePointList.cols;
+        if (pointsCount <= 0)
+            return;
+
+        unsigned int* const coordList = edgePointList.ptr<unsigned int>(0);
+        float* const thetaList = edgePointList.ptr<float>(1);
+
+        GHT_Ballard_PosScale_calcHist_gpu(coordList, thetaList, pointsCount,
+                                          r_table, r_sizes.ptr<int>(),
+                                          hist, rows, cols,
+                                          (float)minScale, (float)scaleStep, scaleRange, (float)dp, levels);
+    }
+
+    // Extracts position/scale detections from the accumulator into outBuf and stores
+    // the value returned by the kernel wrapper in posCount. The accumulator geometry
+    // is recomputed with the same formulas as in calcHist().
+    void GHT_Ballard_PosScale::findPosInHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(votesThreshold > 0);
+
+        const double idp = 1.0 / dp;
+        const int rows = cvCeil(imageSize.height * idp);
+        const int cols = cvCeil(imageSize.width * idp);
+        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
+
+        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+
+        float4* const posRow = outBuf.ptr<float4>(0);
+        int3* const voteRow = outBuf.ptr<int3>(1);
+
+        posCount = GHT_Ballard_PosScale_findPosInHist_gpu(hist, rows, cols, scaleRange, posRow, voteRow, maxSize,
+                                                          (float)minScale, (float)scaleStep, (float)dp, votesThreshold);
+    }
+
+    /////////////////////////////////////
+    // POSITION & Rotation
+
+    // Position + rotation detector: reuses the R-table of GHT_Ballard_Pos and replaces
+    // the two histogram stages with versions that also search over a range of angles.
+    class GHT_Ballard_PosRotation : public GHT_Ballard_Pos
+    {
+    public:
+        AlgorithmInfo* info() const;
+
+        GHT_Ballard_PosRotation();
+
+    protected:
+        void calcHist();
+        void findPosInHist();
+
+        // Rotation search range [minAngle, maxAngle] and its step, in degrees.
+        double minAngle;
+        double maxAngle;
+        double angleStep;
+    };
+
+    // Algorithm description for GHT_Ballard_PosRotation under the name "GeneralizedHough_GPU.POSITION_ROTATION".
+    // Each addParam call publishes one member under the given parameter name with its help text.
+    CV_INIT_ALGORITHM(GHT_Ballard_PosRotation, "GeneralizedHough_GPU.POSITION_ROTATION",
+                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
+                                           "Maximal size of inner buffers.");
+                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
+                                           "Minimum distance between the centers of the detected objects.");
+                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
+                                           "R-Table levels.");
+                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
+                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
+                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
+                                           "Inverse ratio of the accumulator resolution to the image resolution.");
+                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
+                                           "Minimal rotation angle to detect in degrees.");
+                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
+                                           "Maximal rotation angle to detect in degrees.");
+                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
+                                           "Angle step in degrees."));
+
+    // Default rotation search: the full circle, 0 to 360 degrees in steps of 1 degree.
+    GHT_Ballard_PosRotation::GHT_Ballard_PosRotation() : minAngle(0.0), maxAngle(360.0), angleStep(1.0)
+    {
+    }
+
+    // Accumulates votes over position and rotation.
+    // The accumulator is allocated as (angleRange + 2) * (rows + 2) rows by (cols + 2)
+    // columns, where rows/cols are the image dimensions scaled by 1/dp and rounded up.
+    // It is cleared first and stays all-zero when the image has no edge points.
+    void GHT_Ballard_PosRotation::calcHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
+        CV_Assert(dp > 0.0);
+        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
+        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
+
+        const double idp = 1.0 / dp;
+        const int rows = cvCeil(imageSize.height * idp);
+        const int cols = cvCeil(imageSize.width * idp);
+        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
+
+        buildEdgePointList(imageEdges, imageDx, imageDy);
+
+        ensureSizeIsEnough((angleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
+        hist.setTo(Scalar::all(0));
+
+        const int pointsCount = edgePointList.cols;
+        if (pointsCount <= 0)
+            return;
+
+        unsigned int* const coordList = edgePointList.ptr<unsigned int>(0);
+        float* const thetaList = edgePointList.ptr<float>(1);
+
+        GHT_Ballard_PosRotation_calcHist_gpu(coordList, thetaList, pointsCount,
+                                             r_table, r_sizes.ptr<int>(),
+                                             hist, rows, cols,
+                                             (float)minAngle, (float)angleStep, angleRange, (float)dp, levels);
+    }
+
+    // Extracts position/rotation detections from the accumulator into outBuf and stores
+    // the value returned by the kernel wrapper in posCount. The accumulator geometry
+    // is recomputed with the same formulas as in calcHist().
+    void GHT_Ballard_PosRotation::findPosInHist()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(votesThreshold > 0);
+
+        const double idp = 1.0 / dp;
+        const int rows = cvCeil(imageSize.height * idp);
+        const int cols = cvCeil(imageSize.width * idp);
+        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
+
+        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+
+        float4* const posRow = outBuf.ptr<float4>(0);
+        int3* const voteRow = outBuf.ptr<int3>(1);
+
+        posCount = GHT_Ballard_PosRotation_findPosInHist_gpu(hist, rows, cols, angleRange, posRow, voteRow, maxSize,
+                                                             (float)minAngle, (float)angleStep, (float)dp, votesThreshold);
+    }
+
+    /////////////////////////////////////////
+    // POSITION & SCALE & ROTATION
+
+    // Converts an angle from degrees to radians.
+    double toRad(double a)
+    {
+        // Same evaluation order as (a * CV_PI) / 180.0.
+        const double scaled = a * CV_PI;
+        return scaled / 180.0;
+    }
+
+    // Brings an angle in degrees into the range [0, 360] by adding or subtracting
+    // whole turns. Exactly 360 is returned unchanged (it is not folded to 0).
+    double clampAngle(double a)
+    {
+        double res = a;
+
+        while (res > 360.0)
+            res -= 360.0;
+        while (res < 0)
+            res += 360.0;
+
+        return res;
+    }
+
+    // Returns true when the angles a and b (degrees) are at most eps degrees apart
+    // on the circle. The distance is taken along the shorter arc, so the result does
+    // not depend on the argument order: 359.5 and 0 are 0.5 degrees apart either way.
+    bool angleEq(double a, double b, double eps = 1.0)
+    {
+        const double diff = clampAngle(a - b);   // in [0, 360]
+
+        // A small negative difference wraps to just below 360; compare the shorter arc.
+        return std::min(diff, 360.0 - diff) <= eps;
+    }
+
+    // Detector for position, scale and rotation together.
+    // Feature tables are built for the template and for the image; processImage() then
+    // finds candidate angles, for each angle candidate scales, and for each
+    // (angle, scale) pair the positions.
+    class GHT_Guil_Full : public GHT_Pos
+    {
+    public:
+        AlgorithmInfo* info() const;
+
+        GHT_Guil_Full();
+
+    protected:
+        void releaseImpl();
+
+        void processTempl();
+        void processImage();
+
+        // Device tables describing the features of a template or of an image.
+        struct Feature
+        {
+            // p1_pos and p2_pos are allocated for images only, r1 and r2 for
+            // templates only; p1_theta and d12 are allocated for both (see create()).
+            GpuMat p1_pos;
+            GpuMat p1_theta;
+            GpuMat p2_pos;
+
+            GpuMat d12;
+
+            GpuMat r1;
+            GpuMat r2;
+
+            // Per-level counters (1 x (levels + 1), CV_32SC1). maxSize is reset to 0 by
+            // create()/release(); processTempl() sets it to the largest counter.
+            GpuMat sizes;
+            int maxSize;
+
+            void create(int levels, int maxCapacity, bool isTempl);
+            void release();
+        };
+
+        // Signatures of the device-side wrappers passed to buildFeatureList():
+        // set_func_t receives the six feature tables, build_func_t fills them
+        // from an edge point list.
+        typedef void (*set_func_t)(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
+        typedef void (*build_func_t)(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                     int* sizes, int maxSize,
+                                     float xi, float angleEpsilon, int levels,
+                                     float2 center, float maxDist);
+
+        void buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
+                              set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center = Point2d());
+
+        // The three stages of processImage().
+        void calcOrientation();
+        void calcScale(double angle);
+        void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
+
+        // Feature parameters: angle difference between the two points of a feature
+        // (degrees), number of feature table levels, tolerance for equal angles.
+        double xi;
+        int levels;
+        double angleEpsilon;
+
+        // Rotation search range and step (degrees) and its vote threshold.
+        double minAngle;
+        double maxAngle;
+        double angleStep;
+        int angleThresh;
+
+        // Scale search range and step and its vote threshold.
+        double minScale;
+        double maxScale;
+        double scaleStep;
+        int scaleThresh;
+
+        // Accumulator resolution ratio and the position vote threshold.
+        double dp;
+        int posThresh;
+
+        Feature templFeatures;
+        Feature imageFeatures;
+
+        // Candidates found by calcOrientation()/calcScale() as (value, votes) pairs.
+        std::vector< std::pair<double, int> > angles;
+        std::vector< std::pair<double, int> > scales;
+
+        // Device histogram reused by all three stages, and a host buffer used to read
+        // back the template counters and the first histogram row.
+        GpuMat hist;
+        std::vector<int> h_buf;
+    };
+
+    // Algorithm description for GHT_Guil_Full under the name "GeneralizedHough_GPU.POSITION_SCALE_ROTATION".
+    // Each addParam call publishes one member under the given parameter name with its help text.
+    CV_INIT_ALGORITHM(GHT_Guil_Full, "GeneralizedHough_GPU.POSITION_SCALE_ROTATION",
+                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
+                                           "Minimum distance between the centers of the detected objects.");
+                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
+                                           "Maximal size of inner buffers.");
+                      obj.info()->addParam(obj, "xi", obj.xi, false, 0, 0,
+                                           "Angle difference in degrees between two points in feature.");
+                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
+                                           "Feature table levels.");
+                      obj.info()->addParam(obj, "angleEpsilon", obj.angleEpsilon, false, 0, 0,
+                                           "Maximal difference between angles that treated as equal.");
+                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
+                                           "Minimal rotation angle to detect in degrees.");
+                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
+                                           "Maximal rotation angle to detect in degrees.");
+                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
+                                           "Angle step in degrees.");
+                      obj.info()->addParam(obj, "angleThresh", obj.angleThresh, false, 0, 0,
+                                           "Angle threshold.");
+                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
+                                           "Minimal scale to detect.");
+                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
+                                           "Maximal scale to detect.");
+                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
+                                           "Scale step.");
+                      obj.info()->addParam(obj, "scaleThresh", obj.scaleThresh, false, 0, 0,
+                                           "Scale threshold.");
+                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
+                                           "Inverse ratio of the accumulator resolution to the image resolution.");
+                      obj.info()->addParam(obj, "posThresh", obj.posThresh, false, 0, 0,
+                                           "Position threshold."));
+
+    // Sets the default parameters of the full (position, scale, rotation) detector.
+    GHT_Guil_Full::GHT_Guil_Full() :
+        xi(90.0), levels(360), angleEpsilon(1.0),
+        minAngle(0.0), maxAngle(360.0), angleStep(1.0), angleThresh(15000),
+        minScale(0.5), maxScale(2.0), scaleStep(0.05), scaleThresh(1000),
+        dp(1.0), posThresh(100)
+    {
+        // Member of GHT_Pos: replaces the value assigned by the base constructor.
+        maxSize = 1000;
+    }
+
+    // Releases the base-class buffers first, then the feature tables, the candidate
+    // lists and the histogram buffers of this class.
+    void GHT_Guil_Full::releaseImpl()
+    {
+        GHT_Pos::releaseImpl();
+
+        // Device memory.
+        imageFeatures.release();
+        templFeatures.release();
+        hist.release();
+
+        // Host memory.
+        releaseVector(h_buf);
+        releaseVector(scales);
+        releaseVector(angles);
+    }
+
+    // Builds the template feature tables and records the largest per-level feature
+    // count in templFeatures.maxSize.
+    void GHT_Guil_Full::processTempl()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        buildFeatureList(templEdges, templDx, templDy, templFeatures,
+            GHT_Guil_Full_setTemplFeatures, GHT_Guil_Full_buildTemplFeatureList_gpu,
+            true, templCenter);
+
+        // Read the per-level counters back to the host. Their maximum is later passed
+        // to the histogram kernel wrappers as tMaxSize.
+        h_buf.resize(templFeatures.sizes.cols);
+        cudaSafeCall( cudaMemcpy(&h_buf[0], templFeatures.sizes.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+
+        // Qualified on purpose: an unqualified call compiles only when argument-dependent
+        // lookup happens to work through the vector iterator type.
+        templFeatures.maxSize = *std::max_element(h_buf.begin(), h_buf.end());
+    }
+
+    // Detection step of the full detector.
+    // After validating the parameters it sizes the shared buffers, builds the image
+    // feature tables and runs the three nested stages: candidate angles, candidate
+    // scales per angle, positions per (angle, scale) pair. Detections accumulate in
+    // outBuf/posCount across the calcPosition() calls.
+    void GHT_Guil_Full::processImage()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        CV_Assert(levels > 0);
+        // The template tables must have been built with the current number of levels.
+        CV_Assert(templFeatures.sizes.cols == levels + 1);
+        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
+        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
+        CV_Assert(angleThresh > 0);
+        CV_Assert(minScale > 0.0 && minScale < maxScale);
+        CV_Assert(scaleStep > 0.0);
+        CV_Assert(scaleThresh > 0);
+        CV_Assert(dp > 0.0);
+        CV_Assert(posThresh > 0);
+
+        const double iAngleStep = 1.0 / angleStep;
+        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
+
+        const double iScaleStep = 1.0 / scaleStep;
+        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
+
+        const double idp = 1.0 / dp;
+        const int histRows = cvCeil(imageSize.height * idp);
+        const int histCols = cvCeil(imageSize.width * idp);
+
+        // One histogram serves all three stages, so it is made wide enough for the
+        // orientation bins, the scale bins and a position accumulator row alike.
+        ensureSizeIsEnough(histRows + 2, std::max(angleRange + 1, std::max(scaleRange + 1, histCols + 2)), CV_32SC1, hist);
+        // Host buffer for reading back the orientation or the scale histogram.
+        h_buf.resize(std::max(angleRange + 1, scaleRange + 1));
+
+        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+
+        buildFeatureList(imageEdges, imageDx, imageDy, imageFeatures,
+            GHT_Guil_Full_setImageFeatures, GHT_Guil_Full_buildImageFeatureList_gpu,
+            false);
+
+        // Fills `angles` with the candidate rotations.
+        calcOrientation();
+
+        for (size_t i = 0; i < angles.size(); ++i)
+        {
+            const double angle = angles[i].first;
+            const int angleVotes = angles[i].second;
+
+            // Refills `scales` with the candidate scales for this angle.
+            calcScale(angle);
+
+            for (size_t j = 0; j < scales.size(); ++j)
+            {
+                const double scale = scales[j].first;
+                const int scaleVotes = scales[j].second;
+
+                calcPosition(angle, angleVotes, scale, scaleVotes);
+            }
+        }
+    }
+
+    // Allocates the feature tables with (levels + 1) rows of maxCapacity entries.
+    // Templates get r1/r2, images get p1_pos/p2_pos; p1_theta and d12 are allocated
+    // for both. The per-level counters are zeroed and maxSize is reset to 0.
+    void GHT_Guil_Full::Feature::create(int levels, int maxCapacity, bool isTempl)
+    {
+        const int tableRows = levels + 1;
+
+        // Tables needed for templates and images alike.
+        ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC1, p1_theta);
+        ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC1, d12);
+
+        if (isTempl)
+        {
+            ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC2, r1);
+            ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC2, r2);
+        }
+        else
+        {
+            ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC2, p1_pos);
+            ensureSizeIsEnough(tableRows, maxCapacity, CV_32FC2, p2_pos);
+        }
+
+        ensureSizeIsEnough(1, tableRows, CV_32SC1, sizes);
+        sizes.setTo(Scalar::all(0));
+
+        maxSize = 0;
+    }
+
+    // Releases every feature table and resets maxSize to 0.
+    void GHT_Guil_Full::Feature::release()
+    {
+        maxSize = 0;
+        sizes.release();
+
+        r2.release();
+        r1.release();
+
+        d12.release();
+
+        p2_pos.release();
+        p1_theta.release();
+        p1_pos.release();
+    }
+
+    // Builds the feature tables for a template or an image.
+    // (Re)creates `features`, hands its tables to set_func, extracts the edge points and,
+    // when there are any, calls build_func to fill the tables. maxDist passed to
+    // build_func is the template diagonal multiplied by maxScale.
+    void GHT_Guil_Full::buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
+                                         set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center)
+    {
+        CV_Assert(levels > 0);
+
+        // The width term is squared in double, the height term in int, as before.
+        const double widthSq = (double) templSize.width * templSize.width;
+        const int heightSq = templSize.height * templSize.height;
+        const double maxDist = sqrt(widthSq + heightSq) * maxScale;
+
+        features.create(levels, maxSize, isTempl);
+        set_func(features.p1_pos, features.p1_theta, features.p2_pos, features.d12, features.r1, features.r2);
+
+        buildEdgePointList(edges, dx, dy);
+
+        const int pointsCount = edgePointList.cols;
+        if (pointsCount <= 0)
+            return;
+
+        const float2 featureCenter = make_float2((float)center.x, (float)center.y);
+
+        build_func(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), pointsCount,
+            features.sizes.ptr<int>(), maxSize, (float)xi, (float)angleEpsilon, levels, featureCenter, (float)maxDist);
+    }
+
+    // Finds the candidate rotation angles.
+    // Computes the orientation histogram on the device, reads its first row back and
+    // stores every bin with at least angleThresh votes in `angles` as (angle, votes).
+    void GHT_Guil_Full::calcOrientation()
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        const double iAngleStep = 1.0 / angleStep;
+        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
+
+        hist.setTo(Scalar::all(0));
+        GHT_Guil_Full_calcOHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
+            hist.ptr<int>(), (float)minAngle, (float)maxAngle, (float)angleStep, angleRange, levels, templFeatures.maxSize);
+        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+
+        angles.clear();
+
+        for (int bin = 0; bin < angleRange; ++bin)
+        {
+            const int votes = h_buf[bin];
+            if (votes < angleThresh)
+                continue;
+
+            angles.push_back(std::make_pair(minAngle + bin * angleStep, votes));
+        }
+    }
+
+    // Finds the candidate scales for the given rotation angle.
+    // Computes the scale histogram on the device, reads its first row back and stores
+    // every bin with at least scaleThresh votes in `scales` as (scale, votes).
+    void GHT_Guil_Full::calcScale(double angle)
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        const double iScaleStep = 1.0 / scaleStep;
+        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
+
+        hist.setTo(Scalar::all(0));
+        GHT_Guil_Full_calcSHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
+            hist.ptr<int>(), (float)angle, (float)angleEpsilon, (float)minScale, (float)maxScale, (float)iScaleStep, scaleRange, levels, templFeatures.maxSize);
+        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+
+        scales.clear();
+
+        for (int bin = 0; bin < scaleRange; ++bin)
+        {
+            const int votes = h_buf[bin];
+            if (votes < scaleThresh)
+                continue;
+
+            scales.push_back(std::make_pair(minScale + bin * scaleStep, votes));
+        }
+    }
+
+    // Finds positions for one (angle, scale) candidate.
+    // Computes the position histogram on the device and extracts detections into outBuf;
+    // the current posCount is passed in and replaced by the value the wrapper returns.
+    void GHT_Guil_Full::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
+    {
+        using namespace cv::gpu::cudev::hough;
+
+        const float angleF = (float)angle;
+        const float scaleF = (float)scale;
+        const float dpF = (float)dp;
+
+        hist.setTo(Scalar::all(0));
+        GHT_Guil_Full_calcPHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0),
+            hist, angleF, (float)angleEpsilon, scaleF, dpF, levels, templFeatures.maxSize);
+
+        posCount = GHT_Guil_Full_findPosInHist_gpu(hist, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1),
+            posCount, maxSize, angleF, angleVotes, scaleF, scaleVotes, dpF, posThresh);
+    }
+}
+
+// Factory: returns the detector matching the given combination of
+// cv::GeneralizedHough::GHT_POSITION / GHT_SCALE / GHT_ROTATION flags.
+// Any other value raises StsBadArg ("Unsupported method").
+Ptr<GeneralizedHough_GPU> cv::gpu::GeneralizedHough_GPU::create(int method)
+{
+    const int positionOnly = cv::GeneralizedHough::GHT_POSITION;
+    const int withScale = cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE;
+    const int withRotation = cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_ROTATION;
+    const int withBoth = cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE | cv::GeneralizedHough::GHT_ROTATION;
+
+    // Each branch first checks that the algorithm info of the class has a name.
+    if (method == positionOnly)
+    {
+        CV_Assert( !GHT_Ballard_Pos_info_auto.name().empty() );
+        return new GHT_Ballard_Pos();
+    }
+
+    if (method == withScale)
+    {
+        CV_Assert( !GHT_Ballard_PosScale_info_auto.name().empty() );
+        return new GHT_Ballard_PosScale();
+    }
+
+    if (method == withRotation)
+    {
+        CV_Assert( !GHT_Ballard_PosRotation_info_auto.name().empty() );
+        return new GHT_Ballard_PosRotation();
+    }
+
+    if (method == withBoth)
+    {
+        CV_Assert( !GHT_Guil_Full_info_auto.name().empty() );
+        return new GHT_Guil_Full();
+    }
+
+    CV_Error(cv::Error::StsBadArg, "Unsupported method");
+    return Ptr<GeneralizedHough_GPU>();
+}
+
+// Nothing to do explicitly: the body is empty and the members clean up after themselves.
+cv::gpu::GeneralizedHough_GPU::~GeneralizedHough_GPU()
+{
+}
+
+// Sets the template from a CV_8UC1 image.
+// Edges and gradients are computed with Canny using cannyThreshold / 2 (integer
+// division) and cannyThreshold as the two thresholds. A templCenter of (-1, -1)
+// selects the centre of the image.
+void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat& templ, int cannyThreshold, Point templCenter)
+{
+    CV_Assert(templ.type() == CV_8UC1);
+    CV_Assert(cannyThreshold > 0);
+
+    const int lowThreshold = cannyThreshold / 2;
+
+    ensureSizeIsEnough(templ.size(), CV_8UC1, edges_);
+    Canny(templ, cannyBuf_, edges_, lowThreshold, cannyThreshold);
+
+    const Point unspecified(-1, -1);
+    const Point center = (templCenter == unspecified) ? Point(templ.cols / 2, templ.rows / 2) : templCenter;
+
+    setTemplateImpl(edges_, cannyBuf_.dx, cannyBuf_.dy, center);
+}
+
+// Sets the template from a precomputed edge map and gradients.
+// A templCenter of (-1, -1) selects the centre of the edge map.
+void cv::gpu::GeneralizedHough_GPU::setTemplate(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter)
+{
+    const Point unspecified(-1, -1);
+    const Point center = (templCenter == unspecified) ? Point(edges.cols / 2, edges.rows / 2) : templCenter;
+
+    setTemplateImpl(edges, dx, dy, center);
+}
+
+// Detects the template in a CV_8UC1 image.
+// Edges and gradients are computed with Canny using cannyThreshold / 2 (integer
+// division) and cannyThreshold as the two thresholds; the result goes to positions.
+void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat& image, GpuMat& positions, int cannyThreshold)
+{
+    CV_Assert(image.type() == CV_8UC1);
+    CV_Assert(cannyThreshold > 0);
+
+    const int lowThreshold = cannyThreshold / 2;
+
+    ensureSizeIsEnough(image.size(), CV_8UC1, edges_);
+    Canny(image, cannyBuf_, edges_, lowThreshold, cannyThreshold);
+
+    detectImpl(edges_, cannyBuf_.dx, cannyBuf_.dy, positions);
+}
+
+// Detects the template using a precomputed edge map and gradients;
+// forwards directly to detectImpl().
+void cv::gpu::GeneralizedHough_GPU::detect(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, GpuMat& positions)
+{
+    detectImpl(edges, dx, dy, positions);
+}
+
+// Copies detection results from the device to host arrays.
+// d_positions must be empty or a 2-row CV_32FC4 matrix: row 0 is downloaded to
+// h_positions_ (1 x N, CV_32FC4) and, when h_votes_ is requested, row 1 is
+// downloaded to it as 1 x N CV_32SC3. For empty input the outputs are released.
+void cv::gpu::GeneralizedHough_GPU::download(const GpuMat& d_positions, OutputArray h_positions_, OutputArray h_votes_)
+{
+    const bool wantVotes = h_votes_.needed();
+
+    if (d_positions.empty())
+    {
+        h_positions_.release();
+        if (wantVotes)
+            h_votes_.release();
+        return;
+    }
+
+    CV_Assert(d_positions.rows == 2 && d_positions.type() == CV_32FC4);
+
+    const int count = d_positions.cols;
+
+    h_positions_.create(1, count, CV_32FC4);
+    Mat h_positions = h_positions_.getMat();
+    d_positions.row(0).download(h_positions);
+
+    if (!wantVotes)
+        return;
+
+    h_votes_.create(1, count, CV_32SC3);
+    Mat h_votes = h_votes_.getMat();
+
+    // Header over the second row, viewed as three ints per detection.
+    GpuMat d_votes(1, count, CV_32SC3, const_cast<int3*>(d_positions.ptr<int3>(1)));
+    d_votes.download(h_votes);
+}
+
+// Releases the edge map and the Canny buffers held by this class, then lets the
+// concrete detector release its own data through releaseImpl().
+void cv::gpu::GeneralizedHough_GPU::release()
+{
+    edges_.release();
+    cannyBuf_.release();
+    releaseImpl();
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/src/imgproc.cpp b/modules/gpuimgproc/src/imgproc.cpp
new file mode 100644
index 0000000000..c21a7b837d
--- /dev/null
+++ b/modules/gpuimgproc/src/imgproc.cpp
@@ -0,0 +1,1181 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+// Built without HAVE_CUDA (or with CUDA_DISABLER defined): every entry point below only calls throw_no_cuda().
+void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
+void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
+void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_no_cuda(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_no_cuda(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat*, GpuMat&, int*, int*, int*, Stream&) { throw_no_cuda(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_no_cuda(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
+void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
+void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
+void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
+void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
+void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
+void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_no_cuda(); }
+void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_no_cuda(); }
+void cv::gpu::CannyBuf::create(const Size&, int) { throw_no_cuda(); }
+void cv::gpu::CannyBuf::release() { throw_no_cuda(); }
+cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::gpu::CLAHE>(); }
+void cv::gpu::alphaComp(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+
+#else /* !defined (HAVE_CUDA) */
+
+////////////////////////////////////////////////////////////////////////
+// meanShiftFiltering_GPU
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: the body of this routine is not in the visible part of this file
+    {
+        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); // called by cv::gpu::meanShiftFiltering
+    }
+}}}
+
+void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
+{
+    using namespace ::cv::gpu::cudev::imgproc;
+    // Checks the input, allocates dst as CV_8UC4, resolves the stop criteria and calls meanShiftFiltering_gpu on the stream.
+    if( src.empty() )
+        CV_Error( cv::Error::StsBadArg, "The input image is empty" );
+
+    if( src.depth() != CV_8U || src.channels() != 4 )
+        CV_Error( cv::Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
+
+    dst.create( src.size(), CV_8UC4 );
+    // Iteration count: 5 when MAX_ITER is not set in criteria.type; always clamped to [1, 100].
+    if( !(criteria.type & TermCriteria::MAX_ITER) )
+        criteria.maxCount = 5;
+
+    const int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
+    // Epsilon: 1 when EPS is not set (this default used to be overwritten unconditionally); never negative.
+    if( !(criteria.type & TermCriteria::EPS) )
+        criteria.epsilon = 1.0;
+
+    const float eps = (float)std::max(criteria.epsilon, 0.0);
+
+    meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
+}
+
+////////////////////////////////////////////////////////////////////////
+// meanShiftProc_GPU
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: the body of this routine is not in the visible part of this file
+    {
+        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream); // called by cv::gpu::meanShiftProc
+    }
+}}}
+
+void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
+{
+    using namespace ::cv::gpu::cudev::imgproc;
+    // Checks the input, allocates dstr (CV_8UC4) and dstsp (CV_16SC2), resolves the stop criteria and calls meanShiftProc_gpu.
+    if( src.empty() )
+        CV_Error( cv::Error::StsBadArg, "The input image is empty" );
+
+    if( src.depth() != CV_8U || src.channels() != 4 )
+        CV_Error( cv::Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
+
+    dstr.create( src.size(), CV_8UC4 );
+    dstsp.create( src.size(), CV_16SC2 );
+    // Iteration count: 5 when MAX_ITER is not set in criteria.type; always clamped to [1, 100].
+    if( !(criteria.type & TermCriteria::MAX_ITER) )
+        criteria.maxCount = 5;
+
+    const int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
+    // Epsilon: 1 when EPS is not set (this default used to be overwritten unconditionally); never negative.
+    if( !(criteria.type & TermCriteria::EPS) )
+        criteria.epsilon = 1.0;
+
+    const float eps = (float)std::max(criteria.epsilon, 0.0);
+
+    meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
+}
+
+////////////////////////////////////////////////////////////////////////
+// drawColorDisp
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declarations only: two overloads that differ in the element type of src
+    {
+        void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream); // byte-element source
+        void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream); // short-element source
+    }
+}}}
+
+namespace
+{
+    // Signature shared by every entry of the dispatch table at the end of this namespace.
+    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
+    // Allocates dst as CV_8UC4 and passes a PtrStepSz<T> view of src to drawColorDisp_gpu.
+    template <typename T>
+    void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
+    {
+        using namespace ::cv::gpu::cudev::imgproc;
+        dst.create(src.size(), CV_8UC4);
+        drawColorDisp_gpu(static_cast<PtrStepSz<T> >(src), dst, ndisp, stream);
+    }
+
+    // Indexed by src.type() in cv::gpu::drawColorDisp: only slots 0 (unsigned char) and 3 (short) are populated.
+    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
+}
+
+void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
+{
+    CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
+    const drawColorDisp_caller_t caller = drawColorDisp_callers[src.type()]; // populated for both types accepted above
+    caller(src, dst, ndisp, StreamAccessor::getStream(stream));
+}
+
+////////////////////////////////////////////////////////////////////////
+// reprojectImageTo3D
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: instantiations are expected to come from outside the visible part of this file
+    {
+        template <typename T, typename D> // T: disparity element type, D: output element type (float3 / float4 in the caller)
+        void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+    // Argument checks first; each assertion keeps its original expression.
+    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S);
+    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous());
+    CV_Assert(dst_cn == 3 || dst_cn == 4);
+
+    // Row 0 is instantiated with float3, row 1 with float4; columns are indexed by disp.type().
+    typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    static const func_t funcs[2][4] =
+    {
+        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
+        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
+    };
+    const func_t kernel = funcs[dst_cn == 4 ? 1 : 0][disp.type()];
+    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
+    kernel(disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpPlaneMaps
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: the body of this routine is not in the visible part of this file
+    {
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
+                                cudaStream_t stream); // same name as cv::gpu::buildWarpPlaneMaps, so that caller qualifies it
+    }
+}}}
+
+void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
+                                 float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
+{
+    (void)src_size; // not used by this implementation
+    using namespace ::cv::gpu::cudev::imgproc;
+    // K and R must be 3x3 CV_32F; T must be a continuous 3-element CV_32F row or column.
+    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
+    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
+    CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());
+    // Host-side products K * R^T and R * K^-1; both must be continuous because raw float pointers are passed on.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert(K_Rinv.isContinuous());
+    CV_Assert(R_Kinv.isContinuous());
+    // Both maps are allocated as CV_32F with the size of dst_roi.
+    const Size mapSize = dst_roi.size();
+    map_x.create(mapSize, CV_32F);
+    map_y.create(mapSize, CV_32F);
+    cudev::imgproc::buildWarpPlaneMaps(dst_roi.x, dst_roi.y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), T.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpCylindricalMaps
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: the body of this routine is not in the visible part of this file
+    {
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                      const float k_rinv[9], const float r_kinv[9], float scale,
+                                      cudaStream_t stream); // same name as cv::gpu::buildWarpCylindricalMaps, so that caller qualifies it
+    }
+}}}
+
+void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
+                                       GpuMat& map_x, GpuMat& map_y, Stream& stream)
+{
+    (void)src_size; // not used by this implementation
+    using namespace ::cv::gpu::cudev::imgproc;
+    // K and R must be 3x3 CV_32F matrices.
+    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
+    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
+    // Host-side products K * R^T and R * K^-1; both must be continuous because raw float pointers are passed on.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert(K_Rinv.isContinuous());
+    CV_Assert(R_Kinv.isContinuous());
+    const Size mapSize = dst_roi.size();
+    map_x.create(mapSize, CV_32F);
+    map_y.create(mapSize, CV_32F);
+    cudev::imgproc::buildWarpCylindricalMaps(dst_roi.x, dst_roi.y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpSphericalMaps
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc // declaration only: the body of this routine is not in the visible part of this file
+    {
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                    const float k_rinv[9], const float r_kinv[9], float scale,
+                                    cudaStream_t stream); // same name as cv::gpu::buildWarpSphericalMaps, so that caller qualifies it
+    }
+}}}
+
+void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
+                                     GpuMat& map_x, GpuMat& map_y, Stream& stream)
+{
+    (void)src_size; // not used by this implementation
+    using namespace ::cv::gpu::cudev::imgproc;
+    // K and R must be 3x3 CV_32F matrices.
+    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
+    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
+    // Host-side products K * R^T and R * K^-1; both must be continuous because raw float pointers are passed on.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert(K_Rinv.isContinuous());
+    CV_Assert(R_Kinv.isContinuous());
+    const Size mapSize = dst_roi.size();
+    map_x.create(mapSize, CV_32F);
+    map_y.create(mapSize, CV_32F);
+    cudev::imgproc::buildWarpSphericalMaps(dst_roi.x, dst_roi.y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+////////////////////////////////////////////////////////////////////////
+// rotate
+
+namespace
+{
+    template<int DEPTH> struct NppTypeTraits; // maps an OpenCV depth constant to the NPP element type
+    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
+    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
+    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
+    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
+    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
+    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
+    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
+    // Function-pointer type matching the nppiRotate_* functions for a given depth.
+    template <int DEPTH> struct NppRotateFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
+                                    npp_t* pDst, int nDstStep, NppiRect oDstROI,
+                                    double nAngle, double nShiftX, double nShiftY, int eInterpolation);
+    };
+    // Adapter that exposes one NPP rotate function through a uniform static call().
+    template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
+    {
+        typedef typename NppRotateFunc<DEPTH>::npp_t npp_t;
+
+        static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
+        {
+            (void)dsize;
+            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+            NppStreamHandler h(stream);
+            // Source size and source ROI both describe the whole of src.
+            NppiSize srcSize;
+            srcSize.width = src.cols;
+            srcSize.height = src.rows;
+            NppiRect srcRect;
+            srcRect.x = srcRect.y = 0;
+            srcRect.width = src.cols;
+            srcRect.height = src.rows;
+            // The destination ROI covers all of dst.
+            NppiRect dstRect;
+            dstRect.x = dstRect.y = 0;
+            dstRect.width = dst.cols;
+            dstRect.height = dst.rows;
+
+            const int nppMode = npp_inter[interpolation];
+            nppSafeCall( func(src.ptr<npp_t>(), srcSize, static_cast<int>(src.step), srcRect,
+                dst.ptr<npp_t>(), static_cast<int>(dst.step), dstRect, angle, xShift, yShift, nppMode) );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
+    // Rows are indexed by src.depth(), columns by src.channels() - 1; unsupported combinations are null.
+    static const func_t funcs[6][4] =
+    {
+        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
+        {0,0,0,0},
+        {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
+        {0,0,0,0},
+        {0,0,0,0},
+        {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
+    };
+
+    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    // dst is allocated with the requested size and zero-filled before the NPP call.
+    dst.create(dsize, src.type());
+    dst.setTo(Scalar::all(0));
+    const func_t rotateImpl = funcs[src.depth()][src.channels() - 1];
+    rotateImpl(src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// Histogram
+
+namespace
+{
+    typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
+    typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
+    // Function-pointer types matching the nppiHistogramEven_* functions (one and four channels).
+    template<int SDEPTH> struct NppHistogramEvenFuncC1
+    {
+        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
+            int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
+    };
+    template<int SDEPTH> struct NppHistogramEvenFuncC4
+    {
+        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
+            Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
+    };
+    // Single-channel even-bin histogram: sizes hist and the scratch buffer, then calls the NPP function.
+    template<int SDEPTH, typename NppHistogramEvenFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
+    struct NppHistogramEvenC1
+    {
+        typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
+
+        static void hist(const GpuMat& src, GpuMat& hist, GpuMat& buffer, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+        {
+            int levels = histSize + 1;
+            hist.create(1, histSize, CV_32S);
+
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            // The size query is checked like every other NPP call, so buf_size is never used unset.
+            int buf_size;
+            nppSafeCall( get_buf_size(sz, levels, &buf_size) );
+
+            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
+
+            NppStreamHandler h(stream);
+
+            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
+                lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    template<int SDEPTH, typename NppHistogramEvenFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
+    struct NppHistogramEvenC4
+    {
+        typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
+
+        static void hist(const GpuMat& src, GpuMat hist[4], GpuMat& buffer, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
+        {
+            int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
+            hist[0].create(1, histSize[0], CV_32S);
+            hist[1].create(1, histSize[1], CV_32S);
+            hist[2].create(1, histSize[2], CV_32S);
+            hist[3].create(1, histSize[3], CV_32S);
+
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+
+            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
+            // The size query is checked like every other NPP call, so buf_size is never used unset.
+            int buf_size;
+            nppSafeCall( get_buf_size(sz, levels, &buf_size) );
+
+            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
+
+            NppStreamHandler h(stream);
+
+            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    // Function-pointer types for nppiHistogramRange_*: the CV_32F specialisations use Npp32f levels, the rest Npp32s.
+    template<int SDEPTH> struct NppHistogramRangeFuncC1
+    {
+        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
+        typedef Npp32s level_t;
+        enum {LEVEL_TYPE_CODE=CV_32SC1};
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
+            const Npp32s* pLevels, int nLevels, Npp8u* pBuffer);
+    };
+    template<> struct NppHistogramRangeFuncC1<CV_32F>
+    {
+        typedef Npp32f src_t;
+        typedef Npp32f level_t;
+        enum {LEVEL_TYPE_CODE=CV_32FC1};
+
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
+            const Npp32f* pLevels, int nLevels, Npp8u* pBuffer);
+    };
+    template<int SDEPTH> struct NppHistogramRangeFuncC4
+    {
+        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
+        typedef Npp32s level_t;
+        enum {LEVEL_TYPE_CODE=CV_32SC1};
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
+            const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer);
+    };
+    template<> struct NppHistogramRangeFuncC4<CV_32F>
+    {
+        typedef Npp32f src_t;
+        typedef Npp32f level_t;
+        enum {LEVEL_TYPE_CODE=CV_32FC1};
+
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
+            const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer);
+    };
+    // Single-channel histogram over caller-supplied level boundaries (levels.cols - 1 bins).
+    template<int SDEPTH, typename NppHistogramRangeFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
+    struct NppHistogramRangeC1
+    {
+        typedef typename NppHistogramRangeFuncC1<SDEPTH>::src_t src_t;
+        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
+        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
+
+        static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buffer, cudaStream_t stream)
+        {
+            CV_Assert(levels.type() == LEVEL_TYPE_CODE && levels.rows == 1);
+
+            hist.create(1, levels.cols - 1, CV_32S);
+
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            // The size query is checked like every other NPP call, so buf_size is never used unset.
+            int buf_size;
+            nppSafeCall( get_buf_size(sz, levels.cols, &buf_size) );
+
+            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
+
+            NppStreamHandler h(stream);
+
+            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    template<int SDEPTH, typename NppHistogramRangeFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
+    struct NppHistogramRangeC4
+    {
+        typedef typename NppHistogramRangeFuncC4<SDEPTH>::src_t src_t;
+        typedef typename NppHistogramRangeFuncC4<SDEPTH>::level_t level_t;
+        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC4<SDEPTH>::LEVEL_TYPE_CODE};
+
+        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buffer, cudaStream_t stream)
+        {
+            CV_Assert(levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1);
+            CV_Assert(levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1);
+            CV_Assert(levels[2].type() == LEVEL_TYPE_CODE && levels[2].rows == 1);
+            CV_Assert(levels[3].type() == LEVEL_TYPE_CODE && levels[3].rows == 1);
+
+            hist[0].create(1, levels[0].cols - 1, CV_32S);
+            hist[1].create(1, levels[1].cols - 1, CV_32S);
+            hist[2].create(1, levels[2].cols - 1, CV_32S);
+            hist[3].create(1, levels[3].cols - 1, CV_32S);
+
+            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
+            int nLevels[] = {levels[0].cols, levels[1].cols, levels[2].cols, levels[3].cols};
+            const level_t* pLevels[] = {levels[0].ptr<level_t>(), levels[1].ptr<level_t>(), levels[2].ptr<level_t>(), levels[3].ptr<level_t>()};
+
+            NppiSize sz;
+            sz.width = src.cols;
+            sz.height = src.rows;
+            // The size query is checked like every other NPP call, so buf_size is never used unset.
+            int buf_size;
+            nppSafeCall( get_buf_size(sz, nLevels, &buf_size) );
+
+            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);
+
+            NppStreamHandler h(stream);
+
+            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
+{
+    Mat levelsOnHost(1, nLevels, CV_32SC1); // filled on the host by NPP, then uploaded into levels
+    nppSafeCall( nppiEvenLevelsHost_32s(levelsOnHost.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
+    levels.upload(levelsOnHost);
+}
+
+void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
+{
+    GpuMat scratch; // working buffer local to this call, handed to the buffer-taking overload
+    histEven(src, hist, scratch, histSize, lowerLevel, upperLevel, stream);
+}
+
+void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
+{
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
+    // Table indexed by src.depth(); the null slot is a depth the assertion above does not admit.
+    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
+    static const hist_t hist_callers[] =
+    {
+        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
+        0,
+        NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
+        NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
+    };
+    const hist_t impl = hist_callers[src.depth()];
+    impl(src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
+{
+    GpuMat scratch; // working buffer local to this call, handed to the buffer-taking overload
+    histEven(src, hist, scratch, histSize, lowerLevel, upperLevel, stream);
+}
+
+void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
+{
+    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
+    // Table indexed by src.depth(); the null slot is a depth the assertion above does not admit.
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
+    static const hist_t hist_callers[] =
+    {
+        NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
+        0,
+        NppHistogramEvenC4<CV_16U, nppiHistogramEven_16u_C4R, nppiHistogramEvenGetBufferSize_16u_C4R>::hist,
+        NppHistogramEvenC4<CV_16S, nppiHistogramEven_16s_C4R, nppiHistogramEvenGetBufferSize_16s_C4R>::hist
+    };
+    const hist_t impl = hist_callers[src.depth()];
+    impl(src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream)
+{
+    GpuMat scratch; // working buffer local to this call, handed to the buffer-taking overload
+    histRange(src, hist, levels, scratch, stream);
+}
+
+void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)
+{
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);
+    // Table indexed by src.depth(); null slots are depths the assertion above does not admit.
+    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, cudaStream_t stream);
+    static const hist_t hist_callers[] =
+    {
+        NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
+        0,
+        NppHistogramRangeC1<CV_16U, nppiHistogramRange_16u_C1R, nppiHistogramRangeGetBufferSize_16u_C1R>::hist,
+        NppHistogramRangeC1<CV_16S, nppiHistogramRange_16s_C1R, nppiHistogramRangeGetBufferSize_16s_C1R>::hist,
+        0,
+        NppHistogramRangeC1<CV_32F, nppiHistogramRange_32f_C1R, nppiHistogramRangeGetBufferSize_32f_C1R>::hist
+    };
+    const hist_t impl = hist_callers[src.depth()];
+    impl(src, hist, levels, buf, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
+{
+    GpuMat scratch; // working buffer local to this call, handed to the buffer-taking overload
+    histRange(src, hist, levels, scratch, stream);
+}
+
+void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream)
+{
+    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4);
+    // Table indexed by src.depth(); null slots are depths the assertion above does not admit.
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, cudaStream_t stream);
+    static const hist_t hist_callers[] =
+    {
+        NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
+        0,
+        NppHistogramRangeC4<CV_16U, nppiHistogramRange_16u_C4R, nppiHistogramRangeGetBufferSize_16u_C4R>::hist,
+        NppHistogramRangeC4<CV_16S, nppiHistogramRange_16s_C4R, nppiHistogramRangeGetBufferSize_16s_C4R>::hist,
+        0,
+        NppHistogramRangeC4<CV_32F, nppiHistogramRange_32f_C4R, nppiHistogramRangeGetBufferSize_32f_C4R>::hist
+    };
+    const hist_t impl = hist_callers[src.depth()];
+    impl(src, hist, levels, buf, StreamAccessor::getStream(stream));
+}
+
+namespace hist // declarations only: the bodies of these routines are not in the visible part of this file
+{
+    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream); // called by cv::gpu::calcHist with a 256-entry int buffer
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); // called by cv::gpu::equalizeHist with its 256-entry table
+}
+
+void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
+{
+    CV_Assert(src.type() == CV_8UC1);
+    // The output is a 1x256 CV_32SC1 row, zeroed before histogram256 runs.
+    hist.create(1, 256, CV_32SC1);
+    hist.setTo(Scalar::all(0));
+    int* const bins = hist.ptr<int>();
+    hist::histogram256(src, bins, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    GpuMat histScratch; // receives the intermediate histogram
+    GpuMat workBuf;     // scratch storage used by the buffer-taking overload
+    equalizeHist(src, dst, histScratch, workBuf, stream);
+}
+
+void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
+{
+    CV_Assert(src.type() == CV_8UC1);
+
+    dst.create(src.size(), src.type());
+    // buf is split into two regions: the NPP integral scratch area first, then a 256-entry int table.
+    int intBufSize;
+    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
+    const size_t totalBytes = intBufSize + 256 * sizeof(int);
+    ensureSizeIsEnough(1, static_cast<int>(totalBytes), CV_8UC1, buf);
+
+    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
+    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);
+    // The histogram of src is computed on the caller's stream.
+    calcHist(src, hist, s);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    NppStreamHandler h(stream);
+    // nppsIntegral_32s writes its result for the 256 histogram entries into lut, which hist::equalizeHist then receives.
+    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
+
+    hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
+}
+
+////////////////////////////////////////////////////////////////////////
+// cornerHarris & cornerMinEigenVal
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    { // Forward declarations only; the definitions are not visible in this section of the file.
+        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
+        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
+    }
+}}}
+
+namespace
+{
+    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
+    { // Fills Dx and Dy (CV_32F, same size as src) with scaled x/y derivatives of src: Sobel when ksize > 0, Scharr otherwise.
+        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize; // a non-positive ksize is treated as 3 here
+
+        if (ksize < 0)
+            scale *= 2.;
+
+        if (src.depth() == CV_8U)
+            scale *= 255.;
+        // The derivative filters take a multiplier, so pass the reciprocal of the accumulated factor.
+        scale = 1./scale;
+
+        Dx.create(src.size(), CV_32F);
+        Dy.create(src.size(), CV_32F);
+
+        if (ksize > 0)
+        {
+            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
+            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+        }
+        else
+        {
+            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
+            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+        }
+    }
+}
+
+void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
+{ // Convenience overload: uses function-local Dx/Dy derivative images.
+    GpuMat Dx, Dy;
+    cornerHarris(src, dst, Dx, Dy, blockSize, ksize, k, borderType);
+}
+
+void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)
+{ // Convenience overload: caller keeps Dx/Dy, the filter scratch buffer is function-local.
+    GpuMat buf;
+    cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);
+}
+
+void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
+{ // Harris response: fills Dx/Dy via extractCovData, then calls cornerHarris_gpu into a CV_32F dst of src's size.
+    using namespace cv::gpu::cudev::imgproc;
+
+    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+    // gpuBorderType is what cornerHarris_gpu receives; extractCovData below still gets the original borderType.
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
+
+    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
+
+    dst.create(src.size(), CV_32F);
+
+    cornerHarris_gpu(blockSize, static_cast<float>(k), Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
+{ // Convenience overload: uses function-local Dx/Dy derivative images.
+    GpuMat Dx, Dy;
+    cornerMinEigenVal(src, dst, Dx, Dy, blockSize, ksize, borderType);
+}
+
+void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)
+{ // Convenience overload: caller keeps Dx/Dy, the filter scratch buffer is function-local.
+    GpuMat buf;
+    cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);
+}
+
+void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
+{ // Fills Dx/Dy via extractCovData, then calls cornerMinEigenVal_gpu into a CV_32F dst of src's size.
+    using namespace ::cv::gpu::cudev::imgproc;
+
+    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+    // gpuBorderType is what cornerMinEigenVal_gpu receives; extractCovData below still gets the original borderType.
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
+
+    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
+
+    dst.create(src.size(), CV_32F);
+
+    cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Canny
+
+void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
+{ // Ensures the work buffers fit an image of image_size; gradient buffers and filters are prepared only when apperture_size > 0.
+    if (apperture_size > 0)
+    {
+        ensureSizeIsEnough(image_size, CV_32SC1, dx);
+        ensureSizeIsEnough(image_size, CV_32SC1, dy);
+        // No filter objects for size 3: Canny() uses its calcMagnitude(srcWhole, ...) path for that size.
+        if (apperture_size != 3)
+        {
+            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
+            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+        }
+    }
+
+    ensureSizeIsEnough(image_size, CV_32FC1, mag);
+    ensureSizeIsEnough(image_size, CV_32SC1, map);
+    // st1/st2 are single rows with one CV_16UC2 element per image pixel.
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
+}
+
+void cv::gpu::CannyBuf::release()
+{ // Releases the six GpuMat work buffers; filterDX/filterDY (set in create()) are not touched here.
+    dx.release();
+    dy.release();
+    mag.release();
+    map.release();
+    st1.release();
+    st2.release();
+}
+
+namespace canny
+{ // Forward declarations only; the definitions are not visible in this section of the file.
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
+
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
+
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
+
+    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+}
+
+namespace
+{
+    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+    { // Shared tail of both Canny entry points; expects buf.mag to be filled by the caller beforehand.
+        using namespace canny;
+        // Stages run in this order: calcMap -> edgesHysteresisLocal -> edgesHysteresisGlobal -> getEdges.
+        buf.map.setTo(Scalar::all(0));
+        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
+
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
+
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
+
+        getEdges(buf.map, dst);
+    }
+}
+
+void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{ // Convenience overload: uses a function-local CannyBuf.
+    CannyBuf buf;
+    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
+}
+
+void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{ // Canny edge detection on a CV_8UC1 image; gradients are computed here into buf.dx / buf.dy and dst is created as CV_8U.
+    using namespace canny;
+
+    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert(apperture_size > 0); // buf.create() prepares dx/dy and the filters only for positive sizes; reject others instead of using unset filters below
+    if (!deviceSupports(SHARED_ATOMICS))
+        CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics");
+
+    if( low_thresh > high_thresh )
+        std::swap( low_thresh, high_thresh);
+
+    dst.create(src.size(), CV_8U);
+    buf.create(src.size(), apperture_size);
+
+    if (apperture_size == 3)
+    { // size 3: calcMagnitude gets the whole parent image plus the ROI offset and fills dx, dy and mag itself
+        Size wholeSize;
+        Point ofs;
+        src.locateROI(wholeSize, ofs);
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
+
+        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
+    }
+    else
+    { // other sizes: use the derivative filters prepared by buf.create(), then compute the magnitude from dx/dy
+        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
+        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+
+        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
+    }
+
+    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+}
+
+void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{ // Convenience overload for precomputed gradients: uses a function-local CannyBuf.
+    CannyBuf buf;
+    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
+}
+
+void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{ // Canny edge detection from caller-supplied CV_32SC1 gradients of equal size; dst is created as CV_8U.
+    using namespace canny;
+
+    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
+    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
+    // Thresholds may be given in either order.
+    if( low_thresh > high_thresh )
+        std::swap( low_thresh, high_thresh);
+
+    dst.create(dx.size(), CV_8U);
+    buf.create(dx.size(), -1); // -1: gradients come from the caller, so create() skips buf.dx/buf.dy and the filters
+
+    calcMagnitude(dx, dy, buf.mag, L2gradient);
+
+    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+}
+
+////////////////////////////////////////////////////////////////////////
+// CLAHE
+
+namespace clahe
+{ // Forward declarations only; the definitions are not visible in this section of the file.
+    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
+    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
+}
+
+namespace
+{
+    class CLAHE_Impl : public cv::gpu::CLAHE
+    { // GPU CLAHE implementation: holds the clip limit and tile grid, and keeps srcExt_ / lut_ as members between apply() calls.
+    public:
+        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+        cv::AlgorithmInfo* info() const;
+
+        void apply(cv::InputArray src, cv::OutputArray dst);
+        void apply(InputArray src, OutputArray dst, Stream& stream);
+
+        void setClipLimit(double clipLimit);
+        double getClipLimit() const;
+
+        void setTilesGridSize(cv::Size tileGridSize);
+        cv::Size getTilesGridSize() const;
+
+        void collectGarbage();
+
+    private:
+        double clipLimit_;
+        int tilesX_; // number of tiles across the image width
+        int tilesY_; // number of tiles across the image height
+
+        GpuMat srcExt_; // padded copy of the source; written only when the image size is not a multiple of the grid
+        GpuMat lut_; // CV_8UC1, tilesX_ * tilesY_ rows of 256 entries
+    };
+
+    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
+        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+    {
+    }
+
+    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_GPU",
+        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
+        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
+        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
+
+    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
+    { // Same as the stream overload, using Stream::Null().
+        apply(_src, _dst, Stream::Null());
+    }
+
+    void CLAHE_Impl::apply(InputArray _src, OutputArray _dst, Stream& s)
+    { // Applies CLAHE to a CV_8UC1 image: clahe::calcLut builds the per-tile LUTs, clahe::transform writes dst.
+        GpuMat src = _src.getGpuMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+
+        _dst.create( src.size(), src.type() );
+        GpuMat dst = _dst.getGpuMat();
+
+        const int histSize = 256;
+
+        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
+
+        cudaStream_t stream = StreamAccessor::getStream(s);
+
+        cv::Size tileSize;
+        GpuMat srcForLut;
+
+        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        { // the image divides evenly into the grid: build the LUTs from src directly
+            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+            srcForLut = src;
+        }
+        else
+        { // otherwise pad the bottom and right edges (BORDER_REFLECT_101) so both dimensions become multiples of the grid
+            cv::gpu::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar(), s);
+
+            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+            srcForLut = srcExt_;
+        }
+
+        const int tileSizeTotal = tileSize.area();
+        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+        // clipLimit_ is relative; convert it to an absolute per-bin count (at least 1). It stays 0 when clipLimit_ <= 0.
+        int clipLimit = 0;
+        if (clipLimit_ > 0.0)
+        {
+            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+            clipLimit = std::max(clipLimit, 1);
+        }
+
+        clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
+        // The transform reads the original src (not the padded copy) and writes dst of the same size.
+        clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
+    }
+
+    void CLAHE_Impl::setClipLimit(double clipLimit)
+    {
+        clipLimit_ = clipLimit;
+    }
+
+    double CLAHE_Impl::getClipLimit() const
+    {
+        return clipLimit_;
+    }
+
+    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
+    { // NOTE(review): values are stored unchecked; apply() divides by tilesX_ / tilesY_ - confirm callers never pass 0.
+        tilesX_ = tileGridSize.width;
+        tilesY_ = tileGridSize.height;
+    }
+
+    cv::Size CLAHE_Impl::getTilesGridSize() const
+    {
+        return cv::Size(tilesX_, tilesY_);
+    }
+
+    void CLAHE_Impl::collectGarbage()
+    { // Releases the two member buffers.
+        srcExt_.release();
+        lut_.release();
+    }
+}
+
+cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double clipLimit, cv::Size tileGridSize)
+{ // Factory: tileGridSize.width / .height become the tilesX / tilesY constructor arguments of CLAHE_Impl.
+    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+}
+
+////////////////////////////////////////////////////////////////////////
+// alphaComp
+
+namespace
+{
+    template <int DEPTH> struct NppAlphaCompFunc
+    { // Signature of the NPP alpha-composition functions for the element type that corresponds to DEPTH.
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
+    };
+
+    template <int DEPTH, typename NppAlphaCompFunc<DEPTH>::func_t func> struct NppAlphaComp
+    { // Binds one NPP function `func` to a uniform call() entry usable from a dispatch table.
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        static void call(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
+
+            NppiSize oSizeROI;
+            oSizeROI.width = img1.cols;
+            oSizeROI.height = img1.rows; // take both ROI dimensions from img1 (alphaComp asserts img1.size() == img2.size())
+
+            nppSafeCall( func(img1.ptr<npp_t>(), static_cast<int>(img1.step), img2.ptr<npp_t>(), static_cast<int>(img2.step),
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, eAlphaOp) );
+            // On the default stream the call blocks until the device has finished.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream)
+{ // Composites two 4-channel images of equal size and type with the NPP operation selected by alpha_op; dst is (re)created to match img1.
+    static const NppiAlphaOp npp_alpha_ops[] = {
+        NPPI_OP_ALPHA_OVER,
+        NPPI_OP_ALPHA_IN,
+        NPPI_OP_ALPHA_OUT,
+        NPPI_OP_ALPHA_ATOP,
+        NPPI_OP_ALPHA_XOR,
+        NPPI_OP_ALPHA_PLUS,
+        NPPI_OP_ALPHA_OVER_PREMUL,
+        NPPI_OP_ALPHA_IN_PREMUL,
+        NPPI_OP_ALPHA_OUT_PREMUL,
+        NPPI_OP_ALPHA_ATOP_PREMUL,
+        NPPI_OP_ALPHA_XOR_PREMUL,
+        NPPI_OP_ALPHA_PLUS_PREMUL,
+        NPPI_OP_ALPHA_PREMUL
+    };
+
+    typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream);
+    // Dispatch table indexed by img1.depth(); the 0 entries sit at depths that the type assertion below rejects.
+    static const func_t funcs[] =
+    {
+        NppAlphaComp<CV_8U, nppiAlphaComp_8u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
+        NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
+    };
+
+    CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
+    CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
+    CV_Assert( alpha_op >= 0 && alpha_op < static_cast<int>(sizeof(npp_alpha_ops) / sizeof(npp_alpha_ops[0])) ); // alpha_op indexes npp_alpha_ops below
+    dst.create(img1.size(), img1.type());
+
+    const func_t func = funcs[img1.depth()];
+
+    func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
+}
+
+#endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
new file mode 100644
index 0000000000..d78828bf17
--- /dev/null
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -0,0 +1,439 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); } // stub compiled when HAVE_CUDA is undefined or CUDA_DISABLER is defined
+
+#else
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace match_template
+    { // Forward declarations only; the definitions are not visible in this section of the file.
+        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+
+        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
+
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
+            int cn, cudaStream_t stream);
+
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
+            int cn, cudaStream_t stream);
+        // CCOFF variants: one overload per channel count, each taking per-channel image sums and template sums.
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_8UC2(
+            int w, int h,
+            const PtrStepSz<unsigned int> image_sum_r,
+            const PtrStepSz<unsigned int> image_sum_g,
+            unsigned int templ_sum_r,
+            unsigned int templ_sum_g,
+            PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_8UC3(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_8UC4(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r,
+                const PtrStepSz<unsigned int> image_sum_g,
+                const PtrStepSz<unsigned int> image_sum_b,
+                const PtrStepSz<unsigned int> image_sum_a,
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                unsigned int templ_sum_a,
+                PtrStepSzf result, cudaStream_t stream);
+
+        // CCOFF_NORMED variants: as above, additionally taking squared sums for the image and the template.
+        void matchTemplatePrepared_CCOFF_NORMED_8U(
+                int w, int h, const PtrStepSz<unsigned int> image_sum,
+                const PtrStepSz<unsigned long long> image_sqsum,
+                unsigned int templ_sum, unsigned long long templ_sqsum,
+                PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
+                int w, int h,
+                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
+                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
+                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
+                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
+                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
+                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
+                PtrStepSzf result, cudaStream_t stream);
+
+        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
+                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
+
+        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
+    }
+}}}
+
+using namespace ::cv::gpu::cudev::match_template;
+
+namespace
+{
+
+    // Evaluates optimal template's area threshold. If
+    // template's area is less than the threshold, we use naive match
+    // template version, otherwise FFT-based (if available)
+    int getTemplateThreshold(int method, int depth)
+    {
+        switch (method)
+        {
+        case cv::TM_CCORR:
+            if (depth == CV_32F) return 250;
+            if (depth == CV_8U) return 300;
+            break;
+        case cv::TM_SQDIFF:
+            if (depth == CV_8U) return 300;
+            break;
+        }
+        CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode"); // reached for every method/depth pair not handled above
+        return 0; // presumably only present to satisfy compilers; not reached if CV_Error throws
+    }
+
+
+    void matchTemplate_CCORR_32F(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_CCORR for float input: matchTemplateNaive_CCORR_32F for templates below the area threshold, convolve() otherwise.
+        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
+        {
+            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            return;
+        }
+
+        ConvolveBuf convolve_buf;
+        convolve_buf.user_block_size = buf.user_block_size;
+
+        if (image.channels() == 1)
+            convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
+        else
+        { // multi-channel: convolve the single-channel reshapes into a temporary, then extractFirstChannel_32F fills result
+            GpuMat result_;
+            convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
+            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
+        }
+    }
+
+
+    void matchTemplate_CCORR_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_CCORR for 8-bit input: naive version for small templates, otherwise convert to CV_32F and reuse matchTemplate_CCORR_32F.
+        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
+        {
+            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            return;
+        }
+        // With a non-null stream the conversions are enqueued on it; otherwise convertTo is called directly.
+        if (stream)
+        {
+            stream.enqueueConvert(image, buf.imagef, CV_32F);
+            stream.enqueueConvert(templ, buf.templf, CV_32F);
+        }
+        else
+        {
+            image.convertTo(buf.imagef, CV_32F);
+            templ.convertTo(buf.templf, CV_32F);
+        }
+        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+    }
+
+
+    void matchTemplate_CCORR_NORMED_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_CCORR_NORMED for 8-bit input: plain cross-correlation first, then normalize_8U updates result in place.
+        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+        // Inputs for normalize_8U: squared-sum integral of the image (all channels as one plane) and the template's squared sum.
+        buf.image_sqsums.resize(1);
+        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+    }
+
+
+    void matchTemplate_SQDIFF_32F(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_SQDIFF for float input: always uses matchTemplateNaive_SQDIFF_32F.
+        (void)buf; // unused; the parameter is kept so the signature matches the other matchTemplate_* helpers
+        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+    }
+
+
+    void matchTemplate_SQDIFF_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_SQDIFF for 8-bit input: naive version for small templates, otherwise cross-correlation followed by matchTemplatePrepared_SQDIFF_8U.
+        if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
+        {
+            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            return;
+        }
+
+        buf.image_sqsums.resize(1);
+        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+        // result first receives the cross-correlation; the prepared function then updates it using the squared sums.
+        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+    }
+
+
+    void matchTemplate_SQDIFF_NORMED_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_SQDIFF_NORMED for 8-bit input: cross-correlation followed by matchTemplatePrepared_SQDIFF_NORMED_8U (no naive shortcut here).
+        buf.image_sqsums.resize(1);
+        sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+        // result first receives the cross-correlation; the prepared function then updates it using the squared sums.
+        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+    }
+
+
+    void matchTemplate_CCOFF_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // TM_CCOEFF for 8-bit input: cross-correlation first, then a per-channel-count matchTemplatePrepared_CCOFF_* call updates result.
+        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+
+        if (image.channels() == 1)
+        {
+            buf.image_sums.resize(1);
+            integral(image, buf.image_sums[0], stream);
+
+            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
+        }
+        else
+        { // multi-channel: one integral image per channel plane
+            split(image, buf.images); // NOTE(review): called without `stream`, unlike the integral() calls below - confirm this is intended.
+            buf.image_sums.resize(buf.images.size());
+            for (int i = 0; i < image.channels(); ++i)
+                integral(buf.images[i], buf.image_sums[i], stream);
+
+            Scalar templ_sum = sum(templ);
+
+            switch (image.channels())
+            {
+            case 2:
+                matchTemplatePrepared_CCOFF_8UC2(
+                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
+                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
+                        result, StreamAccessor::getStream(stream));
+                break;
+            case 3:
+                matchTemplatePrepared_CCOFF_8UC3(
+                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
+                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        result, StreamAccessor::getStream(stream));
+                break;
+            case 4:
+                matchTemplatePrepared_CCOFF_8UC4(
+                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
+                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
+                break;
+            default:
+                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+            }
+        }
+    }
+
+
+    void matchTemplate_CCOFF_NORMED_8U(
+            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    { // CCOFF_NORMED path for 8-bit input: float cross-correlation into result, then the matchTemplatePrepared_* step.
+        if (stream)
+        {
+            stream.enqueueConvert(image, buf.imagef, CV_32F);
+            stream.enqueueConvert(templ, buf.templf, CV_32F);
+        }
+        else
+        {
+            image.convertTo(buf.imagef, CV_32F);
+            templ.convertTo(buf.templf, CV_32F);
+        }
+        // The cross-correlation of the CV_32F copies is written to result first.
+        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        // Single-channel input uses the scalar variant; otherwise the image is split and handled per channel.
+        if (image.channels() == 1)
+        {
+            buf.image_sums.resize(1);
+            integral(image, buf.image_sums[0], stream);
+            buf.image_sqsums.resize(1);
+            sqrIntegral(image, buf.image_sqsums[0], stream);
+            // Sum and squared sum of the template, truncated to integer types for the prepared call below.
+            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ)[0];
+
+            matchTemplatePrepared_CCOFF_NORMED_8U(
+                    templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
+                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
+        }
+        else
+        {
+            split(image, buf.images);
+            buf.image_sums.resize(buf.images.size());
+            buf.image_sqsums.resize(buf.images.size());
+            for (int i = 0; i < image.channels(); ++i)
+            {
+                integral(buf.images[i], buf.image_sums[i], stream);
+                sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
+            }
+            // Per-channel template sums; element k of each Scalar belongs to channel k.
+            Scalar templ_sum = sum(templ);
+            Scalar templ_sqsum = sqrSum(templ);
+            // Arguments are interleaved per channel: (sum, sqsum) image pairs first, then (sum, sqsum) template pairs.
+            switch (image.channels())
+            {
+            case 2:
+                matchTemplatePrepared_CCOFF_NORMED_8UC2(
+                        templ.cols, templ.rows,
+                        buf.image_sums[0], buf.image_sqsums[0],
+                        buf.image_sums[1], buf.image_sqsums[1],
+                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
+                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
+                        result, StreamAccessor::getStream(stream));
+                break;
+            case 3:
+                matchTemplatePrepared_CCOFF_NORMED_8UC3(
+                        templ.cols, templ.rows,
+                        buf.image_sums[0], buf.image_sqsums[0],
+                        buf.image_sums[1], buf.image_sqsums[1],
+                        buf.image_sums[2], buf.image_sqsums[2],
+                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
+                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
+                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
+                        result, StreamAccessor::getStream(stream));
+                break;
+            case 4:
+                matchTemplatePrepared_CCOFF_NORMED_8UC4(
+                        templ.cols, templ.rows,
+                        buf.image_sums[0], buf.image_sqsums[0],
+                        buf.image_sums[1], buf.image_sqsums[1],
+                        buf.image_sums[2], buf.image_sqsums[2],
+                        buf.image_sums[3], buf.image_sqsums[3],
+                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
+                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
+                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
+                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
+                        result, StreamAccessor::getStream(stream));
+                break;
+            default: // any channel count other than 1..4 is rejected
+                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+            }
+        }
+    }
+}
+
+
+void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
+{ // Convenience overload: forwards to the buffer-taking overload with a scratch buffer local to this call.
+    MatchTemplateBuf buf; // destroyed when this function returns
+    matchTemplate(image, templ, result, method, buf, stream);
+}
+
+
+void cv::gpu::matchTemplate(
+        const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
+        MatchTemplateBuf &buf, Stream& stream)
+{
+    // Validates the inputs, then dispatches to the implementation selected by image depth and method.
+    CV_Assert(image.type() == templ.type());
+    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
+    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
+    // Tables are indexed by method (0..5); a null entry means the method is not implemented for that depth.
+    static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
+                                        ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
+                                        ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
+    static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
+                                         ::matchTemplate_CCORR_32F, 0, 0, 0 };
+
+    CV_Assert(method >= 0 && method < (int)(sizeof(callers8U) / sizeof(callers8U[0]))); // guards the lookup below
+    const Caller* callers = 0;
+    switch (image.depth())
+    {
+        case CV_8U: callers = callers8U; break;
+        case CV_32F: callers = callers32F; break;
+        default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
+    }
+    Caller caller = callers[method];
+    CV_Assert(caller); // rejects methods that have no implementation for this depth
+    caller(image, templ, result, buf, stream);
+}
+
+#endif
diff --git a/modules/gpuimgproc/src/mssegmentation.cpp b/modules/gpuimgproc/src/mssegmentation.cpp
new file mode 100644
index 0000000000..7f02168e1a
--- /dev/null
+++ b/modules/gpuimgproc/src/mssegmentation.cpp
@@ -0,0 +1,387 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::gpu::meanShiftSegmentation(const GpuMat&, Mat&, int, int, int, TermCriteria) { throw_no_cuda(); } // stub compiled when HAVE_CUDA is undefined or CUDA_DISABLER is defined
+
+#else
+
+// Auxiliary stuff
+namespace
+{
+
+//
+// Declarations
+//
+
+class DjSets // disjoint-set forest over the integers 0..n-1
+{
+public:
+    DjSets(int n); // starts with n singleton sets
+    int find(int elem); // returns the root of elem's set and re-parents the visited elements to that root
+    int merge(int set1, int set2); // links two sets by rank and returns the surviving root; callers here pass find() results
+
+    std::vector<int> parent; // parent[i] == i marks a root
+    std::vector<int> rank; // compared by merge() to choose the surviving root
+    std::vector<int> size; // element count of a set, updated by merge() on the surviving root
+private:
+    DjSets(const DjSets&); // private with no definition visible in this file, so the class is not copyable
+    void operator =(const DjSets&);
+};
+
+
+template <typename T>
+struct GraphEdge // one directed edge stored in Graph<T>::edges
+{
+    GraphEdge() {} // leaves all members uninitialized
+    GraphEdge(int to_, int next_, const T& val_) : to(to_), next(next_), val(val_) {}
+    int to; // target vertex index
+    int next; // index of the next edge of the same source vertex, or -1 at the end of the list
+    T val; // payload attached to the edge
+};
+
+
+template <typename T>
+class Graph // directed graph with per-vertex edge lists chained through indices into one preallocated edge array
+{
+public:
+    typedef GraphEdge<T> Edge;
+
+    Graph(int numv, int nume_max); // numv vertices without edges; room for nume_max edges
+    // Inserts the edge at the head of from's list; no capacity check is performed.
+    void addEdge(int from, int to, const T& val=T());
+
+    std::vector<int> start; // start[v] is the index of v's most recently added edge, or -1 if v has none
+    std::vector<Edge> edges; // storage sized to nume_max; only the first nume entries are in use
+
+    int numv; // number of vertices
+    int nume_max; // capacity of edges
+    int nume; // number of edges added so far
+private:
+    Graph(const Graph&); // private with no definition visible in this file, so the class is not copyable
+    void operator =(const Graph&);
+};
+
+
+struct SegmLinkVal // edge weight: a pair of distances between two neighbouring pixels
+{
+    SegmLinkVal() {} // leaves dr and dsp uninitialized
+    SegmLinkVal(int dr_, int dsp_) : dr(dr_), dsp(dsp_) {}
+    bool operator <(const SegmLinkVal& other) const
+    { // orders links by the sum of both distances
+        return dr + dsp < other.dr + other.dsp;
+    }
+    int dr; // filled by the caller with dist2() of the two pixels' rmap values
+    int dsp; // filled by the caller with dist2() of the two pixels' spmap values
+};
+
+
+struct SegmLink // a weighted link between two components, sortable by weight
+{
+    SegmLink() {} // leaves all members uninitialized
+    SegmLink(int from_, int to_, const SegmLinkVal& val_)
+        : from(from_), to(to_), val(val_) {}
+    bool operator <(const SegmLink& other) const
+    { // the endpoints do not take part in the ordering
+        return val < other.val;
+    }
+    int from; // component id on one side of the link
+    int to; // component id on the other side
+    SegmLinkVal val; // weight used for ordering
+};
+
+//
+// Implementation
+//
+
+DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
+{ // every element starts as the root of its own one-element set, with rank 0
+    for (int i = 0; i < n; ++i)
+        parent[i] = i;
+}
+
+
+inline int DjSets::find(int elem)
+{ // returns the root of elem's set; no bounds check on elem
+    int set = elem;
+    while (set != parent[set]) // first pass: walk up to the root
+        set = parent[set];
+    while (elem != parent[elem]) // second pass: point every visited element directly at the root
+    {
+        int next = parent[elem];
+        parent[elem] = set;
+        elem = next;
+    }
+    return set;
+}
+
+
+inline int DjSets::merge(int set1, int set2)
+{ // links the lower-rank set under the higher-rank one and returns the surviving root
+    if (rank[set1] < rank[set2])
+    {
+        parent[set1] = set2;
+        size[set2] += size[set1];
+        return set2;
+    }
+    if (rank[set2] < rank[set1])
+    {
+        parent[set2] = set1;
+        size[set1] += size[set2];
+        return set1;
+    }
+    parent[set1] = set2; // equal ranks: set2 becomes the root and its rank grows by one
+    rank[set2]++;
+    size[set2] += size[set1];
+    return set2;
+}
+
+
+template <typename T>
+Graph<T>::Graph(int numv_, int nume_max_) : start(numv_, -1), edges(nume_max_)
+{ // every vertex starts with an empty edge list (-1); edge storage is allocated up front
+    this->numv = numv_;
+    this->nume_max = nume_max_;
+    nume = 0; // no edges inserted yet
+}
+
+
+template <typename T>
+inline void Graph<T>::addEdge(int from, int to, const T& val)
+{ // pushes the edge at the head of from's list; nume is not checked against nume_max
+    edges[nume] = Edge(to, start[from], val); // the new edge links to the previous head
+    start[from] = nume;
+    nume++;
+}
+
+
+inline int pix(int y, int x, int ncols)
+{ // row-major linear index of pixel (y, x) in an image with ncols columns
+    return y * ncols + x;
+}
+
+
+inline int sqr(int x)
+{ // x squared, computed in int
+    return x * x;
+}
+
+
+inline int dist2(const cv::Vec4b& lhs, const cv::Vec4b& rhs)
+{ // squared Euclidean distance over elements 0..2 only; element 3 is ignored
+    return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
+}
+
+
+inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
+{ // squared Euclidean distance between two 2-element short vectors
+    return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
+}
+
+} // anonymous namespace
+
+
+void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
+{ // Segments a CV_8UC4 image: mean shift maps -> pixel graph -> merged components -> per-segment average colour in dst.
+    CV_Assert(src.type() == CV_8UC4);
+    const int nrows = src.rows;
+    const int ncols = src.cols;
+    const int hr = sr; // threshold applied to the edge's dr value below
+    const int hsp = sp; // threshold applied to the edge's dsp value below
+
+    // Perform mean shift procedure and obtain region and spatial maps
+    GpuMat d_rmap, d_spmap;
+    meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
+    Mat rmap(d_rmap); // Mat copies of the maps, read by the CPU loops below
+    Mat spmap(d_spmap);
+    // Capacity: 4 edges per 2x2 pixel cell, plus the vertical edges of the last column and the horizontal edges of the last row.
+    Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+                                        + (nrows - 1) + (ncols - 1));
+
+    // Make region adjacent graph from image
+    Vec4b r1;
+    Vec4b r2[4];
+    Vec2s sp1;
+    Vec2s sp2[4];
+    int dr[4];
+    int dsp[4];
+    for (int y = 0; y < nrows - 1; ++y)
+    {
+        Vec4b* ry = rmap.ptr<Vec4b>(y);
+        Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
+        Vec2s* spy = spmap.ptr<Vec2s>(y);
+        Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
+        for (int x = 0; x < ncols - 1; ++x)
+        {
+            r1 = ry[x];
+            sp1 = spy[x];
+            // Neighbours of (y, x): right, below, below-right. Index 3 is (y + 1, x) again, paired with (y, x + 1) further down.
+            r2[0] = ry[x + 1];
+            r2[1] = ryp[x];
+            r2[2] = ryp[x + 1];
+            r2[3] = ryp[x];
+
+            sp2[0] = spy[x + 1];
+            sp2[1] = spyp[x];
+            sp2[2] = spyp[x + 1];
+            sp2[3] = spyp[x];
+
+            dr[0] = dist2(r1, r2[0]);
+            dr[1] = dist2(r1, r2[1]);
+            dr[2] = dist2(r1, r2[2]);
+            dsp[0] = dist2(sp1, sp2[0]);
+            dsp[1] = dist2(sp1, sp2[1]);
+            dsp[2] = dist2(sp1, sp2[2]);
+            // Switch the reference pixel to (y, x + 1) for the anti-diagonal edge to (y + 1, x).
+            r1 = ry[x + 1];
+            sp1 = spy[x + 1];
+
+            dr[3] = dist2(r1, r2[3]);
+            dsp[3] = dist2(sp1, sp2[3]);
+
+            g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
+            g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
+            g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
+            g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
+        }
+    }
+    for (int y = 0; y < nrows - 1; ++y) // vertical edges along the last column, which the loop above does not visit
+    {
+        r1 = rmap.at<Vec4b>(y, ncols - 1);
+        r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
+        sp1 = spmap.at<Vec2s>(y, ncols - 1);
+        sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
+        dr[0] = dist2(r1, r2[0]);
+        dsp[0] = dist2(sp1, sp2[0]);
+        g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
+    }
+    for (int x = 0; x < ncols - 1; ++x) // horizontal edges along the last row
+    {
+        r1 = rmap.at<Vec4b>(nrows - 1, x);
+        r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
+        sp1 = spmap.at<Vec2s>(nrows - 1, x);
+        sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
+        dr[0] = dist2(r1, r2[0]);
+        dsp[0] = dist2(sp1, sp2[0]);
+        g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
+    }
+
+    DjSets comps(g.numv); // one set per pixel to begin with
+
+    // Find adjacent components
+    for (int v = 0; v < g.numv; ++v)
+    {
+        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
+        {
+            int c1 = comps.find(v);
+            int c2 = comps.find(g.edges[e_it].to);
+            if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp) // NOTE(review): dr/dsp are squared distances compared with unsquared sr/sp - confirm intended
+                comps.merge(c1, c2);
+        }
+    }
+
+    std::vector<SegmLink> edges;
+    edges.reserve(g.numv);
+
+    // Prepare edges connecting different components
+    for (int v = 0; v < g.numv; ++v)
+    {
+        int c1 = comps.find(v);
+        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
+        {
+            int c2 = comps.find(g.edges[e_it].to);
+            if (c1 != c2)
+                edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
+        }
+    }
+
+    // Sort all graph's edges connecting different components (in ascending order)
+    sort(edges.begin(), edges.end());
+
+    // Exclude small components (starting from the nearest couple)
+    for (size_t i = 0; i < edges.size(); ++i)
+    {
+        int c1 = comps.find(edges[i].from); // roots are looked up again because earlier merges may have changed them
+        int c2 = comps.find(edges[i].to);
+        if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
+            comps.merge(c1, c2);
+    }
+
+    // Compute sum of the pixel's colors which are in the same segment
+    Mat h_src(src);
+    std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0)); // indexed by component root; only root slots receive sums
+    for (int y = 0; y < nrows; ++y)
+    {
+        Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
+        for (int x = 0; x < ncols; ++x)
+        {
+            int parent = comps.find(pix(y, x, ncols));
+            Vec4b col = h_srcy[x];
+            Vec4i& sumcol = sumcols[parent];
+            sumcol[0] += col[0]; // channels 0..2 only; channel 3 is never accumulated
+            sumcol[1] += col[1];
+            sumcol[2] += col[2];
+        }
+    }
+
+    // Create final image, color of each segment is the average color of its pixels
+    dst.create(src.size(), src.type());
+
+    for (int y = 0; y < nrows; ++y)
+    {
+        Vec4b* dsty = dst.ptr<Vec4b>(y);
+        for (int x = 0; x < ncols; ++x)
+        {
+            int parent = comps.find(pix(y, x, ncols));
+            const Vec4i& sumcol = sumcols[parent];
+            Vec4b& dstcol = dsty[x];
+            dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]); // integer division truncates the average
+            dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
+            dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
+            dstcol[3] = 255; // fourth channel is always written as 255
+        }
+    }
+}
+
+#endif // #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
diff --git a/modules/gpuimgproc/src/precomp.cpp b/modules/gpuimgproc/src/precomp.cpp
new file mode 100644
index 0000000000..3c01a2596d
--- /dev/null
+++ b/modules/gpuimgproc/src/precomp.cpp
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
diff --git a/modules/gpuimgproc/src/precomp.hpp b/modules/gpuimgproc/src/precomp.hpp
new file mode 100644
index 0000000000..7df02aadd9
--- /dev/null
+++ b/modules/gpuimgproc/src/precomp.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
+#include "opencv2/gpuimgproc.hpp"
+
+#include "opencv2/core/private.hpp"
+#include "opencv2/core/gpu_private.hpp"
+
+#endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpuimgproc/src/pyramids.cpp b/modules/gpuimgproc/src/pyramids.cpp
new file mode 100644
index 0000000000..9e9fbe3437
--- /dev/null
+++ b/modules/gpuimgproc/src/pyramids.cpp
@@ -0,0 +1,249 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); } // stubs for builds without HAVE_CUDA or with CUDA_DISABLER
+void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); } // each one only calls throw_no_cuda()
+void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_no_cuda(); }
+
+#else // HAVE_CUDA
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrDown
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    { // declaration only; no definition appears in this file (presumably provided by the module's CUDA sources - TODO confirm)
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+    // Runs the pyrDown_gpu kernel matching src's type on the given stream; dst is (re)allocated at half size, rounded up.
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    // Kernel table indexed by [depth][channels - 1]; 0 marks a type combination that has no kernel.
+    static const func_t funcs[6][4] =
+    {
+        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
+        {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
+        {pyrDown_gpu<ushort>     , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3>     , pyrDown_gpu<ushort4>     },
+        {pyrDown_gpu<short>      , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3>      , pyrDown_gpu<short4>      },
+        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
+        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
+    };
+    // Keeps both table indices in range: depth up to CV_32F (6 rows) and at most 4 channels (4 columns).
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0); // unsupported depth/channel combination
+
+    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
+
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrUp
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    { // declaration only; no definition appears in this file (presumably provided by the module's CUDA sources - TODO confirm)
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+    // Runs the pyrUp_gpu kernel matching src's type on the given stream; dst is (re)allocated at twice src's size.
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    // Kernel table indexed by [depth][channels - 1]; 0 marks a type combination that has no kernel.
+    static const func_t funcs[6][4] =
+    {
+        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
+        {0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
+        {pyrUp_gpu<ushort>     , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3>     , pyrUp_gpu<ushort4>     },
+        {pyrUp_gpu<short>      , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3>      , pyrUp_gpu<short4>      },
+        {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
+        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
+    };
+    // Keeps both table indices in range: depth up to CV_32F (6 rows) and at most 4 channels (4 columns).
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0); // unsupported depth/channel combination
+
+    dst.create(src.rows * 2, src.cols * 2, src.type());
+
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// ImagePyramid
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace pyramid
+    { // declarations only; no definitions appear in this file (presumably provided by the module's CUDA sources - TODO confirm)
+        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
+{
+    using namespace cv::gpu::cudev::pyramid;
+    // Stores img as layer0_ and fills pyramid_ with successively halved layers produced by kernelDownsampleX2_gpu.
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    // Kernel table indexed by [depth][channels - 1]; 0 marks a type combination that has no kernel.
+    static const func_t funcs[6][4] =
+    {
+        {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
+        {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
+        {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
+        {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
+        {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
+        {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
+    };
+    // Keeps both table indices in range: depth up to CV_32F (6 rows) and at most 4 channels (4 columns).
+    CV_Assert(img.depth() <= CV_32F && img.channels() <= 4);
+
+    const func_t func = funcs[img.depth()][img.channels() - 1];
+    CV_Assert(func != 0); // unsupported depth/channel combination
+
+    layer0_ = img;
+    Size szLastLayer = img.size();
+    nLayers_ = 1; // counts layer0_ plus every pyramid_ entry filled below
+
+    if (numLayers <= 0)
+        numLayers = 255; //it will cut-off when any of the dimensions goes 1
+
+    pyramid_.resize(numLayers);
+    // pyramid_[i] holds level i + 1; at most numLayers - 1 entries are filled.
+    for (int i = 0; i < numLayers - 1; ++i)
+    {
+        Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
+
+        if (szCurLayer.width == 0 || szCurLayer.height == 0)
+            break;
+
+        ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
+        nLayers_++;
+
+        const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
+
+        func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
+
+        szLastLayer = szCurLayer;
+    }
+}
+
+void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
+{
+    // Fills outImg with an image of size outRoi: a plain copy when outRoi equals layer0_ or a stored pyramid_ level,
+    // otherwise the output of kernelInterpolateFrom1_gpu run on the last level still larger than outRoi in some dimension.
+    using namespace cv::gpu::cudev::pyramid;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+    // Kernel table indexed by [depth][channels - 1]; 0 marks a type combination that has no kernel.
+    static const func_t funcs[6][4] =
+    {
+        {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
+        {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
+        {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
+        {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
+        {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
+        {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
+    };
+
+    CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
+
+    ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
+
+    const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
+    CV_Assert(func != 0);
+
+    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
+    {
+        if (stream)
+            stream.enqueueCopy(layer0_, outImg);
+        else
+            layer0_.copyTo(outImg);
+        return; // the copy is the result; falling through would overwrite it with an interpolation
+    }
+
+    GpuMat lastLayer = layer0_;
+    for (int i = 0; i < nLayers_ - 1; ++i)
+    {
+        const GpuMat& curLayer = pyramid_[i];
+
+        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
+        {
+            if (stream)
+                stream.enqueueCopy(curLayer, outImg);
+            else
+                curLayer.copyTo(outImg);
+            return; // exact stored level: keep the copy instead of re-interpolating from the level above
+        }
+
+        // Stop at the first level that outRoi covers in both dimensions; lastLayer is then the level just above it.
+        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
+            break;
+
+        lastLayer = curLayer;
+    }
+
+    func(lastLayer, outImg, StreamAccessor::getStream(stream));
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/src/remap.cpp b/modules/gpuimgproc/src/remap.cpp
new file mode 100644
index 0000000000..315766546b
--- /dev/null
+++ b/modules/gpuimgproc/src/remap.cpp
@@ -0,0 +1,102 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+// No-CUDA build (HAVE_CUDA undefined or CUDA_DISABLER defined): remap only calls throw_no_cuda().
+void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, Scalar, Stream&){ throw_no_cuda(); }
+
+#else // HAVE_CUDA
+// Declaration only: remap_gpu<T> is not defined in this file (presumably it lives in the module's CUDA sources - confirm).
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        template <typename T>
+        void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
+                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    }
+}}}
+// Remaps src through the coordinate maps xmap/ymap into dst, which is created with xmap's size and src's type.
+void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
+        int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    // Launchers indexed as [depth][channels - 1]; a zero entry is rejected by the assertion further down.
+    static const func_t launchers[6][4] =
+    {
+        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
+        {0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/  , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
+        {remap_gpu<ushort>     , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3>    , remap_gpu<ushort4>    },
+        {remap_gpu<short>      , 0 /*remap_gpu<short2>*/ , remap_gpu<short3>     , remap_gpu<short4>     },
+        {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
+        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
+    };
+
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    const func_t func = launchers[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0);
+
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+    dst.create(xmap.size(), src.type());
+
+    Scalar_<float> borderColor; // the launcher takes the border value as floats
+    borderColor = borderValue;
+
+    // src may be a ROI: pass the parent buffer together with the ROI origin inside it.
+    Size parentSize;
+    Point roiOrigin;
+    src.locateROI(parentSize, roiOrigin);
+    const PtrStepSzb srcWhole(parentSize.height, parentSize.width, src.datastart, src.step);
+    func(src, srcWhole, roiOrigin.x, roiOrigin.y, xmap, ymap, dst, interpolation, gpuBorderType,
+        borderColor.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/src/resize.cpp b/modules/gpuimgproc/src/resize.cpp
new file mode 100644
index 0000000000..32afa54de9
--- /dev/null
+++ b/modules/gpuimgproc/src/resize.cpp
@@ -0,0 +1,162 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
+{
+    // Built without CUDA (or with CUDA_DISABLER): there is nothing to compute.
+    // The casts below only mark every parameter as intentionally unused
+    // before the call to throw_no_cuda().
+
+    (void)src; (void)dst;
+    (void)dsize; (void)fx; (void)fy;
+    (void)interpolation; (void)s;
+
+    throw_no_cuda();
+}
+
+#else // HAVE_CUDA
+// Declaration only: resize_gpu<T> is not defined in this file (presumably it lives in the module's CUDA sources - confirm).
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        template <typename T>
+        void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+                        PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    }
+}}}
+// Resizes src into dst. A non-empty dsize wins and fx/fy are recomputed from it; otherwise dsize is derived from fx/fy.
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
+{
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
+            || interpolation == INTER_CUBIC || interpolation == INTER_AREA);
+    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
+
+    if (dsize == Size())
+        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
+    else
+    {
+        fx = static_cast<double>(dsize.width) / src.cols;
+        fy = static_cast<double>(dsize.height) / src.rows;
+    }
+    // Unconditional on purpose: create() leaves a matching dst alone but reallocates one of the right size and wrong type.
+    dst.create(dsize, src.type());
+
+    if (dsize == src.size())
+    {
+        if (s)
+            s.enqueueCopy(src, dst);
+        else
+            src.copyTo(dst);
+        return;
+    }
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+    // NPP is used only for 8UC1/8UC4 with nearest or linear interpolation; every other case goes to resize_gpu.
+    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
+    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
+
+    if (useNpp)
+    {
+        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
+                                    double xFactor, double yFactor, int eInterpolation);
+
+        static const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
+
+        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
+
+        NppiSize srcsz;
+        srcsz.width  = wholeSize.width;
+        srcsz.height = wholeSize.height;
+
+        NppiRect srcrect;
+        srcrect.x = ofs.x;
+        srcrect.y = ofs.y;
+        srcrect.width  = src.cols;
+        srcrect.height = src.rows;
+
+        NppiSize dstsz;
+        dstsz.width  = dst.cols;
+        dstsz.height = dst.rows;
+
+        NppStreamHandler h(stream);
+
+        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
+        // On the null stream, wait for the device before returning.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    else
+    {
+        using namespace ::cv::gpu::cudev::imgproc;
+
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+
+        static const func_t funcs[6][4] =
+        {
+            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
+            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
+            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
+            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
+            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
+            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+        // The launcher is given the reciprocal scale factors (1/fx, 1/fy).
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
+            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
+    }
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/src/warp.cpp b/modules/gpuimgproc/src/warp.cpp
new file mode 100644
index 0000000000..007091e6a3
--- /dev/null
+++ b/modules/gpuimgproc/src/warp.cpp
@@ -0,0 +1,454 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+// No-CUDA build (HAVE_CUDA undefined or CUDA_DISABLER defined): each entry point only calls throw_no_cuda().
+void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
+void cv::gpu::buildWarpAffineMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+
+void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
+void cv::gpu::buildWarpPerspectiveMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+
+#else // HAVE_CUDA
+// Declarations only: these launchers are not defined in this file (presumably they live in the module's CUDA sources - confirm).
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
+
+        template <typename T>
+        void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
+
+        template <typename T>
+        void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    }
+}}}
+// Creates xmap/ymap as CV_32FC1 maps of size dsize and lets buildWarpAffineMaps_gpu fill them from the 2x3 matrix M.
+void cv::gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+
+    CV_Assert(M.rows == 2 && M.cols == 3);
+
+    xmap.create(dsize, CV_32FC1);
+    ymap.create(dsize, CV_32FC1);
+
+    // M is used unchanged when `inverse` is set; otherwise its inverse is what gets passed on.
+    cv::Mat effectiveM;
+    if (inverse)
+        effectiveM = M; // shares M's data, no copy
+    else
+        invertAffineTransform(M, effectiveM);
+
+    // coeffsView is only a header over the stack array, so convertTo() writes directly into coeffs.
+    float coeffs[2 * 3];
+    Mat coeffsView(2, 3, CV_32F, coeffs);
+    effectiveM.convertTo(coeffsView, CV_32F);
+
+    buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+// Creates xmap/ymap as CV_32FC1 maps of size dsize and lets buildWarpPerspectiveMaps_gpu fill them from the 3x3 matrix M.
+void cv::gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
+{
+    using namespace cv::gpu::cudev::imgproc;
+
+    CV_Assert(M.rows == 3 && M.cols == 3);
+
+    xmap.create(dsize, CV_32FC1);
+    ymap.create(dsize, CV_32FC1);
+
+    // M is used unchanged when `inverse` is set; otherwise its inverse is what gets passed on.
+    cv::Mat effectiveM;
+    if (inverse)
+        effectiveM = M; // shares M's data, no copy
+    else
+        invert(M, effectiveM);
+
+    // coeffsView is only a header over the stack array, so convertTo() writes directly into coeffs.
+    float coeffs[3 * 3];
+    Mat coeffsView(3, 3, CV_32F, coeffs);
+    effectiveM.convertTo(coeffsView, CV_32F);
+
+    buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+// File-local helpers that adapt NPP warp entry points to the GpuMat-based callers below.
+namespace
+{
+    template<int DEPTH> struct NppTypeTraits;
+    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
+    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
+    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
+    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
+    // Function-pointer signature expected from an NPP warp routine operating on the element type of DEPTH.
+    template <int DEPTH> struct NppWarpFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_t* pDst,
+                                    int dstStep, NppiRect dstRoi, const double coeffs[][3],
+                                    int interpolation);
+    };
+    // Binds one NPP routine as a template argument and exposes it through a uniform static call().
+    template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
+    {
+        typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
+
+        static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
+        {
+            static const int nppModes[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+            // The source image is described to NPP with exactly the extents of the GpuMat passed in.
+            NppiSize srcExtent;
+            srcExtent.width = src.cols;
+            srcExtent.height = src.rows;
+            // Source and destination regions both start at (0, 0) and cover the full matrices.
+            NppiRect srcRect;
+            srcRect.x = 0;
+            srcRect.y = 0;
+            srcRect.width = src.cols;
+            srcRect.height = src.rows;
+
+            NppiRect dstRect;
+            dstRect.x = 0;
+            dstRect.y = 0;
+            dstRect.width = dst.cols;
+            dstRect.height = dst.rows;
+
+            cv::gpu::NppStreamHandler streamGuard(stream);
+
+            nppSafeCall( func(src.ptr<npp_t>(), srcExtent, static_cast<int>(src.step), srcRect,
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), dstRect,
+                              coeffs, nppModes[interpolation]) );
+            // On the null stream, wait for the device before returning.
+            if (!stream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+// Warps src into a dsize-sized dst with the 2x3 matrix M; BORDER_CONSTANT, zero-offset, non-float inputs enabled in useNppTab go through NPP, the rest through warpAffine_gpu.
+void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
+{
+    CV_Assert(M.rows == 2 && M.cols == 3);
+
+    int interpolation = flags & INTER_MAX;
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    dst.create(dsize, src.type());
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+    // Indexed as useNppTab[depth][channels - 1][interpolation]: combinations that may be routed to NPP.
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
+            }
+        };
+
+        // This branch only runs for BORDER_CONSTANT: pre-fill dst with the border value, on the caller's stream so it stays ordered with the warp.
+        if (s)
+            s.enqueueMemSet(dst, borderValue);
+        else
+            dst.setTo(borderValue);
+
+        double coeffs[2][3];
+        Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
+    }
+    else
+    {
+        using namespace cv::gpu::cudev::imgproc;
+
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        static const func_t funcs[6][4] =
+        {
+            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
+            {0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/  , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
+            {warpAffine_gpu<ushort>     , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3>    , warpAffine_gpu<ushort4>    },
+            {warpAffine_gpu<short>      , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3>     , warpAffine_gpu<short4>     },
+            {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
+            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        int gpuBorderType;
+        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+        float coeffs[2 * 3];
+        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
+        // With WARP_INVERSE_MAP the matrix is passed as given; otherwise it is inverted first.
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invertAffineTransform(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
+    }
+}
+// Warps src into a dsize-sized dst with the 3x3 matrix M; BORDER_CONSTANT, zero-offset, non-float inputs enabled in useNppTab go through NPP, the rest through warpPerspective_gpu.
+void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
+{
+    CV_Assert(M.rows == 3 && M.cols == 3);
+
+    int interpolation = flags & INTER_MAX;
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    dst.create(dsize, src.type());
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+    // Indexed as useNppTab[depth][channels - 1][interpolation]: combinations that may be routed to NPP.
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
+            }
+        };
+
+        // This branch only runs for BORDER_CONSTANT: pre-fill dst with the border value, on the caller's stream so it stays ordered with the warp.
+        if (s)
+            s.enqueueMemSet(dst, borderValue);
+        else
+            dst.setTo(borderValue);
+
+        double coeffs[3][3];
+        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
+    }
+    else
+    {
+        using namespace cv::gpu::cudev::imgproc;
+
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        static const func_t funcs[6][4] =
+        {
+            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
+            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
+            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
+            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
+            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
+            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        int gpuBorderType;
+        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+        float coeffs[3 * 3];
+        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
+        // With WARP_INVERSE_MAP the matrix is passed as given; otherwise it is inverted first.
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invert(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
+    }
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/interpolation.hpp b/modules/gpuimgproc/test/interpolation.hpp
new file mode 100644
index 0000000000..7a00143e1d
--- /dev/null
+++ b/modules/gpuimgproc/test/interpolation.hpp
@@ -0,0 +1,131 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
+#define __OPENCV_TEST_INTERPOLATION_HPP__
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+
+template <typename T> T readVal(const cv::Mat& src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+{
+    // BORDER_CONSTANT keeps (y, x) as is and yields borderVal[c] outside the image; other modes remap the coordinates.
+    const bool isConst = border_type == cv::BORDER_CONSTANT;
+    if (isConst && (y < 0 || y >= src.rows || x < 0 || x >= src.cols)) return cv::saturate_cast<T>(borderVal.val[c]);
+    return src.at<T>(isConst ? y : cv::borderInterpolate(y, src.rows, border_type), (isConst ? x : cv::borderInterpolate(x, src.cols, border_type)) * src.channels() + c);
+}
+
+template <typename T> struct NearestInterpolator // reference sampler that reads a single source pixel
+{
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        return readVal<T>(src, static_cast<int>(y) /* truncates toward zero */, static_cast<int>(x), c, border_type, borderVal);
+    }
+};
+
+template <typename T> struct LinearInterpolator
+{
+    // Bilinear reference sample: blends the 2x2 neighbourhood around (y, x), weighted by the fractional offsets.
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        const int left = cvFloor(x);
+        const int top = cvFloor(y);
+
+        float acc = 0;
+        for (int dy = 0; dy < 2; ++dy)      // rows top, top + 1
+            for (int dx = 0; dx < 2; ++dx)  // columns left, left + 1
+            {
+                const float wx = dx ? (x - left) : ((left + 1) - x);
+                const float wy = dy ? (y - top) : ((top + 1) - y);
+                acc += readVal<T>(src, top + dy, left + dx, c, border_type, borderVal) * (wx * wy);
+            }
+        return cv::saturate_cast<T>(acc);
+    }
+};
+
+template <typename T> struct CubicInterpolator
+{
+    // Piecewise-cubic weight for a sample at signed distance x_ from the interpolation
+    // point; only |x_| matters, and the weight is 0 once |x_| reaches 2.
+    static float bicubicCoeff(float x_)
+    {
+        const float dist = fabsf(x_);
+
+        if (dist <= 1.0f)
+            return dist * dist * (1.5f * dist - 2.5f) + 1.0f;
+
+        if (dist < 2.0f)
+            return dist * (dist * (-0.5f * dist + 2.5f) - 4.0f) + 2.0f;
+
+        return 0.0f;
+    }
+
+    // Bicubic reference sample of channel c at (y, x): weighted mean over every integer
+    // position within distance 2 along each axis, normalised by the sum of the weights.
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        const float firstCol = ceilf(x - 2.0f);
+        const float lastCol = floorf(x + 2.0f);
+        const float firstRow = ceilf(y - 2.0f);
+        const float lastRow = floorf(y + 2.0f);
+
+        float weighted = 0.0f;
+        float total = 0.0f;
+
+        for (float row = firstRow; row <= lastRow; row += 1.0f)
+        {
+            const float wy = bicubicCoeff(y - row);
+            for (float col = firstCol; col <= lastCol; col += 1.0f)
+            {
+                const float w = bicubicCoeff(x - col) * wy;
+                weighted += w * readVal<T>(src, (int) floorf(row), (int) floorf(col), c, border_type, borderVal);
+                total += w;
+            }
+        }
+
+        // A zero weight total yields 0 rather than a division by zero.
+        const float res = (total == 0.0f) ? 0 : weighted / total;
+        return cv::saturate_cast<T>(res);
+    }
+};
+
+#endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/gpuimgproc/test/test_color.cpp b/modules/gpuimgproc/test/test_color.cpp
new file mode 100644
index 0000000000..4bd53c9194
--- /dev/null
+++ b/modules/gpuimgproc/test/test_color.cpp
@@ -0,0 +1,2503 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// cvtColor
+
+PARAM_TEST_CASE(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;  // parameter 0: device selected in SetUp()
+    cv::Size size;                // parameter 1: size of the generated source
+    int depth;                    // parameter 2: element depth of the source
+    bool useRoi;                  // parameter 3: forwarded to loadMat() by the tests
+
+    cv::Mat img;                  // random 3-channel source shared by the tests
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        // Upper bound handed to randomMat(): 1.0 for CV_32F sources, 255.0 for every other depth.
+        const double maxVal = (depth == CV_32F) ? 1.0 : 255.0;
+        img = randomMat(size, CV_MAKE_TYPE(depth, 3), 0.0, maxVal);
+    }
+};
+
+GPU_TEST_P(CvtColor, BGR2RGB)
+{
+    // BGR -> RGB on the 3-channel source; GPU and CPU results must be identical.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR2RGBA)
+{
+    // BGR -> RGBA on the 3-channel source; GPU and CPU results must be identical.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2RGBA);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2RGBA);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR2BGRA)
+{
+    // BGR -> BGRA on the 3-channel source; GPU and CPU results must be identical.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGRA);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGRA);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2RGB)
+{
+    // BGRA -> RGB; the 4-channel input is derived from img on the CPU. Exact match expected.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2BGR)
+{
+    // BGRA -> BGR; the 4-channel input is derived from img on the CPU. Exact match expected.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2RGBA)
+{
+    // BGRA -> RGBA; the 4-channel input is derived from img on the CPU. Exact match expected.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2RGBA);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2RGBA);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR2GRAY)
+{
+    // BGR -> GRAY; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGB2GRAY)
+{
+    // RGB -> GRAY; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, GRAY2BGR)
+{
+    // GRAY -> BGR; the single-channel input is derived from img on the CPU. Exact match expected.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, GRAY2BGRA)
+{
+    // GRAY -> BGRA with an explicit 4-channel destination on both paths. Exact match expected.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGRA, 4);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGRA, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2GRAY)
+{
+    // BGRA -> GRAY; the 4-channel input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGBA2GRAY)
+{
+    // RGBA -> GRAY; the 4-channel input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2BGR565)
+{
+    // BGR -> BGR565; runs for CV_8U only (other depths return early). Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGR565);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGR565);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, RGB2BGR565)
+{
+    // RGB -> BGR565; runs for CV_8U only (other depths return early). Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2BGR565);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2BGR565);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5652BGR)
+{
+    // BGR565 -> BGR; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5652RGB)
+{
+    // BGR565 -> RGB; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2BGR565)
+{
+    // BGRA -> BGR565; runs for CV_8U only. The 4-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR565);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR565);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, RGBA2BGR565)
+{
+    // RGBA -> BGR565; runs for CV_8U only. The 4-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2BGR565);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2BGR565);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5652BGRA)
+{
+    // BGR565 -> BGRA with an explicit 4-channel destination; runs for CV_8U only. Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652BGRA, 4);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652BGRA, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5652RGBA)
+{
+    // BGR565 -> RGBA with an explicit 4-channel destination; runs for CV_8U only. Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652RGBA, 4);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652RGBA, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, GRAY2BGR565)
+{
+    // GRAY -> BGR565; runs for CV_8U only. The single-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR565);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR565);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5652GRAY)
+{
+    // BGR565 -> GRAY; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR565);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5652GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5652GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR2BGR555)
+{
+    // BGR -> BGR555; runs for CV_8U only (other depths return early). Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2BGR555);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2BGR555);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, RGB2BGR555)
+{
+    // RGB -> BGR555; runs for CV_8U only (other depths return early). Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2BGR555);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2BGR555);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5552BGR)
+{
+    // BGR555 -> BGR; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5552RGB)
+{
+    // BGR555 -> RGB; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGRA2BGR555)
+{
+    // BGRA -> BGR555; runs for CV_8U only. The 4-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGRA2BGR555);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGRA2BGR555);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, RGBA2BGR555)
+{
+    // RGBA -> BGR555; runs for CV_8U only. The 4-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2BGR555);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2BGR555);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5552BGRA)
+{
+    // BGR555 -> BGRA with an explicit 4-channel destination; runs for CV_8U only. Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552BGRA, 4);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552BGRA, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5552RGBA)
+{
+    // BGR555 -> RGBA with an explicit 4-channel destination; runs for CV_8U only. Exact match expected.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552RGBA, 4);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552RGBA, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, GRAY2BGR555)
+{
+    // GRAY -> BGR555; runs for CV_8U only. The single-channel input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_GRAY2BGR555);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_GRAY2BGR555);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR5552GRAY)
+{
+    // BGR555 -> GRAY; runs for CV_8U only. The packed input is derived from img on the CPU.
+    if (depth != CV_8U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGR555);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR5552GRAY);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR5552GRAY);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+GPU_TEST_P(CvtColor, BGR2XYZ)
+{
+    // BGR -> XYZ; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGB2XYZ)
+{
+    // RGB -> XYZ; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2XYZ);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2XYZ);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2XYZ4)
+{
+    // BGR -> XYZ with a 4-channel GPU destination, checked against the 3-channel CPU result.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGRA2XYZ4)
+{
+    // BGRA input -> XYZ with a 4-channel GPU destination, checked against the 3-channel CPU result.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2XYZ);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2XYZ, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, XYZ2BGR)
+{
+    // XYZ -> BGR; the XYZ input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, XYZ2RGB)
+{
+    // XYZ -> RGB; the XYZ input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, XYZ42BGR)
+{
+    // XYZ -> BGR from a 4-channel input (zero-filled fourth channel); reference uses the 3-channel source.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, XYZ42BGRA)
+{
+    // XYZ -> BGRA from a 4-channel input (zero-filled fourth channel); reference uses the 3-channel source.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2XYZ);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_XYZ2BGR, 4);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_XYZ2BGR, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2YCrCb)
+{
+    // BGR -> YCrCb; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YCrCb);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGB2YCrCb)
+{
+    // RGB -> YCrCb; the GPU result is compared to the CPU reference with tolerance 1e-5.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YCrCb);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2YCrCb);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2YCrCb4)
+{
+    // BGR -> YCrCb with a 4-channel GPU destination, checked against the 3-channel CPU result.
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2YCrCb, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGBA2YCrCb4)
+{
+    // RGBA input -> YCrCb with a 4-channel GPU destination. The source is RGBA, so the
+    // RGB conversion code is used on both paths to match the channel order under test.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2YCrCb, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YCrCb);
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat channels[4];
+    cv::split(h_dst, channels);
+    cv::merge(channels, 3, h_dst);
+    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YCrCb2BGR)
+{
+    // YCrCb -> BGR; the YCrCb input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YCrCb2RGB)
+{
+    // YCrCb -> RGB; the YCrCb input is derived from img on the CPU. Tolerance 1e-5.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YCrCb42RGB)
+{
+    // YCrCb -> RGB from a 4-channel input (zero-filled fourth channel); reference uses the 3-channel source.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YCrCb42RGBA)
+{
+    // YCrCb -> RGBA from a 4-channel input (zero-filled fourth channel); reference uses the 3-channel source.
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2YCrCb);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_YCrCb2RGB, 4);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_YCrCb2RGB, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2HSV)
+{
+    // BGR -> HSV; returns early for CV_16U. Tolerance is 1e-2 for CV_32F and 1 otherwise.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HSV);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HSV);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HSV)
+{
+    // RGB -> HSV; returns early for CV_16U. Tolerance is 1e-2 for CV_32F and 1 otherwise.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HSV4)
+{
+    // RGB -> HSV with a 4-channel GPU destination; returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGBA2HSV4)
+{
+    // RGBA input -> HSV with a 4-channel GPU destination; returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HSV);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HSV, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, BGR2HLS)
+{
+    // BGR -> HLS; returns early for CV_16U. Tolerance is 1e-2 for CV_32F and 1 otherwise.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_BGR2HLS);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BGR2HLS);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HLS)
+{
+    // RGB -> HLS; returns early for CV_16U. Tolerance is 1e-2 for CV_32F and 1 otherwise.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HLS4)
+{
+    // RGB -> HLS with a 4-channel GPU destination; returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    const cv::Mat& src = img;
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGBA2HLS4)
+{
+    // RGBA input -> HLS with a 4-channel GPU destination; returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2RGBA);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_RGB2HLS);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGB2HLS, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    // Keep only the first three channels of the GPU result for the comparison.
+    cv::Mat h_dst(dst);
+    cv::Mat planes[4];
+    cv::split(h_dst, planes);
+    cv::merge(planes, 3, h_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV2BGR)
+{
+    // HSV -> BGR; returns early for CV_16U. The HSV input is derived from img on the CPU.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV2RGB)
+{
+    // HSV -> RGB; returns early for CV_16U. The HSV input is derived from img on the CPU.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV42BGR)
+{
+    // HSV -> BGR from a 4-channel input (zero-filled fourth channel); returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV42BGRA)
+{
+    // HSV -> BGRA from a 4-channel input (zero-filled fourth channel); returns early for CV_16U.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HSV);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HSV2BGR, 4);
+
+    cv::Mat planes[4];
+    cv::split(src, planes);
+    planes[3] = cv::Mat(src.size(), depth, cv::Scalar::all(0));
+    cv::merge(planes, 4, src);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HSV2BGR, 4);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS2BGR)
+{
+    // HLS -> BGR; returns early for CV_16U. The HLS input is derived from img on the CPU.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2BGR);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2BGR);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS2RGB)
+{
+    // HLS -> RGB; returns early for CV_16U. The HLS input is derived from img on the CPU.
+    if (depth == CV_16U)
+        return;
+
+    cv::Mat src;
+    cv::cvtColor(img, src, cv::COLOR_BGR2HLS);
+
+    cv::Mat dst_gold;
+    cv::cvtColor(src, dst_gold, cv::COLOR_HLS2RGB);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_HLS2RGB);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS42RGB)
+{
+    // 4-channel HLS input: cv::gpu::cvtColor(HLS2RGB) must match cv::cvtColor on the 3-channel data; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2RGB);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hls, planes);
+    planes[3] = cv::Mat(hls.size(), depth, cv::Scalar::all(0));
+    cv::Mat hls4;
+    cv::merge(planes, 4, hls4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls4, useRoi), actual, cv::COLOR_HLS2RGB);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS42RGBA)
+{
+    // 4-channel HLS input, 4-channel output: cv::gpu::cvtColor(HLS2RGB, 4) vs cv::cvtColor(HLS2RGB, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2RGB, 4);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hls, planes);
+    planes[3] = cv::Mat(hls.size(), depth, cv::Scalar::all(0));
+    cv::Mat hls4;
+    cv::merge(planes, 4, hls4);
+
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls4, useRoi), actual, cv::COLOR_HLS2RGB, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, BGR2HSV_FULL)
+{
+    // Compares cv::gpu::cvtColor(BGR2HSV_FULL) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2HSV_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2HSV_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HSV_FULL)
+{
+    // Compares cv::gpu::cvtColor(RGB2HSV_FULL) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2HSV_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2HSV_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HSV4_FULL)
+{
+    // cv::gpu::cvtColor(RGB2HSV_FULL, 4): the first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2HSV_FULL, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2HSV_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGBA2HSV4_FULL)
+{
+    // 4-channel input, cv::gpu::cvtColor(RGB2HSV_FULL, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat rgba;
+    cv::cvtColor(img, rgba, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(rgba, useRoi), actual, cv::COLOR_RGB2HSV_FULL, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(rgba, expected, cv::COLOR_RGB2HSV_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, BGR2HLS_FULL)
+{
+    // Compares cv::gpu::cvtColor(BGR2HLS_FULL) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2HLS_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2HLS_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HLS_FULL)
+{
+    // Compares cv::gpu::cvtColor(RGB2HLS_FULL) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2HLS_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2HLS_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGB2HLS4_FULL)
+{
+    // cv::gpu::cvtColor(RGB2HLS_FULL, 4): the first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2HLS_FULL, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2HLS_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, RGBA2HLS4_FULL)
+{
+    // 4-channel input, cv::gpu::cvtColor(RGB2HLS_FULL, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat rgba;
+    cv::cvtColor(img, rgba, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(rgba, useRoi), actual, cv::COLOR_RGB2HLS_FULL, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(rgba, expected, cv::COLOR_RGB2HLS_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV2BGR_FULL)
+{
+    // cv::gpu::cvtColor(HSV2BGR_FULL) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hsv;
+    cv::cvtColor(img, hsv, cv::COLOR_BGR2HSV_FULL);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(hsv, expected, cv::COLOR_HSV2BGR_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hsv, useRoi), actual, cv::COLOR_HSV2BGR_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV2RGB_FULL)
+{
+    // cv::gpu::cvtColor(HSV2RGB_FULL) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hsv;
+    cv::cvtColor(img, hsv, cv::COLOR_BGR2HSV_FULL);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(hsv, expected, cv::COLOR_HSV2RGB_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hsv, useRoi), actual, cv::COLOR_HSV2RGB_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV42RGB_FULL)
+{
+    // 4-channel HSV input: cv::gpu::cvtColor(HSV2RGB_FULL) must match cv::cvtColor on the 3-channel data; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hsv;
+    cv::cvtColor(img, hsv, cv::COLOR_BGR2HSV_FULL);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hsv, expected, cv::COLOR_HSV2RGB_FULL);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hsv, planes);
+    planes[3] = cv::Mat(hsv.size(), depth, cv::Scalar::all(0));
+    cv::Mat hsv4;
+    cv::merge(planes, 4, hsv4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hsv4, useRoi), actual, cv::COLOR_HSV2RGB_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HSV42RGBA_FULL)
+{
+    // 4-channel HSV input, 4-channel output: cv::gpu::cvtColor(HSV2RGB_FULL, 4) vs cv::cvtColor(HSV2RGB_FULL, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hsv;
+    cv::cvtColor(img, hsv, cv::COLOR_BGR2HSV_FULL);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hsv, expected, cv::COLOR_HSV2RGB_FULL, 4);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hsv, planes);
+    planes[3] = cv::Mat(hsv.size(), depth, cv::Scalar::all(0));
+    cv::Mat hsv4;
+    cv::merge(planes, 4, hsv4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hsv4, useRoi), actual, cv::COLOR_HSV2RGB_FULL, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS2BGR_FULL)
+{
+    // cv::gpu::cvtColor(HLS2BGR_FULL) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS_FULL);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2BGR_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls, useRoi), actual, cv::COLOR_HLS2BGR_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS2RGB_FULL)
+{
+    // cv::gpu::cvtColor(HLS2RGB_FULL) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS_FULL);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2RGB_FULL);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls, useRoi), actual, cv::COLOR_HLS2RGB_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS42RGB_FULL)
+{
+    // 4-channel HLS input: cv::gpu::cvtColor(HLS2RGB_FULL) must match cv::cvtColor on the 3-channel data; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS_FULL);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2RGB_FULL);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hls, planes);
+    planes[3] = cv::Mat(hls.size(), depth, cv::Scalar::all(0));
+    cv::Mat hls4;
+    cv::merge(planes, 4, hls4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls4, useRoi), actual, cv::COLOR_HLS2RGB_FULL);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, HLS42RGBA_FULL)
+{
+    // 4-channel HLS input, 4-channel output: cv::gpu::cvtColor(HLS2RGB_FULL, 4) vs cv::cvtColor(HLS2RGB_FULL, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat hls;
+    cv::cvtColor(img, hls, cv::COLOR_BGR2HLS_FULL);
+    // Reference result, taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(hls, expected, cv::COLOR_HLS2RGB_FULL, 4);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(hls, planes);
+    planes[3] = cv::Mat(hls.size(), depth, cv::Scalar::all(0));
+    cv::Mat hls4;
+    cv::merge(planes, 4, hls4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(hls4, useRoi), actual, cv::COLOR_HLS2RGB_FULL, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_32F ? 1e-2 : 1);
+}
+
+GPU_TEST_P(CvtColor, BGR2YUV)
+{
+    cv::Mat input(img);
+    // cv::gpu::cvtColor(BGR2YUV) is checked against this cv::cvtColor result.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2YUV);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2YUV);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGB2YUV)
+{
+    cv::Mat input(img);
+    // cv::gpu::cvtColor(RGB2YUV) is checked against this cv::cvtColor result.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2YUV);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2YUV);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YUV2BGR)
+{
+    cv::Mat yuv;
+    cv::cvtColor(img, yuv, cv::COLOR_BGR2YUV);
+    // cv::gpu::cvtColor(YUV2BGR) is checked against this cv::cvtColor result.
+    cv::Mat expected;
+    cv::cvtColor(yuv, expected, cv::COLOR_YUV2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(yuv, useRoi), actual, cv::COLOR_YUV2BGR);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YUV42BGR)
+{
+    cv::Mat yuv;
+    cv::cvtColor(img, yuv, cv::COLOR_BGR2YUV);
+    // Reference for cv::gpu::cvtColor(YUV2BGR), taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(yuv, expected, cv::COLOR_YUV2BGR);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(yuv, planes);
+    planes[3] = cv::Mat(yuv.size(), depth, cv::Scalar::all(0));
+    cv::Mat yuv4;
+    cv::merge(planes, 4, yuv4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(yuv4, useRoi), actual, cv::COLOR_YUV2BGR);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YUV42BGRA)
+{
+    cv::Mat yuv;
+    cv::cvtColor(img, yuv, cv::COLOR_BGR2YUV);
+    // Reference for cv::gpu::cvtColor(YUV2BGR, 4), taken from the 3-channel image.
+    cv::Mat expected;
+    cv::cvtColor(yuv, expected, cv::COLOR_YUV2BGR, 4);
+    // Append a zero-filled fourth plane to build the 4-channel input.
+    cv::Mat planes[4];
+    cv::split(yuv, planes);
+    planes[3] = cv::Mat(yuv.size(), depth, cv::Scalar::all(0));
+    cv::Mat yuv4;
+    cv::merge(planes, 4, yuv4);
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(yuv4, useRoi), actual, cv::COLOR_YUV2BGR, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, YUV2RGB)
+{
+    cv::Mat yuv;
+    cv::cvtColor(img, yuv, cv::COLOR_RGB2YUV);
+    // cv::gpu::cvtColor(YUV2RGB) is checked against this cv::cvtColor result.
+    cv::Mat expected;
+    cv::cvtColor(yuv, expected, cv::COLOR_YUV2RGB);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(yuv, useRoi), actual, cv::COLOR_YUV2RGB);
+
+    EXPECT_MAT_NEAR(expected, actual, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2YUV4)
+{
+    cv::Mat input(img);
+    // cv::gpu::cvtColor(BGR2YUV, 4): the first three output channels must match cv::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2YUV, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2YUV);
+
+    EXPECT_MAT_NEAR(expected, actual3, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, RGBA2YUV4)
+{
+    cv::Mat rgba;
+    cv::cvtColor(img, rgba, cv::COLOR_BGR2RGBA);
+    // 4-channel input, cv::gpu::cvtColor(RGB2YUV, 4): the first three output channels must match cv::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(rgba, useRoi), actual, cv::COLOR_RGB2YUV, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(rgba, expected, cv::COLOR_RGB2YUV);
+
+    EXPECT_MAT_NEAR(expected, actual3, 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2Lab)
+{
+    // Compares cv::gpu::cvtColor(BGR2Lab) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2Lab);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, RGB2Lab)
+{
+    // Compares cv::gpu::cvtColor(RGB2Lab) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2Lab);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, BGRA2Lab4)
+{
+    // 4-channel input, cv::gpu::cvtColor(BGR2Lab, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input4;
+    cv::cvtColor(img, input4, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input4, useRoi), actual, cv::COLOR_BGR2Lab, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input4, expected, cv::COLOR_BGR2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LBGR2Lab)
+{
+    // Compares cv::gpu::cvtColor(LBGR2Lab) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_LBGR2Lab);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_LBGR2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LRGB2Lab)
+{
+    // Compares cv::gpu::cvtColor(LRGB2Lab) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_LRGB2Lab);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_LRGB2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LBGRA2Lab4)
+{
+    // 4-channel input, cv::gpu::cvtColor(LBGR2Lab, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input4;
+    cv::cvtColor(img, input4, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input4, useRoi), actual, cv::COLOR_LBGR2Lab, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input4, expected, cv::COLOR_LBGR2Lab);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, Lab2BGR)
+{
+    // cv::gpu::cvtColor(Lab2BGR) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2BGR);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, Lab2RGB)
+{
+    // cv::gpu::cvtColor(Lab2RGB) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2RGB);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2RGB);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, Lab2BGRA)
+{
+    // cv::gpu::cvtColor(Lab2BGR, 4) is checked against cv::cvtColor(Lab2BGR, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor, also with four channels.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2BGR, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, Lab2LBGR)
+{
+    // cv::gpu::cvtColor(Lab2LBGR) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2LBGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2LBGR);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, Lab2LRGB)
+{
+    // cv::gpu::cvtColor(Lab2LRGB) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2LRGB);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2LRGB);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, Lab2LRGBA)
+{
+    // cv::gpu::cvtColor(Lab2LRGB, 4) is checked against cv::cvtColor(Lab2LRGB, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat lab;
+    cv::cvtColor(img, lab, cv::COLOR_BGR2Lab);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(lab, expected, cv::COLOR_Lab2LRGB, 4);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(lab, useRoi), actual, cv::COLOR_Lab2LRGB, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-5);
+}
+
+GPU_TEST_P(CvtColor, BGR2Luv)
+{
+    // Compares cv::gpu::cvtColor(BGR2Luv) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_BGR2Luv);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_BGR2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, RGB2Luv)
+{
+    // Compares cv::gpu::cvtColor(RGB2Luv) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_RGB2Luv);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_RGB2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, BGRA2Luv4)
+{
+    // 4-channel input, cv::gpu::cvtColor(BGR2Luv, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input4;
+    cv::cvtColor(img, input4, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input4, useRoi), actual, cv::COLOR_BGR2Luv, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input4, expected, cv::COLOR_BGR2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LBGR2Luv)
+{
+    // Compares cv::gpu::cvtColor(LBGR2Luv) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_LBGR2Luv);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_LBGR2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LRGB2Luv)
+{
+    // Compares cv::gpu::cvtColor(LRGB2Luv) with cv::cvtColor on `img`; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input(img);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input, expected, cv::COLOR_LRGB2Luv);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input, useRoi), actual, cv::COLOR_LRGB2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, LBGRA2Luv4)
+{
+    // 4-channel input, cv::gpu::cvtColor(LBGR2Luv, 4): first three output channels must match cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat input4;
+    cv::cvtColor(img, input4, cv::COLOR_BGR2RGBA);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(input4, useRoi), actual, cv::COLOR_LBGR2Luv, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Copy the result into a cv::Mat and keep only its first three planes.
+    cv::Mat actual4(actual);
+    cv::Mat planes[4];
+    cv::split(actual4, planes);
+    cv::Mat actual3;
+    cv::merge(planes, 3, actual3);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(input4, expected, cv::COLOR_LBGR2Luv);
+
+    EXPECT_MAT_NEAR(expected, actual3, depth == CV_8U ? 1 : 1e-3);
+}
+
+GPU_TEST_P(CvtColor, Luv2BGR)
+{
+    // cv::gpu::cvtColor(Luv2BGR) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2BGR);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+GPU_TEST_P(CvtColor, Luv2RGB)
+{
+    // cv::gpu::cvtColor(Luv2RGB) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2RGB);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2RGB);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+GPU_TEST_P(CvtColor, Luv2BGRA)
+{
+    // cv::gpu::cvtColor(Luv2BGR, 4) is checked against cv::cvtColor(Luv2BGR, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor, also with four channels.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2BGR, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+GPU_TEST_P(CvtColor, Luv2LBGR)
+{
+    // cv::gpu::cvtColor(Luv2LBGR) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2LBGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2LBGR);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+GPU_TEST_P(CvtColor, Luv2LRGB)
+{
+    // cv::gpu::cvtColor(Luv2LRGB) is checked against cv::cvtColor; CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2LRGB);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2LRGB);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+GPU_TEST_P(CvtColor, Luv2LRGBA)
+{
+    // cv::gpu::cvtColor(Luv2LRGB, 4) is checked against cv::cvtColor(Luv2LRGB, 4); CV_16U is skipped.
+    if (depth == CV_16U) return;
+
+    cv::Mat luv;
+    cv::cvtColor(img, luv, cv::COLOR_BGR2Luv);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(luv, expected, cv::COLOR_Luv2LRGB, 4);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(luv, useRoi), actual, cv::COLOR_Luv2LRGB, 4);
+
+    EXPECT_MAT_NEAR(expected, actual, depth == CV_8U ? 1 : 1e-4);
+}
+
+#if defined (CUDA_VERSION) && (CUDA_VERSION >= 5000)
+
+GPU_TEST_P(CvtColor, RGBA2mRGBA)
+{
+    // cv::gpu::cvtColor(RGBA2mRGBA) vs cv::cvtColor on a random 4-channel matrix; only CV_8U is exercised.
+    if (depth != CV_8U) return;
+
+    cv::Mat rgba = randomMat(size, CV_MAKE_TYPE(depth, 4));
+    // The destination is created up front with the source's size and type.
+    cv::gpu::GpuMat actual = createMat(rgba.size(), rgba.type(), useRoi);
+    cv::gpu::cvtColor(loadMat(rgba, useRoi), actual, cv::COLOR_RGBA2mRGBA);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(rgba, expected, cv::COLOR_RGBA2mRGBA);
+
+    EXPECT_MAT_NEAR(expected, actual, 1);
+}
+
+#endif // defined (CUDA_VERSION) && (CUDA_VERSION >= 5000)
+
+GPU_TEST_P(CvtColor, BayerBG2BGR)
+{
+    // cv::gpu::cvtColor(BayerBG2BGR) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerBG2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerBG2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerBG2BGR4)
+{
+    // cv::gpu::cvtColor(BayerBG2BGR, 4) vs 3-channel cv::cvtColor; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerBG2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor (three channels).
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerBG2BGR);
+    // Reduce the 4-channel result to BGR before comparing.
+    cv::Mat actual4(actual);
+    cv::Mat actual3;
+    cv::cvtColor(actual4, actual3, cv::COLOR_BGRA2BGR);
+
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual3(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerGB2BGR)
+{
+    // cv::gpu::cvtColor(BayerGB2BGR) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGB2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGB2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerGB2BGR4)
+{
+    // cv::gpu::cvtColor(BayerGB2BGR, 4) vs 3-channel cv::cvtColor; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGB2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor (three channels).
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGB2BGR);
+    // Reduce the 4-channel result to BGR before comparing.
+    cv::Mat actual4(actual);
+    cv::Mat actual3;
+    cv::cvtColor(actual4, actual3, cv::COLOR_BGRA2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual3(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerRG2BGR)
+{
+    // cv::gpu::cvtColor(BayerRG2BGR) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerRG2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerRG2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerRG2BGR4)
+{
+    // cv::gpu::cvtColor(BayerRG2BGR, 4) vs 3-channel cv::cvtColor; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerRG2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor (three channels).
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerRG2BGR);
+    // Reduce the 4-channel result to BGR before comparing.
+    cv::Mat actual4(actual);
+    cv::Mat actual3;
+    cv::cvtColor(actual4, actual3, cv::COLOR_BGRA2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual3(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerGR2BGR)
+{
+    // cv::gpu::cvtColor(BayerGR2BGR) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGR2BGR);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGR2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerGR2BGR4)
+{
+    // cv::gpu::cvtColor(BayerGR2BGR, 4) vs 3-channel cv::cvtColor; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Request a 4-channel result from cv::gpu::cvtColor.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGR2BGR, 4);
+
+    ASSERT_EQ(4, actual.channels());
+    // Reference result from cv::cvtColor (three channels).
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGR2BGR);
+    // Reduce the 4-channel result to BGR before comparing.
+    cv::Mat actual4(actual);
+    cv::Mat actual3;
+    cv::cvtColor(actual4, actual3, cv::COLOR_BGRA2BGR);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual3(inner), 0);
+}
+
+GPU_TEST_P(CvtColor, BayerBG2Gray)
+{
+    // cv::gpu::cvtColor(BayerBG2GRAY) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerBG2GRAY);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerBG2GRAY);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 2);
+}
+
+GPU_TEST_P(CvtColor, BayerGB2Gray)
+{
+    // cv::gpu::cvtColor(BayerGB2GRAY) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGB2GRAY);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGB2GRAY);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 2);
+}
+
+GPU_TEST_P(CvtColor, BayerRG2Gray)
+{
+    // cv::gpu::cvtColor(BayerRG2GRAY) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerRG2GRAY);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerRG2GRAY);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 2);
+}
+
+GPU_TEST_P(CvtColor, BayerGR2Gray)
+{
+    // cv::gpu::cvtColor(BayerGR2GRAY) vs cv::cvtColor on random data; runs only for CV_8U/CV_16U without ROI.
+    if (useRoi || (depth != CV_8U && depth != CV_16U)) return;
+
+    cv::Mat bayer = randomMat(size, depth);
+    // Reference result from cv::cvtColor.
+    cv::Mat expected;
+    cv::cvtColor(bayer, expected, cv::COLOR_BayerGR2GRAY);
+    // Result from cv::gpu::cvtColor for the same input.
+    cv::gpu::GpuMat actual;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), actual, cv::COLOR_BayerGR2GRAY);
+    const cv::Rect inner(1, 1, actual.cols - 2, actual.rows - 2); // the one-pixel border is left out of the comparison
+    EXPECT_MAT_NEAR(expected(inner), actual(inner), 2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine( // instantiates the CvtColor tests for every combination of the value sets below
+    ALL_DEVICES, // device set; defined outside this part of the file
+    DIFFERENT_SIZES, // size set; defined outside this part of the file
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)), // three depths; individual tests return early for depths they do not cover
+    WHOLE_SUBMAT)); // presumably both values of useRoi - TODO confirm against the macro definition
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Demosaicing
+
+struct Demosaicing : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    // Device taken from the test parameter in SetUp().
+    cv::gpu::DeviceInfo devInfo;
+
+    // Stores the test parameter and passes its device id to cv::gpu::setDevice.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+
+    // Builds a single-channel mosaic from a 3-channel image by keeping exactly
+    // one channel of every pixel, selected by the parity of the pixel position.
+    //
+    //   src      - input image with cv::Vec3b pixels
+    //   dst      - output image, created with the size of src
+    //   firstRed - offset added to (x, y) before the parity is taken
+    //
+    // With px = (x + firstRed.x) % 2 and py = (y + firstRed.y) % 2 the kept
+    // channel, and the 2x2 layout note attached to each case, are:
+    //   py == 0, px == 0 -> channel 2   (RG / GB)
+    //   py == 0, px != 0 -> channel 1   (GR / BG)
+    //   py != 0, px == 0 -> channel 1   (GB / RG)
+    //   py != 0, px != 0 -> channel 0   (BG / GR)
+    // Nothing is returned; dst is the only output.
+    static void mosaic(const cv::Mat_<cv::Vec3b>& src, cv::Mat_<uchar>& dst, cv::Point firstRed)
+    {
+        dst.create(src.size());
+
+        for (int row = 0; row < src.rows; ++row)
+        {
+            // The vertical parity is the same for every pixel of a row.
+            const int py = (row + firstRed.y) % 2;
+
+            for (int col = 0; col < src.cols; ++col)
+            {
+                const int px = (col + firstRed.x) % 2;
+
+                // Channel index to keep for this parity pair.
+                int channel;
+                if (py == 0)
+                {
+                    channel = (px == 0) ? 2 : 1;
+                }
+                else
+                {
+                    channel = (px == 0) ? 1 : 0;
+                }
+
+                // Copy the selected sample into the mosaic.
+                const cv::Vec3b pixel = src(row, col);
+                dst(row, col) = pixel[channel];
+            }
+        }
+    }
+};
+
+
+GPU_TEST_P(Demosaicing, BayerBG2BGR)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(1, 1));
+    // src is the one-channel mosaic of img (firstRed = (1, 1)); cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerBG2BGR);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 2e-2.
+    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
+}
+
+GPU_TEST_P(Demosaicing, BayerGB2BGR)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(0, 1));
+    // src is the one-channel mosaic of img (firstRed = (0, 1)); cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGB2BGR);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 2e-2.
+    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
+}
+
+GPU_TEST_P(Demosaicing, BayerRG2BGR)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(0, 0));
+    // src is the one-channel mosaic of img (firstRed = (0, 0)); cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerRG2BGR);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 2e-2.
+    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
+}
+
+GPU_TEST_P(Demosaicing, BayerGR2BGR)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(1, 0));
+    // src is the one-channel mosaic of img (firstRed = (1, 0)); cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGR2BGR);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 2e-2.
+    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
+}
+
+GPU_TEST_P(Demosaicing, BayerBG2BGR_MHT)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(1, 1));
+    // src is the one-channel mosaic of img (firstRed = (1, 1)); the _MHT code of cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerBG2BGR_MHT);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 5e-3.
+    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
+}
+
+GPU_TEST_P(Demosaicing, BayerGB2BGR_MHT)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(0, 1));
+    // src is the one-channel mosaic of img (firstRed = (0, 1)); the _MHT code of cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGB2BGR_MHT);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 5e-3.
+    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
+}
+
+GPU_TEST_P(Demosaicing, BayerRG2BGR_MHT)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(0, 0));
+    // src is the one-channel mosaic of img (firstRed = (0, 0)); the _MHT code of cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerRG2BGR_MHT);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 5e-3.
+    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
+}
+
+GPU_TEST_P(Demosaicing, BayerGR2BGR_MHT)
+{
+    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()); // stop with a clear failure instead of demosaicing an empty matrix
+    cv::Mat_<uchar> src;
+    mosaic(img, src, cv::Point(1, 0));
+    // src is the one-channel mosaic of img (firstRed = (1, 0)); the _MHT code of cv::gpu::demosaicing rebuilds a colour image from it.
+    cv::gpu::GpuMat dst;
+    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGR2BGR_MHT);
+    // The rebuilt image is compared with the original through EXPECT_MAT_SIMILAR with threshold 5e-3.
+    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Demosaicing, ALL_DEVICES); // instantiates the Demosaicing tests once per value of ALL_DEVICES (defined outside this part of the file)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// swapChannels
+
+PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, cv::Size, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo; // test parameter 0
+    cv::Size size;               // test parameter 1; NOTE(review): not read by the Accuracy test below
+    bool useRoi;                 // test parameter 2; passed to loadMat by the Accuracy test
+    // Copies the three test parameters into the members and passes the device id to cv::gpu::setDevice.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(SwapChannels, Accuracy)
+{
+    cv::Mat bgra = readImageType("stereobm/aloe-L.png", CV_8UC4);
+    ASSERT_FALSE(bgra.empty());
+
+    // Reference: cv::cvtColor(BGRA2RGBA) applied to the loaded image.
+    cv::Mat expected;
+    cv::cvtColor(bgra, expected, cv::COLOR_BGRA2RGBA);
+    // cv::gpu::swapChannels works in place on the GpuMat, using channel order {2, 1, 0, 3}.
+    cv::gpu::GpuMat d_image = loadMat(bgra, useRoi);
+    const int order[] = {2, 1, 0, 3};
+    cv::gpu::swapChannels(d_image, order);
+
+    EXPECT_MAT_NEAR(expected, d_image, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, SwapChannels, testing::Combine( // instantiates the SwapChannels tests for every combination of the value sets below
+    ALL_DEVICES, // device set; defined outside this part of the file
+    DIFFERENT_SIZES, // size set; defined outside this part of the file
+    WHOLE_SUBMAT)); // presumably both values of useRoi - TODO confirm against the macro definition
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_denoising.cpp b/modules/gpuimgproc/test/test_denoising.cpp
new file mode 100644
index 0000000000..2f1a93be1c
--- /dev/null
+++ b/modules/gpuimgproc/test/test_denoising.cpp
@@ -0,0 +1,185 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+////////////////////////////////////////////////////////
+// BilateralFilter
+
+PARAM_TEST_CASE(BilateralFilter, cv::gpu::DeviceInfo, cv::Size, MatType) // fixture parameters: device, image size, matrix type
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int kernel_size;     // the three filter settings are fixed in SetUp(), not parameterized
+    float sigma_color;
+    float sigma_spatial;
+    // Reads the three test parameters, fixes the filter settings and selects the device.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        // The Accuracy test passes these same values to both the GPU and the CPU filter.
+        kernel_size = 5;
+        sigma_color = 10.f;
+        sigma_spatial = 3.5f;
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(BilateralFilter, Accuracy)
+{
+    cv::Mat input = randomMat(size, type);
+    input.convertTo(input, type); // kept from the original test: converts to the type the matrix was requested with
+    // 32-bit float images are compared with a tight tolerance, all other depths within one unit.
+    const double tolerance = input.depth() == CV_32F ? 1e-3 : 1.0;
+    cv::gpu::GpuMat d_filtered;
+    cv::gpu::bilateralFilter(loadMat(input), d_filtered, kernel_size, sigma_color, sigma_spatial);
+    // CPU reference computed with the same kernel size and sigmas.
+    cv::Mat filtered_gold;
+    cv::bilateralFilter(input, filtered_gold, kernel_size, sigma_color, sigma_spatial);
+
+    EXPECT_MAT_NEAR(filtered_gold, d_filtered, tolerance);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Denoising, BilateralFilter, testing::Combine(
+    ALL_DEVICES, // generator macro defined outside this chunk
+    testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(639, 481)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3))
+    ));
+
+
+////////////////////////////////////////////////////////
+// Brute Force Non local means
+
+struct BruteForceNonLocalMeans: testing::TestWithParam<cv::gpu::DeviceInfo> // fixture parameterized only by the device
+{
+    cv::gpu::DeviceInfo devInfo;
+    // Stores the device parameter and selects that device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(BruteForceNonLocalMeans, Regression)
+{
+    // Noisy colour input; the grayscale input is derived from it on the CPU.
+    cv::Mat noisy_bgr = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR);
+    ASSERT_FALSE(noisy_bgr.empty());
+
+    cv::Mat noisy_gray;
+    cv::cvtColor(noisy_bgr, noisy_gray, cv::COLOR_BGR2GRAY);
+    // Run the GPU filter on both inputs with the same third argument (20).
+    cv::gpu::GpuMat d_bgr;
+    cv::gpu::GpuMat d_gray;
+    cv::gpu::nonLocalMeans(cv::gpu::GpuMat(noisy_bgr), d_bgr, 20);
+    cv::gpu::nonLocalMeans(cv::gpu::GpuMat(noisy_gray), d_gray, 20);
+
+#if 0
+    dumpImage("denoising/nlm_denoised_lena_bgr.png", cv::Mat(d_bgr));
+    dumpImage("denoising/nlm_denoised_lena_gray.png", cv::Mat(d_gray));
+#endif
+    // Stored reference outputs; the disabled block above presumably regenerates them - confirm dumpImage().
+    cv::Mat bgr_gold = readImage("denoising/nlm_denoised_lena_bgr.png", cv::IMREAD_COLOR);
+    cv::Mat gray_gold = readImage("denoising/nlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty());
+
+    EXPECT_MAT_NEAR(bgr_gold, d_bgr, 1e-4);
+    EXPECT_MAT_NEAR(gray_gold, d_gray, 1e-4);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Denoising, BruteForceNonLocalMeans, ALL_DEVICES); // ALL_DEVICES is defined outside this chunk
+
+////////////////////////////////////////////////////////
+// Fast Non local means
+
+struct FastNonLocalMeans: testing::TestWithParam<cv::gpu::DeviceInfo> // fixture parameterized only by the device
+{
+    cv::gpu::DeviceInfo devInfo;
+    // Stores the device parameter and selects that device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(FastNonLocalMeans, Regression)
+{
+    // Noisy colour input; the grayscale input is derived from it on the CPU.
+    cv::Mat noisy_bgr = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR);
+    ASSERT_FALSE(noisy_bgr.empty());
+
+    cv::Mat noisy_gray;
+    cv::cvtColor(noisy_bgr, noisy_gray, cv::COLOR_BGR2GRAY);
+
+    cv::gpu::GpuMat d_bgr;
+    cv::gpu::GpuMat d_gray;
+    cv::gpu::FastNonLocalMeansDenoising denoiser;
+    // simpleMethod() handles the grayscale image, labMethod() the colour image.
+    denoiser.simpleMethod(cv::gpu::GpuMat(noisy_gray), d_gray, 20);
+    denoiser.labMethod(cv::gpu::GpuMat(noisy_bgr), d_bgr, 20, 10);
+
+#if 0
+    dumpImage("denoising/fnlm_denoised_lena_bgr.png", cv::Mat(d_bgr));
+    dumpImage("denoising/fnlm_denoised_lena_gray.png", cv::Mat(d_gray));
+#endif
+    // Stored reference outputs; the disabled block above presumably regenerates them - confirm dumpImage().
+    cv::Mat bgr_gold = readImage("denoising/fnlm_denoised_lena_bgr.png", cv::IMREAD_COLOR);
+    cv::Mat gray_gold = readImage("denoising/fnlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty());
+
+    EXPECT_MAT_NEAR(bgr_gold, d_bgr, 1);
+    EXPECT_MAT_NEAR(gray_gold, d_gray, 1);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Denoising, FastNonLocalMeans, ALL_DEVICES); // ALL_DEVICES is defined outside this chunk
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_hough.cpp b/modules/gpuimgproc/test/test_hough.cpp
new file mode 100644
index 0000000000..a044901041
--- /dev/null
+++ b/modules/gpuimgproc/test/test_hough.cpp
@@ -0,0 +1,255 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// HoughLines
+
+PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
+{
+    static void generateLines(cv::Mat& img) // clears img, then draws a vertical, a horizontal and two diagonal white lines
+    {
+        const cv::Scalar white = cv::Scalar::all(255);
+        img.setTo(cv::Scalar::all(0));
+        cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), white);
+        cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), white);
+        cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), white);
+        cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), white);
+    }
+    // Clears dst and renders every (rho, theta) pair in `lines` as a white line.
+    static void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
+    {
+        dst.setTo(cv::Scalar::all(0));
+
+        for (size_t idx = 0; idx < lines.size(); ++idx)
+        {
+            const float rho = lines[idx][0];
+            const float theta = lines[idx][1];
+            // (x0, y0) = rho * (cos, sin) is a point on the line; (-b, a) is the direction along it.
+            double a = std::cos(theta), b = std::sin(theta);
+            double x0 = a*rho, y0 = b*rho;
+            // Step 1000 units from (x0, y0) in both directions to get the two end points.
+            const cv::Point pt1(cvRound(x0 + 1000*(-b)), cvRound(y0 + 1000*(a)));
+            const cv::Point pt2(cvRound(x0 - 1000*(-b)), cvRound(y0 - 1000*(a)));
+            cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        }
+    }
+};
+
+GPU_TEST_P(HoughLines, Accuracy)
+{
+    const cv::gpu::DeviceInfo device = GET_PARAM(0);
+    const cv::Size imageSize = GET_PARAM(1);
+    const bool roiFlag = GET_PARAM(2);
+    cv::gpu::setDevice(device.deviceID());
+    // Detector settings: rho 1, theta 1.5 degrees expressed in radians, threshold 100.
+    const float rhoStep = 1.0f;
+    const float thetaStep = static_cast<float>(1.5 * CV_PI / 180.0);
+    const int voteThreshold = 100;
+    // Synthetic input: the four lines drawn by generateLines().
+    cv::Mat input(imageSize, CV_8UC1);
+    generateLines(input);
+
+    cv::gpu::GpuMat d_lines;
+    cv::gpu::HoughLines(loadMat(input, roiFlag), d_lines, rhoStep, thetaStep, voteThreshold);
+
+    std::vector<cv::Vec2f> detected;
+    cv::gpu::HoughLinesDownload(d_lines, detected);
+    // Redrawing the detected lines must reproduce the input image exactly.
+    cv::Mat redrawn(imageSize, CV_8UC1);
+    drawLines(redrawn, detected);
+
+    ASSERT_MAT_NEAR(input, redrawn, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// HoughCircles
+
+PARAM_TEST_CASE(HoughCircles, cv::gpu::DeviceInfo, cv::Size, UseRoi)
+{
+    static void drawCircles(cv::Mat& dst, const std::vector<cv::Vec3f>& circles, bool fill) // clears dst, draws each (x, y, radius) in white
+    {
+        const int thickness = fill ? -1 : 1; // -1 is passed to cv::circle when `fill` is set, 1 otherwise
+        dst.setTo(cv::Scalar::all(0));
+        for (size_t k = 0; k < circles.size(); ++k)
+            cv::circle(dst, cv::Point2f(circles[k][0], circles[k][1]), static_cast<int>(circles[k][2]), cv::Scalar::all(255), thickness);
+    }
+};
+
+GPU_TEST_P(HoughCircles, Accuracy)
+{
+    // Parameters are unpacked here because the fixture defines no SetUp().
+    const cv::gpu::DeviceInfo device = GET_PARAM(0);
+    const cv::Size imageSize = GET_PARAM(1);
+    const bool roiFlag = GET_PARAM(2);
+    cv::gpu::setDevice(device.deviceID());
+
+    const float dp = 2.0f;
+    const float minDist = 0.0f;
+    const int minRadius = 10;
+    const int maxRadius = 20;
+    const int cannyThreshold = 100;
+    const int votesThreshold = 20;
+
+    // Four reference circles (x, y, radius) with radii from minRadius to maxRadius.
+    std::vector<cv::Vec3f> expected(4);
+    expected[0] = cv::Vec3i(20, 20, minRadius);
+    expected[1] = cv::Vec3i(90, 87, minRadius + 3);
+    expected[2] = cv::Vec3i(30, 70, minRadius + 8);
+    expected[3] = cv::Vec3i(80, 10, maxRadius);
+
+    cv::Mat input(imageSize, CV_8UC1);
+    drawCircles(input, expected, true);
+
+    cv::gpu::GpuMat d_circles;
+    cv::gpu::HoughCircles(loadMat(input, roiFlag), d_circles, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
+
+    std::vector<cv::Vec3f> detected;
+    cv::gpu::HoughCirclesDownload(d_circles, detected);
+
+    ASSERT_FALSE(detected.empty());
+    // Every detection must be closer than 5 to some reference circle in x, y and radius.
+    // The loop does not require that each reference circle was detected.
+    for (size_t i = 0; i < detected.size(); ++i)
+    {
+        const cv::Vec3f& cur = detected[i];
+        bool matched = false;
+
+        for (size_t j = 0; j < expected.size() && !matched; ++j)
+        {
+            const cv::Vec3f& ref = expected[j];
+
+            matched = std::fabs(cur[0] - ref[0]) < 5
+                   && std::fabs(cur[1] - ref[1]) < 5
+                   && std::fabs(cur[2] - ref[2]) < 5;
+        }
+
+        ASSERT_TRUE(matched);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughCircles, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// GeneralizedHough
+
+PARAM_TEST_CASE(GeneralizedHough, cv::gpu::DeviceInfo, UseRoi) // fixture parameters: device, ROI flag
+{
+}; // no members or SetUp(): the POSITION test reads its parameters directly with GET_PARAM
+
+GPU_TEST_P(GeneralizedHough, POSITION)
+{
+    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+    const bool useRoi = GET_PARAM(1);
+
+    cv::Mat templ = readImage("../cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(templ.empty());
+
+    cv::Point templCenter(templ.cols / 2, templ.rows / 2);
+    // Centres at which copies of the template are pasted into the test image.
+    const size_t gold_count = 3;
+    cv::Point pos_gold[gold_count];
+    pos_gold[0] = cv::Point(templCenter.x + 10, templCenter.y + 10);
+    pos_gold[1] = cv::Point(2 * templCenter.x + 40, templCenter.y + 10);
+    pos_gold[2] = cv::Point(2 * templCenter.x + 40, 2 * templCenter.y + 40);
+    // Black canvas three times the template size, with one template copy per gold position.
+    cv::Mat image(templ.rows * 3, templ.cols * 3, CV_8UC1, cv::Scalar::all(0));
+    for (size_t i = 0; i < gold_count; ++i)
+    {
+        cv::Rect rec(pos_gold[i].x - templCenter.x, pos_gold[i].y - templCenter.y, templ.cols, templ.rows);
+        cv::Mat imageROI = image(rec);
+        templ.copyTo(imageROI);
+    }
+
+    cv::Ptr<cv::gpu::GeneralizedHough_GPU> hough = cv::gpu::GeneralizedHough_GPU::create(cv::GeneralizedHough::GHT_POSITION);
+    hough->set("votesThreshold", 200);
+
+    hough->setTemplate(loadMat(templ, useRoi));
+
+    cv::gpu::GpuMat d_pos;
+    hough->detect(loadMat(image, useRoi), d_pos);
+
+    std::vector<cv::Vec4f> pos;
+    hough->download(d_pos, pos);
+    // Exactly one detection per pasted copy is expected.
+    ASSERT_EQ(gold_count, pos.size());
+    // Every gold centre must have a detection closer than 2 in both x and y.
+    for (size_t i = 0; i < gold_count; ++i)
+    {
+        cv::Point gold = pos_gold[i];
+
+        bool found = false;
+
+        for (size_t j = 0; j < pos.size(); ++j)
+        {
+            cv::Point2f p(pos[j][0], pos[j][1]);
+            // std::fabs rather than ::fabs: <cmath> only guarantees the std:: overloads, and the HoughCircles test above uses std::fabs too.
+            if (std::fabs(p.x - gold.x) < 2 && std::fabs(p.y - gold.y) < 2)
+            {
+                found = true;
+                break;
+            }
+        }
+
+        ASSERT_TRUE(found);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, GeneralizedHough, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_imgproc.cpp b/modules/gpuimgproc/test/test_imgproc.cpp
new file mode 100644
index 0000000000..6957f54375
--- /dev/null
+++ b/modules/gpuimgproc/test/test_imgproc.cpp
@@ -0,0 +1,843 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// HistEven
+
+struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo> // fixture parameterized only by the device
+{
+    cv::gpu::DeviceInfo devInfo;
+    // Stores the device parameter and selects that device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(HistEven, Accuracy)
+{
+    cv::Mat bgr = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(bgr.empty());
+
+    cv::Mat hsv;
+    cv::cvtColor(bgr, hsv, cv::COLOR_BGR2HSV);
+    // Histogram layout shared by the GPU and CPU computations: 30 bins over the range 0..180.
+    const int binCount = 30;
+    float valueRange[] = {0.0f, 180.0f};
+
+    // GPU histogram of channel 0 of the HSV image.
+    std::vector<cv::gpu::GpuMat> d_planes;
+    cv::gpu::split(loadMat(hsv), d_planes);
+
+    cv::gpu::GpuMat d_hist;
+    cv::gpu::histEven(d_planes[0], d_hist, binCount, static_cast<int>(valueRange[0]), static_cast<int>(valueRange[1]));
+    // CPU histogram of the same channel with the same bin count and range.
+    cv::MatND histnd;
+    int histSize[] = {binCount};
+    const float* ranges[] = {valueRange};
+    int channels[] = {0};
+    cv::calcHist(&hsv, 1, channels, cv::Mat(), histnd, 1, histSize, ranges);
+    // Transpose and convert to 32-bit integers before comparing with the GPU histogram.
+    cv::Mat hist_gold = histnd.t();
+    hist_gold.convertTo(hist_gold, CV_32S);
+
+    EXPECT_MAT_NEAR(hist_gold, d_hist, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HistEven, ALL_DEVICES); // ALL_DEVICES is defined outside this chunk
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// CalcHist
+
+namespace
+{
+    void calcHistGold(const cv::Mat& src, cv::Mat& hist) // CPU reference: counts occurrences of each byte value of src
+    {
+        hist.create(1, 256, CV_32SC1);
+        hist.setTo(cv::Scalar::all(0));
+        int* const bins = hist.ptr<int>();
+        // Walk each row through a pointer; reads src.cols bytes per row, so src is treated as single-channel 8-bit.
+        for (int row = 0; row < src.rows; ++row)
+        {
+            const uchar* pixel = src.ptr(row);
+            const uchar* const rowEnd = pixel + src.cols;
+            for (; pixel != rowEnd; ++pixel)
+                ++bins[*pixel];
+        }
+    }
+}
+
+PARAM_TEST_CASE(CalcHist, cv::gpu::DeviceInfo, cv::Size) // fixture parameters: device, image size
+{
+    cv::gpu::DeviceInfo devInfo;
+
+    cv::Size size; // size of the random image generated by the Accuracy test
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(CalcHist, Accuracy)
+{
+    cv::Mat input = randomMat(size, CV_8UC1);
+    // CPU reference histogram of the same random 8-bit image.
+    cv::Mat expected;
+    calcHistGold(input, expected);
+
+    cv::gpu::GpuMat d_hist;
+    cv::gpu::calcHist(loadMat(input), d_hist);
+
+    EXPECT_MAT_NEAR(expected, d_hist, 0.0); // bin counts must match exactly
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CalcHist, testing::Combine(
+    ALL_DEVICES,        // generator macros defined outside this chunk
+    DIFFERENT_SIZES));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// EqualizeHist
+
+PARAM_TEST_CASE(EqualizeHist, cv::gpu::DeviceInfo, cv::Size) // fixture parameters: device, image size
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size; // size of the random image generated by the Accuracy test
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(EqualizeHist, Accuracy)
+{
+    cv::Mat input = randomMat(size, CV_8UC1);
+    // CPU result used as the reference.
+    cv::Mat expected;
+    cv::equalizeHist(input, expected);
+
+    cv::gpu::GpuMat d_equalized;
+    cv::gpu::equalizeHist(loadMat(input), d_equalized);
+
+    EXPECT_MAT_NEAR(expected, d_equalized, 3.0); // a difference of up to 3 is tolerated
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, EqualizeHist, testing::Combine(
+    ALL_DEVICES,        // generator macros defined outside this chunk
+    DIFFERENT_SIZES));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// CLAHE
+
+namespace // file-local parameter type for the CLAHE tests
+{
+    IMPLEMENT_PARAM_CLASS(ClipLimit, double) // project macro defined outside this chunk; presumably wraps a double as a named test parameter
+}
+
+PARAM_TEST_CASE(CLAHE, cv::gpu::DeviceInfo, cv::Size, ClipLimit) // fixture parameters: device, image size, clip limit
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    double clipLimit; // passed to both createCLAHE() calls in the Accuracy test
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        clipLimit = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(CLAHE, Accuracy)
+{
+    cv::Mat input = randomMat(size, CV_8UC1);
+    // CPU reference built with the same clip limit as the GPU implementation.
+    cv::Mat expected;
+    cv::Ptr<cv::CLAHE> cpuClahe = cv::createCLAHE(clipLimit);
+    cpuClahe->apply(input, expected);
+
+    cv::gpu::GpuMat d_result;
+    cv::Ptr<cv::gpu::CLAHE> gpuClahe = cv::gpu::createCLAHE(clipLimit);
+    gpuClahe->apply(loadMat(input), d_result);
+
+    ASSERT_MAT_NEAR(expected, d_result, 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CLAHE, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    DIFFERENT_SIZES,
+    testing::Values(0.0, 40.0))); // the two clip limits exercised
+
+////////////////////////////////////////////////////////
+// Canny
+
+namespace // file-local parameter types for the Canny tests
+{
+    IMPLEMENT_PARAM_CLASS(AppertureSize, int); // spelling (sic) is part of the identifier used by the fixture and its instantiation
+    IMPLEMENT_PARAM_CLASS(L2gradient, bool);   // project macro defined outside this chunk; presumably wraps the value as a named test parameter
+}
+
+PARAM_TEST_CASE(Canny, cv::gpu::DeviceInfo, AppertureSize, L2gradient, UseRoi) // fixture parameters: device, aperture size, L2 flag, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    int apperture_size; // forwarded unchanged to both cv::gpu::Canny and cv::Canny
+    bool useL2gradient; // forwarded unchanged to both cv::gpu::Canny and cv::Canny
+    bool useRoi;
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        apperture_size = GET_PARAM(1);
+        useL2gradient = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Canny, Accuracy)
+{
+    cv::Mat gray = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(gray.empty());
+
+    const double lowThreshold = 50.0;
+    const double highThreshold = 100.0;
+
+    if (supportFeature(devInfo, cv::gpu::SHARED_ATOMICS))
+    {
+        // Device supports shared atomics: GPU edges must be similar to the CPU edges.
+        cv::gpu::GpuMat d_edges;
+        cv::gpu::Canny(loadMat(gray, useRoi), d_edges, lowThreshold, highThreshold, apperture_size, useL2gradient);
+        cv::Mat edges_gold;
+        cv::Canny(gray, edges_gold, lowThreshold, highThreshold, apperture_size, useL2gradient);
+        EXPECT_MAT_SIMILAR(edges_gold, d_edges, 2e-2);
+    }
+    else
+    {
+        // No shared atomics: if the call throws, the error code must be StsNotImplemented.
+        try
+        {
+            cv::gpu::GpuMat d_edges;
+            cv::gpu::Canny(loadMat(gray), d_edges, lowThreshold, highThreshold, apperture_size, useL2gradient);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
+    ALL_DEVICES, // generator macro defined outside this chunk
+    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(L2gradient(false), L2gradient(true)),
+    WHOLE_SUBMAT));
+
+////////////////////////////////////////////////////////////////////////////////
+// MeanShift
+
+struct MeanShift : testing::TestWithParam<cv::gpu::DeviceInfo> // fixture shared by the Filtering and Proc tests
+{
+    cv::gpu::DeviceInfo devInfo;
+
+    cv::Mat img; // input image, loaded as CV_8UC4 in SetUp()
+
+    int spatialRad;
+    int colorRad;
+    // Selects the device, loads the input image and fixes both radii to 30.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        img = readImageType("meanshift/cones.png", CV_8UC4);
+        ASSERT_FALSE(img.empty());
+
+        spatialRad = 30;
+        colorRad = 30;
+    }
+};
+
+GPU_TEST_P(MeanShift, Filtering)
+{
+    // The stored reference image depends on whether the device supports FEATURE_SET_COMPUTE_20.
+    const bool hasCompute20 = supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20);
+    const char* referencePath = hasCompute20 ? "meanshift/con_result.png" : "meanshift/con_result_CC1X.png";
+
+    cv::Mat expected = readImage(referencePath);
+    ASSERT_FALSE(expected.empty());
+
+    cv::gpu::GpuMat d_filtered;
+    cv::gpu::meanShiftFiltering(loadMat(img), d_filtered, spatialRad, colorRad);
+
+    ASSERT_EQ(CV_8UC4, d_filtered.type());
+
+    // Download the 4-channel result and convert it to 3 channels before comparing with the reference.
+    cv::Mat filtered(d_filtered);
+    cv::Mat filtered_bgr;
+    cv::cvtColor(filtered, filtered_bgr, cv::COLOR_BGRA2BGR);
+
+    EXPECT_MAT_NEAR(expected, filtered_bgr, 0.0);
+}
+
+GPU_TEST_P(MeanShift, Proc)
+{
+    // Pick the stored reference map that matches the device's feature set.
+    const char* mapFile = supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20)
+        ? "meanshift/spmap.yaml" : "meanshift/spmap_CC1X.yaml";
+    cv::FileStorage fs;
+    fs.open(std::string(cvtest::TS::ptr()->get_data_path()) + mapFile, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    cv::Mat spmap_gold;
+    fs["spmap"] >> spmap_gold;
+    ASSERT_FALSE(spmap_gold.empty());
+    // The first output of meanShiftProc must equal the output of meanShiftFiltering on the same input.
+    cv::gpu::GpuMat d_filtered;
+    cv::gpu::meanShiftFiltering(loadMat(img), d_filtered, spatialRad, colorRad);
+
+    cv::gpu::GpuMat d_rmap;
+    cv::gpu::GpuMat d_spmap;
+    cv::gpu::meanShiftProc(loadMat(img), d_rmap, d_spmap, spatialRad, colorRad);
+
+    ASSERT_EQ(CV_8UC4, d_rmap.type());
+
+    EXPECT_MAT_NEAR(d_filtered, d_rmap, 0.0);
+    EXPECT_MAT_NEAR(spmap_gold, d_spmap, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MeanShift, ALL_DEVICES); // ALL_DEVICES is defined outside this chunk
+
+////////////////////////////////////////////////////////////////////////////////
+// MeanShiftSegmentation
+
+namespace // file-local parameter type for the MeanShiftSegmentation tests
+{
+    IMPLEMENT_PARAM_CLASS(MinSize, int); // project macro defined outside this chunk; presumably wraps an int as a named test parameter
+}
+
+PARAM_TEST_CASE(MeanShiftSegmentation, cv::gpu::DeviceInfo, MinSize) // fixture parameters: device, minimum size
+{
+    cv::gpu::DeviceInfo devInfo;
+    int minsize; // used by the Regression test both as a call argument and in the reference file name
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        minsize = GET_PARAM(1);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MeanShiftSegmentation, Regression)
+{
+    cv::Mat input = readImageType("meanshift/cones.png", CV_8UC4);
+    ASSERT_FALSE(input.empty());
+
+    // The reference file name encodes minsize and, without FEATURE_SET_COMPUTE_20, a _CC1X suffix.
+    const bool hasCompute20 = supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20);
+    std::ostringstream goldPath;
+    goldPath << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize << (hasCompute20 ? ".png" : "_CC1X.png");
+
+    cv::Mat expected = readImage(goldPath.str());
+    ASSERT_FALSE(expected.empty());
+
+    cv::Mat segmented;
+    cv::gpu::meanShiftSegmentation(loadMat(input), segmented, 10, 10, minsize); // the two 10s match sp10/sr10 in the file name
+
+    // Convert the result from 4 to 3 channels before comparing with the stored image.
+    cv::Mat segmented_bgr;
+    cv::cvtColor(segmented, segmented_bgr, cv::COLOR_BGRA2BGR);
+
+    EXPECT_MAT_SIMILAR(expected, segmented_bgr, 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MeanShiftSegmentation, testing::Combine(
+    ALL_DEVICES, // generator macro defined outside this chunk
+    testing::Values(MinSize(0), MinSize(4), MinSize(20), MinSize(84), MinSize(340), MinSize(1364))));
+
+////////////////////////////////////////////////////////////////////////////
+// Blend
+
+namespace
+{
+    template <typename T>
+    void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
+    {
+        // CPU reference: per-pixel weighted average (img1*w1 + img2*w2) / (w1 + w2 + 1e-5f) with element type T.
+        const int channels = img1.channels();
+        result_gold.create(img1.size(), img1.type());
+        for (int row = 0; row < img1.rows; ++row)
+        {
+            const float* w1_row = weights1.ptr<float>(row);
+            const float* w2_row = weights2.ptr<float>(row);
+            const T* a_row = img1.ptr<T>(row);
+            const T* b_row = img2.ptr<T>(row);
+            T* out_row = result_gold.ptr<T>(row);
+            // One weight per pixel, applied to every channel of that pixel.
+            for (int col = 0; col < img1.cols; ++col)
+            {
+                const float wa = w1_row[col];
+                const float wb = w2_row[col];
+                for (int c = col * channels; c < (col + 1) * channels; ++c)
+                    out_row[c] = static_cast<T>((a_row[c] * wa + b_row[c] * wb) / (wa + wb + 1e-5f));
+            }
+        }
+    }
+}
+
+PARAM_TEST_CASE(Blend, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) // fixture parameters: device, size, matrix type, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type; // the Accuracy test uses its depth to choose value range, reference element type and tolerance
+    bool useRoi;
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Blend, Accuracy)
+{
+    const bool is8u = CV_MAT_DEPTH(type) == CV_8U;
+    const double maxVal = is8u ? 255.0 : 1.0; // upper bound passed to randomMat for the images
+    // Two random images and two single-channel float weight maps, generated in this order.
+    cv::Mat img1 = randomMat(size, type, 0.0, maxVal);
+    cv::Mat img2 = randomMat(size, type, 0.0, maxVal);
+    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
+    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
+    cv::gpu::GpuMat d_blended;
+    cv::gpu::blendLinear(loadMat(img1, useRoi), loadMat(img2, useRoi), loadMat(weights1, useRoi), loadMat(weights2, useRoi), d_blended);
+    // CPU reference, instantiated for the element type of the images.
+    cv::Mat blended_gold;
+    if (is8u)
+        blendLinearGold<uchar>(img1, img2, weights1, weights2, blended_gold);
+    else
+        blendLinearGold<float>(img1, img2, weights1, weights2, blended_gold);
+
+    EXPECT_MAT_NEAR(blended_gold, d_blended, is8u ? 1.0 : 1e-5);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate8U
+
+CV_ENUM(TemplateMethod, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED) // parameter type for the matching method; the 8U instantiation uses TemplateMethod::all()
+
+namespace // file-local parameter type for the MatchTemplate tests
+{
+    IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size); // project macro defined outside this chunk; presumably wraps a cv::Size as a named test parameter
+}
+
+PARAM_TEST_CASE(MatchTemplate8U, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod) // device, image size, template size, channels, method
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    cv::Size templ_size;
+    int cn;     // channel count used to build the CV_8U matrix type in the test
+    int method; // forwarded to both the GPU and the CPU matchTemplate
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        templ_size = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        method = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MatchTemplate8U, Accuracy)
+{
+    const int matType = CV_MAKETYPE(CV_8U, cn);
+    cv::Mat image = randomMat(size, matType);
+    cv::Mat templ = randomMat(templ_size, matType);
+    // CPU reference; the allowed error below scales with the template area.
+    cv::Mat response_gold;
+    cv::matchTemplate(image, templ, response_gold, method);
+    cv::gpu::GpuMat d_response;
+    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), d_response, method);
+
+    EXPECT_MAT_NEAR(response_gold, d_response, templ_size.area() * 1e-1);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, testing::Combine(
+    ALL_DEVICES,      // generator macros defined outside this chunk
+    DIFFERENT_SIZES,
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
+    testing::Values(Channels(1), Channels(3), Channels(4)),
+    TemplateMethod::all())); // every method listed in the TemplateMethod enum
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate32F
+
+PARAM_TEST_CASE(MatchTemplate32F, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod) // device, image size, template size, channels, method
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    cv::Size templ_size;
+    int cn;     // channel count used to build the CV_32F matrix type in the test
+    int method; // forwarded to both the GPU and the CPU matchTemplate
+
+    int n, m, h, w; // NOTE(review): never assigned in SetUp() nor read by the Regression test below - look unused; confirm against the rest of the file
+    // Copies the test parameters into the members and selects the device for the test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        templ_size = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        method = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MatchTemplate32F, Regression)
+{
+    const int matType = CV_MAKETYPE(CV_32F, cn);
+    const cv::Mat scene = randomMat(size, matType);
+    const cv::Mat patch = randomMat(templ_size, matType);
+    // Same inputs through the GPU implementation and the CPU reference.
+    cv::gpu::GpuMat d_scores;
+    cv::gpu::matchTemplate(loadMat(scene), loadMat(patch), d_scores, method);
+    cv::Mat h_scores;
+    cv::matchTemplate(scene, patch, h_scores, method);
+    // The allowed deviation scales with the template area.
+    EXPECT_MAT_NEAR(h_scores, d_scores, templ_size.area() * 1e-1);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
+    testing::Values(Channels(1), Channels(3), Channels(4)),
+    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplateBlackSource
+
+PARAM_TEST_CASE(MatchTemplateBlackSource, cv::gpu::DeviceInfo, TemplateMethod)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    int method; // comparison method forwarded to matchTemplate
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        method = GET_PARAM(1);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MatchTemplateBlackSource, Accuracy)
+{
+    const cv::Mat scene = readImage("matchtemplate/black.png");
+    ASSERT_FALSE(scene.empty());
+    const cv::Mat cat = readImage("matchtemplate/cat.png");
+    ASSERT_FALSE(cat.empty());
+
+    cv::gpu::GpuMat d_response;
+    cv::gpu::matchTemplate(loadMat(scene), loadMat(cat), d_response, method);
+    const cv::Mat response(d_response);
+
+    // Only the position of the largest response is checked.
+    double peak;
+    cv::Point peakLoc;
+    cv::minMaxLoc(response, NULL, &peak, NULL, &peakLoc);
+
+    // Hard-coded expected peak position for this image pair.
+    const cv::Point expectedLoc(284, 12);
+
+    ASSERT_EQ(expectedLoc, peakLoc);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplateBlackSource, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(TemplateMethod(cv::TM_CCOEFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED))));
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate_CCOEF_NORMED
+
+PARAM_TEST_CASE(MatchTemplate_CCOEF_NORMED, cv::gpu::DeviceInfo, std::pair<std::string, std::string>)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    std::string imageName; // path of the searched image (first element of the pair)
+    std::string patternName; // path of the template image (second element of the pair)
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        imageName = GET_PARAM(1).first;
+        patternName = GET_PARAM(1).second;
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
+{
+    const cv::Mat scene = readImage(imageName);
+    ASSERT_FALSE(scene.empty());
+    const cv::Mat target = readImage(patternName);
+    ASSERT_FALSE(target.empty());
+
+    // TM_CCOEFF_NORMED on the device ...
+    cv::gpu::GpuMat d_response;
+    cv::gpu::matchTemplate(loadMat(scene), loadMat(target), d_response, cv::TM_CCOEFF_NORMED);
+    const cv::Mat response(d_response);
+    // ... and on the host as the reference.
+    cv::Mat reference;
+    cv::matchTemplate(scene, target, reference, cv::TM_CCOEFF_NORMED);
+
+    double lowest, highest;
+    cv::Point lowestAt, highestAt;
+    cv::minMaxLoc(response, &lowest, &highest, &lowestAt, &highestAt);
+
+    double refLowest, refHighest;
+    cv::Point refLowestAt, refHighestAt;
+    cv::minMaxLoc(reference, &refLowest, &refHighest, &refLowestAt, &refHighestAt);
+
+    // The extrema must be at the same positions and the GPU scores must stay inside [-1, 1].
+    ASSERT_EQ(refLowestAt, lowestAt);
+    ASSERT_EQ(refHighestAt, highestAt);
+    ASSERT_LE(highest, 1.0);
+    ASSERT_GE(lowest, -1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CCOEF_NORMED, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::make_pair(std::string("matchtemplate/source-0.png"), std::string("matchtemplate/target-0.png")))));
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate_CanFindBigTemplate
+
+struct MatchTemplate_CanFindBigTemplate : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on (the only test parameter)
+
+    virtual void SetUp() // stores the parameter and selects that device
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF_NORMED)
+{
+    const cv::Mat scene = readImage("matchtemplate/scene.png");
+    ASSERT_FALSE(scene.empty());
+    const cv::Mat templ = readImage("matchtemplate/template.png");
+    ASSERT_FALSE(templ.empty());
+
+    cv::gpu::GpuMat d_scores;
+    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_scores, cv::TM_SQDIFF_NORMED);
+    const cv::Mat scores(d_scores);
+
+    // The test inspects the minimum of the score map and where it occurs.
+    double best;
+    cv::Point bestAt;
+    cv::minMaxLoc(scores, &best, 0, &bestAt, 0);
+
+    // Expect a non-negative, near-zero minimum located at (344, 0).
+    ASSERT_GE(best, 0);
+    ASSERT_LT(best, 1e-3);
+    ASSERT_EQ(344, bestAt.x);
+    ASSERT_EQ(0, bestAt.y);
+}
+
+GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
+{
+    const cv::Mat scene = readImage("matchtemplate/scene.png");
+    ASSERT_FALSE(scene.empty());
+    const cv::Mat templ = readImage("matchtemplate/template.png");
+    ASSERT_FALSE(templ.empty());
+
+    cv::gpu::GpuMat d_scores;
+    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_scores, cv::TM_SQDIFF);
+    const cv::Mat scores(d_scores);
+
+    // The test inspects the minimum of the score map and where it occurs.
+    double best;
+    cv::Point bestAt;
+    cv::minMaxLoc(scores, &best, 0, &bestAt, 0);
+
+    // Expect a non-negative minimum located at (344, 0).
+    ASSERT_GE(best, 0);
+    ASSERT_EQ(344, bestAt.x);
+    ASSERT_EQ(0, bestAt.y);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CanFindBigTemplate, ALL_DEVICES);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// CornerHarris
+
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(BlockSize, int); // test parameter: blockSize argument of the corner detectors
+    IMPLEMENT_PARAM_CLASS(ApertureSize, int); // test parameter: apertureSize argument of the corner detectors
+}
+
+PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, BorderType, BlockSize, ApertureSize)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    int type; // matrix type the source image is read as
+    int borderType; // border mode passed to cornerHarris
+    int blockSize; // blockSize argument of cornerHarris
+    int apertureSize; // apertureSize argument of cornerHarris
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        borderType = GET_PARAM(2);
+        blockSize = GET_PARAM(3);
+        apertureSize = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(CornerHarris, Accuracy)
+{
+    const cv::Mat input = readImageType("stereobm/aloe-L.png", type);
+    ASSERT_FALSE(input.empty());
+
+    // The k argument is drawn once via randomDouble(0.1, 0.9) and shared by both runs.
+    const double k = randomDouble(0.1, 0.9);
+
+    cv::gpu::GpuMat d_response;
+    cv::gpu::cornerHarris(loadMat(input), d_response, blockSize, apertureSize, k, borderType);
+    cv::Mat h_response;
+    cv::cornerHarris(input, h_response, blockSize, apertureSize, k, borderType);
+
+    EXPECT_MAT_NEAR(h_response, d_response, 0.02);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerHarris, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
+    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
+    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// cornerMinEigen
+
+PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, BorderType, BlockSize, ApertureSize)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    int type; // matrix type the source image is read as
+    int borderType; // border mode passed to cornerMinEigenVal
+    int blockSize; // blockSize argument of cornerMinEigenVal
+    int apertureSize; // apertureSize argument of cornerMinEigenVal
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        borderType = GET_PARAM(2);
+        blockSize = GET_PARAM(3);
+        apertureSize = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(CornerMinEigen, Accuracy)
+{
+    const cv::Mat input = readImageType("stereobm/aloe-L.png", type);
+    ASSERT_FALSE(input.empty());
+
+    // Identical block, aperture and border settings on the device and on the host.
+    cv::gpu::GpuMat d_response;
+    cv::gpu::cornerMinEigenVal(loadMat(input), d_response, blockSize, apertureSize, borderType);
+    cv::Mat h_response;
+    cv::cornerMinEigenVal(input, h_response, blockSize, apertureSize, borderType);
+
+    EXPECT_MAT_NEAR(h_response, d_response, 0.02);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
+    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
+    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_labeling.cpp b/modules/gpuimgproc/test/test_labeling.cpp
new file mode 100644
index 0000000000..4a1927c392
--- /dev/null
+++ b/modules/gpuimgproc/test/test_labeling.cpp
@@ -0,0 +1,197 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    // Host-side flood-fill labeling; the test compares the GPU labeling against it.
+    struct GreedyLabeling
+    {
+        struct dot
+        {
+            int x;
+            int y;
+
+            static dot make(int i, int j)
+            {
+                dot d; d.x = i; d.y = j;
+                return d;
+            }
+        };
+
+        // Neighbour test: true when -_lo <= a - b <= _hi (not symmetric in a and b).
+        struct InInterval
+        {
+            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
+            const int lo, hi;
+
+            bool operator() (const unsigned char a, const unsigned char b) const
+            {
+                int d = a - b;
+                return lo <= d && d <= hi;
+            }
+        };
+
+        GreedyLabeling(cv::Mat img)
+        : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {}
+
+        // Flood-fills every entry of `labels` that is still -1 with a component index (0, 1, ...).
+        // Pixels are marked when pushed, so each is pushed once and the stack cannot overflow or underflow.
+        void operator() (cv::Mat labels) const
+        {
+            InInterval inInt(0, 2);
+            dot* stack = new dot[image.cols * image.rows];
+            int cc = -1;
+            int* dist_labels = (int*)labels.data;
+            int pitch = (int) labels.step1();
+            unsigned char* source = (unsigned char*)image.data;
+            int width = image.cols;
+            int height = image.rows;
+            int step1 = (int)image.step1();
+
+            for (int j = 0; j < image.rows; ++j)
+                for (int i = 0; i < image.cols; ++i)
+                {
+                    if (dist_labels[j * pitch + i] != -1) continue;
+                    dot* top = stack;
+                    *top++ = dot::make(i, j);
+                    dist_labels[j * pitch + i] = ++cc;
+
+                    while (top > stack)
+                    {
+                        const dot p = *--top;
+                        int*  dl = &dist_labels[p.y * pitch + p.x];
+                        unsigned char* sp = &source[p.y * step1 + p.x];
+
+                        if (p.x < (width - 1) && dl[+1] == -1 && inInt(sp[0], sp[+1])) // right
+                        {
+                            dl[+1] = cc;
+                            *top++ = dot::make(p.x + 1, p.y);
+                        }
+                        if (p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1])) // left
+                        {
+                            dl[-1] = cc;
+                            *top++ = dot::make(p.x - 1, p.y);
+                        }
+                        if (p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+step1])) // bottom
+                        {
+                            dl[+pitch] = cc;
+                            *top++ = dot::make(p.x, p.y + 1);
+                        }
+                        if (p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-step1])) // top
+                        {
+                            dl[-pitch] = cc;
+                            *top++ = dot::make(p.x, p.y - 1);
+                        }
+                    }
+                }
+            delete[] stack;
+        }
+
+        void checkCorrectness(cv::Mat gpu)
+        {
+            cv::Mat diff = gpu - _labels;
+            // NOTE(review): the first comparison mixes a host label with a GPU label; `_labels.at<int>(j, i + 1)` may have been intended - confirm.
+            int outliers = 0;
+            for (int j = 0; j < image.rows; ++j)
+                for (int i = 0; i < image.cols - 1; ++i)
+                {
+                    if ( (_labels.at<int>(j,i) == gpu.at<int>(j,i + 1)) && (diff.at<int>(j, i) != diff.at<int>(j,i + 1)))
+                    {
+                        outliers++;
+                    }
+                }
+            ASSERT_TRUE(outliers < gpu.cols + gpu.rows);
+        }
+
+        cv::Mat image;
+        cv::Mat _labels;
+    };
+}
+
+struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on (the only test parameter)
+
+    virtual void SetUp() // stores the parameter and selects that device
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+
+    cv::Mat loat_image() // reads labeling/label.png relative to the test data path
+    {
+        return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/label.png");
+    }
+};
+
+GPU_TEST_P(Labeling, DISABLED_ConnectedComponents)
+{
+    // Stop with a clear failure when the test image is missing, before cvtColor sees an empty Mat.
+    cv::Mat color = loat_image();
+    ASSERT_FALSE(color.empty());
+
+    cv::Mat image;
+    cvtColor(color, image, cv::COLOR_BGR2GRAY);
+    cv::threshold(image, image, 150, 255, cv::THRESH_BINARY);
+    ASSERT_TRUE(image.type() == CV_8UC1);
+
+    // Host reference labeling, written into host._labels.
+    GreedyLabeling host(image);
+    host(host._labels);
+
+    cv::gpu::GpuMat mask;
+    mask.create(image.rows, image.cols, CV_8UC1);
+    cv::gpu::GpuMat components;
+    components.create(image.rows, image.cols, CV_32SC1);
+
+    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+    cv::gpu::labelComponents(mask, components);
+    host.checkCorrectness(cv::Mat(components));
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ConnectedComponents, Labeling, ALL_DEVICES);
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_main.cpp b/modules/gpuimgproc/test/test_main.cpp
new file mode 100644
index 0000000000..eea3d7c008
--- /dev/null
+++ b/modules/gpuimgproc/test/test_main.cpp
@@ -0,0 +1,45 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+CV_GPU_TEST_MAIN("gpu") // NOTE(review): "gpu" is presumably the test-data subdirectory that relative image paths resolve against - confirm
diff --git a/modules/gpuimgproc/test/test_precomp.cpp b/modules/gpuimgproc/test/test_precomp.cpp
new file mode 100644
index 0000000000..0fb6521809
--- /dev/null
+++ b/modules/gpuimgproc/test/test_precomp.cpp
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
diff --git a/modules/gpuimgproc/test/test_precomp.hpp b/modules/gpuimgproc/test/test_precomp.hpp
new file mode 100644
index 0000000000..a80f5e5f44
--- /dev/null
+++ b/modules/gpuimgproc/test/test_precomp.hpp
@@ -0,0 +1,63 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_TEST_PRECOMP_HPP__
+#define __OPENCV_TEST_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/gpu_test.hpp"
+
+#include "opencv2/gpuimgproc.hpp"
+#include "opencv2/gpuarithm.hpp"
+#include "opencv2/imgproc.hpp"
+
+#include "interpolation.hpp"
+
+#endif
diff --git a/modules/gpuimgproc/test/test_pyramids.cpp b/modules/gpuimgproc/test/test_pyramids.cpp
new file mode 100644
index 0000000000..6b0540fc10
--- /dev/null
+++ b/modules/gpuimgproc/test/test_pyramids.cpp
@@ -0,0 +1,129 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+////////////////////////////////////////////////////////
+// pyrDown
+
+PARAM_TEST_CASE(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    cv::Size size; // size of the source image
+    int type; // matrix type of source and destination
+    bool useRoi; // forwarded to createMat/loadMat for the GPU matrices
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(PyrDown, Accuracy)
+{
+    const cv::Mat input = randomMat(size, type);
+    // The destination is half the source size, rounded up.
+    const cv::Size halfSize((size.width + 1) / 2, (size.height + 1) / 2);
+    cv::gpu::GpuMat d_down = createMat(halfSize, type, useRoi);
+    cv::gpu::pyrDown(loadMat(input, useRoi), d_down);
+
+    cv::Mat h_down;
+    cv::pyrDown(input, h_down);
+    EXPECT_MAT_NEAR(h_down, d_down, input.depth() == CV_32F ? 1e-4 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+
+////////////////////////////////////////////////////////
+// pyrUp
+
+PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU device the test runs on
+    cv::Size size; // size of the source image
+    int type; // matrix type of source and destination
+    bool useRoi; // forwarded to createMat/loadMat for the GPU matrices
+
+    virtual void SetUp() // unpacks the test parameters and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(PyrUp, Accuracy)
+{
+    const cv::Mat input = randomMat(size, type);
+    // The destination is twice the source size in both dimensions.
+    const cv::Size doubleSize(size.width * 2, size.height * 2);
+    cv::gpu::GpuMat d_up = createMat(doubleSize, type, useRoi);
+    cv::gpu::pyrUp(loadMat(input, useRoi), d_up);
+
+    cv::Mat h_up;
+    cv::pyrUp(input, h_up);
+    EXPECT_MAT_NEAR(h_up, d_up, input.depth() == CV_32F ? 1e-4 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_remap.cpp b/modules/gpuimgproc/test/test_remap.cpp
new file mode 100644
index 0000000000..eb4b9ece85
--- /dev/null
+++ b/modules/gpuimgproc/test/test_remap.cpp
@@ -0,0 +1,180 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace  // file-local reference ("gold") implementation of remap used by the Remap test
+{
+    template <typename T, template <typename> class Interpolator> void remapImpl(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize = xmap.size();  // output has the same size as the coordinate maps
+
+        dst.create(dsize, src.type());
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)  // channels are interleaved: element index is x * cn + c
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ymap.at<float>(y, x), xmap.at<float>(y, x), c, borderType, borderVal);  // sample src at (ymap, xmap); maps are read as float
+            }
+        }
+    }
+
+    void remapGold(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+
+        static const func_t nearest_funcs[] =  // one entry per element type; indexed by src.depth() below
+        {
+            remapImpl<unsigned char, NearestInterpolator>,
+            remapImpl<signed char, NearestInterpolator>,
+            remapImpl<unsigned short, NearestInterpolator>,
+            remapImpl<short, NearestInterpolator>,
+            remapImpl<int, NearestInterpolator>,
+            remapImpl<float, NearestInterpolator>
+        };
+
+        static const func_t linear_funcs[] =  // same element-type order as nearest_funcs
+        {
+            remapImpl<unsigned char, LinearInterpolator>,
+            remapImpl<signed char, LinearInterpolator>,
+            remapImpl<unsigned short, LinearInterpolator>,
+            remapImpl<short, LinearInterpolator>,
+            remapImpl<int, LinearInterpolator>,
+            remapImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =  // same element-type order as nearest_funcs
+        {
+            remapImpl<unsigned char, CubicInterpolator>,
+            remapImpl<signed char, CubicInterpolator>,
+            remapImpl<unsigned short, CubicInterpolator>,
+            remapImpl<short, CubicInterpolator>,
+            remapImpl<int, CubicInterpolator>,
+            remapImpl<float, CubicInterpolator>
+        };
+
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};  // indexed by the interpolation value: 0, 1, 2
+
+        funcs[interpolation][src.depth()](src, xmap, ymap, dst, borderType, borderVal);  // NOTE(review): neither index is range-checked; interpolation > 2 or a depth past the sixth entry reads out of bounds
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+PARAM_TEST_CASE(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderType, UseRoi)  // fixture parameters: device, size, mat type, interpolation, border mode, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int interpolation;
+    int borderType;
+    bool useRoi;
+
+    cv::Mat xmap;  // CV_32FC1 source x coordinate for each output pixel, filled in SetUp
+    cv::Mat ymap;  // CV_32FC1 source y coordinate for each output pixel, filled in SetUp
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+        borderType = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        // Rotation by pi/4 plus a horizontal shift of half the current width.
+        // M must not be static: a static local would keep the first size's shift for every later size.
+        const double alpha = CV_PI / 4;
+        const double M[2][3] = { {std::cos(alpha), -std::sin(alpha), size.width / 2.0},
+                                 {std::sin(alpha),  std::cos(alpha), 0.0}};
+
+        xmap.create(size, CV_32FC1);
+        ymap.create(size, CV_32FC1);
+
+        for (int y = 0; y < size.height; ++y)
+        {
+            for (int x = 0; x < size.width; ++x)
+            {
+                xmap.at<float>(y, x) = static_cast<float>(M[0][0] * x + M[0][1] * y + M[0][2]);  // first row of M applied to (x, y, 1)
+                ymap.at<float>(y, x) = static_cast<float>(M[1][0] * x + M[1][1] * y + M[1][2]);  // second row of M applied to (x, y, 1)
+            }
+        }
+    }
+};
+
+GPU_TEST_P(Remap, Accuracy)  // checks cv::gpu::remap against the file-local remapGold reference
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Scalar val = randomScalar(0.0, 255.0);  // border value passed to both implementations
+
+    cv::gpu::GpuMat dst = createMat(xmap.size(), type, useRoi);
+    cv::gpu::remap(loadMat(src, useRoi), dst, loadMat(xmap, useRoi), loadMat(ymap, useRoi), interpolation, borderType, val);
+
+    cv::Mat dst_gold;
+    remapGold(src, xmap, ymap, dst_gold, interpolation, borderType, val);  // same maps and border settings as the GPU call
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);  // tolerance: 1e-3 for 32F data, 1.0 for every other depth
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(  // cross product of the six parameter axes below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),  // 8U and 32F only, each with 1, 3 and 4 channels
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_resize.cpp b/modules/gpuimgproc/test/test_resize.cpp
new file mode 100644
index 0000000000..593c891e6a
--- /dev/null
+++ b/modules/gpuimgproc/test/test_resize.cpp
@@ -0,0 +1,250 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace  // file-local reference ("gold") implementation of resize used by the Resize and ResizeNPP tests
+{
+    template <typename T, template <typename> class Interpolator>
+    void resizeImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * fx), cv::saturate_cast<int>(src.rows * fy));  // output size is the source size scaled by (fx, fy)
+
+        dst.create(dsize, src.type());
+
+        float ifx = static_cast<float>(1.0 / fx);  // inverse scale: maps an output x back to a source x
+        float ify = static_cast<float>(1.0 / fy);  // inverse scale: maps an output y back to a source y
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)  // channels are interleaved: element index is x * cn + c
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);  // always samples with BORDER_REPLICATE
+            }
+        }
+    }
+
+    void resizeGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy, int interpolation)
+    {
+        typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy);
+
+        static const func_t nearest_funcs[] =  // one entry per element type; indexed by src.depth() below
+        {
+            resizeImpl<unsigned char, NearestInterpolator>,
+            resizeImpl<signed char, NearestInterpolator>,
+            resizeImpl<unsigned short, NearestInterpolator>,
+            resizeImpl<short, NearestInterpolator>,
+            resizeImpl<int, NearestInterpolator>,
+            resizeImpl<float, NearestInterpolator>
+        };
+
+
+        static const func_t linear_funcs[] =  // same element-type order as nearest_funcs
+        {
+            resizeImpl<unsigned char, LinearInterpolator>,
+            resizeImpl<signed char, LinearInterpolator>,
+            resizeImpl<unsigned short, LinearInterpolator>,
+            resizeImpl<short, LinearInterpolator>,
+            resizeImpl<int, LinearInterpolator>,
+            resizeImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =  // same element-type order as nearest_funcs
+        {
+            resizeImpl<unsigned char, CubicInterpolator>,
+            resizeImpl<signed char, CubicInterpolator>,
+            resizeImpl<unsigned short, CubicInterpolator>,
+            resizeImpl<short, CubicInterpolator>,
+            resizeImpl<int, CubicInterpolator>,
+            resizeImpl<float, CubicInterpolator>
+        };
+
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};  // indexed by the interpolation value: 0, 1, 2
+
+        funcs[interpolation][src.depth()](src, dst, fx, fy);  // NOTE(review): neither index is range-checked; interpolation > 2 or a depth past the sixth entry reads out of bounds
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+PARAM_TEST_CASE(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)  // fixture parameters: device, size, mat type, scale factor, interpolation, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    double coeff;  // scale factor; the test body passes it for both fx and fy
+    int interpolation;
+    int type;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+GPU_TEST_P(Resize, Accuracy)  // checks cv::gpu::resize against the file-local resizeGold reference
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);  // same size computation as resizeImpl
+    cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation);  // empty cv::Size(): only the scale factors are supplied
+
+    cv::Mat dst_gold;
+    resizeGold(src, dst_gold, coeff, coeff, interpolation);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);  // tolerance: 1e-2 for 32F data, 1.0 for every other depth
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(  // cross product of the six parameter axes below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),  // CV_8UC1 and CV_8UC4 are not in this list; ResizeNPP covers those two types
+    testing::Values(0.3, 0.5, 1.5, 2.0),  // two downscaling and two upscaling factors
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    WHOLE_SUBMAT));
+
+/////////////////
+
+PARAM_TEST_CASE(ResizeSameAsHost, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)  // fixture parameters: device, size, mat type, scale factor, interpolation, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    double coeff;  // scale factor; the test body passes it for both fx and fy
+    int interpolation;
+    int type;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+// Downscaling only (used for classifiers): compares cv::gpu::resize with the host cv::resize rather than resizeGold.
+GPU_TEST_P(ResizeSameAsHost, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
+    cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation);  // empty cv::Size(): only the scale factors are supplied
+
+    cv::Mat dst_gold;
+    cv::resize(src, dst_gold, cv::Size(), coeff, coeff, interpolation);  // host reference with identical arguments
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);  // tolerance: 1e-2 for 32F data, 1.0 for every other depth
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(  // cross product of the six parameter axes below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5),  // both factors are below 1: downscaling only
+    testing::Values(Interpolation(cv::INTER_AREA), Interpolation(cv::INTER_NEAREST)),  //, Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+PARAM_TEST_CASE(ResizeNPP, cv::gpu::DeviceInfo, MatType, double, Interpolation)  // fixture parameters: device, mat type, scale factor, interpolation (no size or ROI axis)
+{
+    cv::gpu::DeviceInfo devInfo;
+    double coeff;  // scale factor; the test body passes it for both fx and fy
+    int interpolation;
+    int type;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        coeff = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+GPU_TEST_P(ResizeNPP, Accuracy)  // checks cv::gpu::resize on a real image against resizeGold; presumably these types hit the NPP-backed path (see section header)
+{
+    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());  // abort the test if the image could not be loaded
+
+    cv::gpu::GpuMat dst;  // left unallocated; resize receives only the scale factors
+    cv::gpu::resize(loadMat(src), dst, cv::Size(), coeff, coeff, interpolation);
+
+    cv::Mat dst_gold;
+    resizeGold(src, dst_gold, coeff, coeff, interpolation);
+
+    EXPECT_MAT_SIMILAR(dst_gold, dst, 1e-1);  // similarity check with threshold 1e-1 instead of the per-element EXPECT_MAT_NEAR
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeNPP, testing::Combine(  // cross product of the four parameter axes below
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),  // the two 8U types absent from the Resize instantiation
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));  // INTER_CUBIC is not exercised here
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_warp_affine.cpp b/modules/gpuimgproc/test/test_warp_affine.cpp
new file mode 100644
index 0000000000..43bf0f6d9e
--- /dev/null
+++ b/modules/gpuimgproc/test/test_warp_affine.cpp
@@ -0,0 +1,280 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+namespace  // file-local helper shared by the warpAffine tests in this file
+{
+    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)  // returns a 2x3 CV_64FC1 matrix: rotation by angle plus a horizontal shift
+    {
+        cv::Mat M(2, 3, CV_64FC1);
+
+        M.at<double>(0, 0) = std::cos(angle); M.at<double>(0, 1) = -std::sin(angle); M.at<double>(0, 2) = srcSize.width / 2;  // NOTE(review): width / 2 is integer division if width is an int, so odd widths truncate
+        M.at<double>(1, 0) = std::sin(angle); M.at<double>(1, 1) =  std::cos(angle); M.at<double>(1, 2) = 0.0;  // no vertical shift
+
+        return M;
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test buildWarpAffineMaps
+
+PARAM_TEST_CASE(BuildWarpAffineMaps, cv::gpu::DeviceInfo, cv::Size, Inverse)  // fixture parameters: device, map size, inverse-map flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    bool inverse;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+GPU_TEST_P(BuildWarpAffineMaps, Accuracy)  // remapping with GPU-built maps must equal cv::warpAffine with the same matrix
+{
+    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
+    cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);  // source size is random and independent of the map size
+
+    cv::gpu::GpuMat xmap, ymap;
+    cv::gpu::buildWarpAffineMaps(M, inverse, size, xmap, ymap);
+
+    int interpolation = cv::INTER_NEAREST;
+    int borderMode = cv::BORDER_CONSTANT;
+    int flags = interpolation;
+    if (inverse)
+        flags |= cv::WARP_INVERSE_MAP;  // tell the CPU reference that M is already the inverse mapping
+
+    cv::Mat dst;
+    cv::remap(src, dst, cv::Mat(xmap), cv::Mat(ymap), interpolation, borderMode);  // maps are converted from GpuMat to cv::Mat and applied on the CPU
+
+    cv::Mat dst_gold;
+    cv::warpAffine(src, dst_gold, M, size, flags, borderMode);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);  // zero tolerance: results must match exactly
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpAffineMaps, testing::Combine(  // cross product of the three parameter axes below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DIRECT_INVERSE));
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace  // file-local reference ("gold") implementation of warpAffine used by the WarpAffine tests
+{
+    template <typename T, template <typename> class Interpolator> void warpAffineImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int cn = src.channels();
+
+        dst.create(dsize, src.type());
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                float xcoo = static_cast<float>(M.at<double>(0, 0) * x + M.at<double>(0, 1) * y + M.at<double>(0, 2));  // M maps the output pixel (x, y) to a source x; M is read as double
+                float ycoo = static_cast<float>(M.at<double>(1, 0) * x + M.at<double>(1, 1) * y + M.at<double>(1, 2));  // and to a source y
+
+                for (int c = 0; c < cn; ++c)  // channels are interleaved: element index is x * cn + c
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ycoo, xcoo, c, borderType, borderVal);
+            }
+        }
+    }
+
+    void warpAffineGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+
+        static const func_t nearest_funcs[] =  // one entry per element type; indexed by src.depth() below
+        {
+            warpAffineImpl<unsigned char, NearestInterpolator>,
+            warpAffineImpl<signed char, NearestInterpolator>,
+            warpAffineImpl<unsigned short, NearestInterpolator>,
+            warpAffineImpl<short, NearestInterpolator>,
+            warpAffineImpl<int, NearestInterpolator>,
+            warpAffineImpl<float, NearestInterpolator>
+        };
+
+        static const func_t linear_funcs[] =  // same element-type order as nearest_funcs
+        {
+            warpAffineImpl<unsigned char, LinearInterpolator>,
+            warpAffineImpl<signed char, LinearInterpolator>,
+            warpAffineImpl<unsigned short, LinearInterpolator>,
+            warpAffineImpl<short, LinearInterpolator>,
+            warpAffineImpl<int, LinearInterpolator>,
+            warpAffineImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =  // same element-type order as nearest_funcs
+        {
+            warpAffineImpl<unsigned char, CubicInterpolator>,
+            warpAffineImpl<signed char, CubicInterpolator>,
+            warpAffineImpl<unsigned short, CubicInterpolator>,
+            warpAffineImpl<short, CubicInterpolator>,
+            warpAffineImpl<int, CubicInterpolator>,
+            warpAffineImpl<float, CubicInterpolator>
+        };
+
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};  // indexed by the interpolation value: 0, 1, 2; indices are not range-checked
+
+        if (inverse)  // M already maps output to source, which is what warpAffineImpl expects
+            funcs[interpolation][src.depth()](src, M, dsize, dst, borderType, borderVal);
+        else
+        {
+            cv::Mat iM;
+            cv::invertAffineTransform(M, iM);  // forward transform given: invert it before sampling
+            funcs[interpolation][src.depth()](src, iM, dsize, dst, borderType, borderVal);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+PARAM_TEST_CASE(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, BorderType, UseRoi)  // fixture parameters: device, size, mat type, inverse flag, interpolation, border mode, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    bool inverse;
+    int interpolation;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        inverse = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+GPU_TEST_P(WarpAffine, Accuracy)  // checks cv::gpu::warpAffine against the file-local warpAffineGold reference
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
+    int flags = interpolation;
+    if (inverse)
+        flags |= cv::WARP_INVERSE_MAP;  // the GPU call takes the inverse flag packed into flags
+    cv::Scalar val = randomScalar(0.0, 255.0);  // border value passed to both implementations
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);  // output has the same size as the source
+    cv::gpu::warpAffine(loadMat(src, useRoi), dst, M, size, flags, borderType, val);
+
+    cv::Mat dst_gold;
+    warpAffineGold(src, M, inverse, size, dst_gold, interpolation, borderType, val);  // the reference takes inverse and interpolation separately
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);  // tolerance: 1e-1 for 32F data, 1.0 for every other depth
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(  // cross product of the seven parameter axes below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),  // 8U, 16U and 32F, each with 1, 3 and 4 channels
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),  // BORDER_CONSTANT is not in this list
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation)  // fixture parameters: device, mat type, inverse flag, interpolation (no size or ROI axis)
+{
+    cv::gpu::DeviceInfo devInfo;
+    int type;
+    bool inverse;
+    int interpolation;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());  // make the parameterized device current for the test body
+    }
+};
+
+GPU_TEST_P(WarpAffineNPP, Accuracy)  // checks cv::gpu::warpAffine on a real image with default border arguments; presumably the NPP-backed path (see section header)
+{
+    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());  // abort the test if the image could not be loaded
+
+    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
+    int flags = interpolation;
+    if (inverse)
+        flags |= cv::WARP_INVERSE_MAP;
+
+    cv::gpu::GpuMat dst;  // left unallocated before the call
+    cv::gpu::warpAffine(loadMat(src), dst, M, src.size(), flags);  // border mode and value are left at the function's defaults
+
+    cv::Mat dst_gold;
+    warpAffineGold(src, M, inverse, src.size(), dst_gold, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));  // reference uses a constant zero border, presumably matching the GPU defaults
+
+    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);  // similarity check with threshold 2e-2 instead of the per-element EXPECT_MAT_NEAR
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(  // cross product of the four parameter axes below
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),  // 8U and 32F only, each with 1, 3 and 4 channels
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+
+#endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/test/test_warp_perspective.cpp b/modules/gpuimgproc/test/test_warp_perspective.cpp
new file mode 100644
index 0000000000..d225e58b66
--- /dev/null
+++ b/modules/gpuimgproc/test/test_warp_perspective.cpp
@@ -0,0 +1,283 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+using namespace cvtest;
+
+namespace
+{
+    // Rotation by `angle` radians combined with an x shift of half the source
+    // width, returned as a 3x3 CV_64FC1 matrix whose last row is (0, 0, 1).
+    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    {
+        const double c = std::cos(angle), s = std::sin(angle);
+        const double shiftX = srcSize.width / 2; // integer halving first, then widened to double
+
+        cv::Mat M = (cv::Mat_<double>(3, 3) << c, -s, shiftX, s, c, 0.0, 0.0, 0.0, 1.0);
+        return M;
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test buildWarpPerspectiveMaps
+
+PARAM_TEST_CASE(BuildWarpPerspectiveMaps, cv::gpu::DeviceInfo, cv::Size, Inverse)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    bool inverse;
+
+    virtual void SetUp()
+    {
+        // Hand the device ID to cv::gpu::setDevice, then unpack the remaining parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+        size = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+    }
+};
+
+GPU_TEST_P(BuildWarpPerspectiveMaps, Accuracy)
+{
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 4);
+
+    // The maps are produced as GpuMat and converted to cv::Mat for the host remap below.
+    cv::gpu::GpuMat mapX, mapY;
+    cv::gpu::buildWarpPerspectiveMaps(transform, inverse, size, mapX, mapY);
+
+    cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);
+    const int interp = cv::INTER_NEAREST;
+    const int border = cv::BORDER_CONSTANT;
+    const int warpFlags = inverse ? (interp | cv::WARP_INVERSE_MAP) : interp;
+
+    // Remapping through the generated maps must match cv::warpPerspective exactly.
+    cv::Mat dst;
+    cv::remap(src, dst, cv::Mat(mapX), cv::Mat(mapY), interp, border);
+
+    cv::Mat dst_gold;
+    cv::warpPerspective(src, dst_gold, transform, size, warpFlags, border);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpPerspectiveMaps, testing::Combine(
+    ALL_DEVICES, // -> devInfo, GET_PARAM(0)
+    DIFFERENT_SIZES, // -> size, GET_PARAM(1)
+    DIRECT_INVERSE)); // -> inverse, GET_PARAM(2)
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    template <typename T, template <typename> class Interpolator> void warpPerspectiveImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int channels = src.channels();
+        dst.create(dsize, src.type());
+        // M is applied as given: destination pixel (col, row) -> homogeneous source coordinates.
+        const double m00 = M.at<double>(0, 0), m01 = M.at<double>(0, 1), m02 = M.at<double>(0, 2);
+        const double m10 = M.at<double>(1, 0), m11 = M.at<double>(1, 1), m12 = M.at<double>(1, 2);
+        const double m20 = M.at<double>(2, 0), m21 = M.at<double>(2, 1), m22 = M.at<double>(2, 2);
+        for (int row = 0; row < dsize.height; ++row)
+        {
+            for (int col = 0; col < dsize.width; ++col)
+            {
+                const float w = static_cast<float>(m20 * col + m21 * row + m22);
+                const float srcX = static_cast<float>((m00 * col + m01 * row + m02) / w);
+                const float srcY = static_cast<float>((m10 * col + m11 * row + m12) / w);
+                for (int ch = 0; ch < channels; ++ch)
+                    dst.at<T>(row, col * channels + ch) = Interpolator<T>::getValue(src, srcY, srcX, ch, borderType, borderVal);
+            }
+        }
+    }
+
+    void warpPerspectiveGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        // Host reference for warpPerspective: dispatches on interpolation and src.depth(). When
+        // `inverse` is false, M is inverted first so the implementation always gets the matrix it applies directly.
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+        // Each table below is indexed by src.depth(), from CV_8U up to CV_32F (six entries).
+        static const func_t nearest_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, NearestInterpolator>,
+            warpPerspectiveImpl<signed char, NearestInterpolator>,
+            warpPerspectiveImpl<unsigned short, NearestInterpolator>,
+            warpPerspectiveImpl<short, NearestInterpolator>,
+            warpPerspectiveImpl<int, NearestInterpolator>,
+            warpPerspectiveImpl<float, NearestInterpolator>
+        };
+        static const func_t linear_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, LinearInterpolator>,
+            warpPerspectiveImpl<signed char, LinearInterpolator>,
+            warpPerspectiveImpl<unsigned short, LinearInterpolator>,
+            warpPerspectiveImpl<short, LinearInterpolator>,
+            warpPerspectiveImpl<int, LinearInterpolator>,
+            warpPerspectiveImpl<float, LinearInterpolator>
+        };
+        static const func_t cubic_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, CubicInterpolator>,
+            warpPerspectiveImpl<signed char, CubicInterpolator>,
+            warpPerspectiveImpl<unsigned short, CubicInterpolator>,
+            warpPerspectiveImpl<short, CubicInterpolator>,
+            warpPerspectiveImpl<int, CubicInterpolator>,
+            warpPerspectiveImpl<float, CubicInterpolator>
+        };
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs}; // indexed by interpolation
+
+        // Reject interpolation/depth values that would index past the tables above.
+        CV_Assert(interpolation >= cv::INTER_NEAREST && interpolation <= cv::INTER_CUBIC && src.depth() <= CV_32F);
+
+        cv::Mat dstToSrc;
+        if (inverse)
+            dstToSrc = M;
+        else
+            cv::invert(M, dstToSrc);
+        funcs[interpolation][src.depth()](src, dstToSrc, dsize, dst, borderType, borderVal);
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+PARAM_TEST_CASE(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    bool inverse;
+    int interpolation;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        // Hand the device ID to cv::gpu::setDevice, then unpack the remaining parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        inverse = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+    }
+};
+
+GPU_TEST_P(WarpPerspective, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 3);
+    const int warpFlags = inverse ? (interpolation | cv::WARP_INVERSE_MAP) : interpolation;
+    const cv::Scalar borderValue = randomScalar(0.0, 255.0);
+
+    // Source and destination both honour useRoi; the same border value goes to GPU and reference.
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::warpPerspective(loadMat(src, useRoi), dst, transform, size, warpFlags, borderType, borderValue);
+
+    cv::Mat dst_gold;
+    warpPerspectiveGold(src, transform, inverse, size, dst_gold, interpolation, borderType, borderValue);
+
+    // CV_32F input is compared with a tighter tolerance than the other depths.
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
+    ALL_DEVICES, // -> devInfo, GET_PARAM(0)
+    DIFFERENT_SIZES, // -> size, GET_PARAM(1)
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), // -> type, GET_PARAM(2)
+    DIRECT_INVERSE, // -> inverse, GET_PARAM(3)
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)), // -> interpolation, GET_PARAM(4)
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)), // -> borderType, GET_PARAM(5)
+    WHOLE_SUBMAT)); // -> useRoi, GET_PARAM(6)
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+PARAM_TEST_CASE(WarpPerspectiveNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation)
+{
+    cv::gpu::DeviceInfo devInfo;
+    int type;
+    bool inverse;
+    int interpolation;
+
+    virtual void SetUp()
+    {
+        // Hand the device ID to cv::gpu::setDevice, then unpack the remaining parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+        type = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+    }
+};
+
+GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
+{
+    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());
+
+    const cv::Mat transform = createTransfomMatrix(src.size(), CV_PI / 4);
+    const int warpFlags = inverse ? (interpolation | cv::WARP_INVERSE_MAP) : interpolation;
+
+    // GPU result; no border arguments are passed to this call.
+    cv::gpu::GpuMat dst;
+    cv::gpu::warpPerspective(loadMat(src), dst, transform, src.size(), warpFlags);
+
+    // Host reference, computed with a constant zero border.
+    cv::Mat dst_gold;
+    warpPerspectiveGold(src, transform, inverse, src.size(), dst_gold, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));
+
+    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
+    ALL_DEVICES, // -> devInfo, GET_PARAM(0)
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), // -> type, GET_PARAM(1)
+    DIRECT_INVERSE, // -> inverse, GET_PARAM(2)
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)))); // -> interpolation, GET_PARAM(3)
+
+#endif // HAVE_CUDA
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 4678532afb..bb444f0700 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -19,6 +19,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   if(HAVE_opencv_gpu)
     ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpuarithm/include")
     ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpufilters/include")
+    ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpuimgproc/include")
     ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
   endif()
 
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 3b05553666..57fdeb0930 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -2,7 +2,7 @@ SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc ope
                                      opencv_ml opencv_video opencv_objdetect opencv_features2d
                                      opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
                                      opencv_nonfree opencv_softcascade opencv_superres
-                                     opencv_gpucodec opencv_gpuarithm opencv_gpufilters opencv_gpunvidia)
+                                     opencv_gpucodec opencv_gpuarithm opencv_gpufilters opencv_gpunvidia opencv_gpuimgproc)
 
 ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})