From: Marina Kolpakova
Date: Thu, 28 Jun 2012 15:41:27 +0000 (+0000)
Subject: merged the trunk r8669:8702
X-Git-Tag: accepted/2.0/20130307.220821~465
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6cca6a45482ae0650d13ee1550724b32c9a76b7c;p=profile%2Fivi%2Fopencv.git

merged the trunk r8669:8702
---

diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index 6ccd5b0..63d6455 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -114,6 +114,7 @@ add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0 #required
 if(tbb_need_GENERIC_DWORD_LOAD_STORE)
   #needed by TBB 4.0 update 1,2; fixed in TBB 4.0 update 3 but it has 2 new problems
   add_definitions(-D__TBB_USE_GENERIC_DWORD_LOAD_STORE=1)
+  set(tbb_need_GENERIC_DWORD_LOAD_STORE ON PARENT_SCOPE)
 endif()

 add_library(tbb STATIC ${lib_srcs} ${lib_hdrs} "${CMAKE_CURRENT_SOURCE_DIR}/android_additional.h" "${CMAKE_CURRENT_SOURCE_DIR}/${tbb_version_file}")
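The new PARENT_SCOPE line is what makes this hunk work: set() inside a directory added via add_subdirectory() only affects the child scope, so without PARENT_SCOPE the tbb_need_GENERIC_DWORD_LOAD_STORE flag would never reach the check that OpenCVDetectTBB.cmake now performs (see its hunk below). A minimal standalone sketch of the pattern, with hypothetical names, not part of the patch:

    # child/CMakeLists.txt
    set(child_flag ON PARENT_SCOPE)  # visible in the parent after add_subdirectory() returns

    # CMakeLists.txt (parent)
    add_subdirectory(child)
    if(child_flag)
      message(STATUS "flag propagated up from child/")
    endif()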
diff --git a/cmake/OpenCVConfig.cmake b/cmake/OpenCVConfig.cmake
new file mode 100644
index 0000000..cb055cf
--- /dev/null
+++ b/cmake/OpenCVConfig.cmake
@@ -0,0 +1,155 @@
+# ===================================================================================
+#  The OpenCV CMake configuration file
+#
+#             ** File generated automatically, do not modify **
+#
+#  Usage from an external project:
+#    In your CMakeLists.txt, add these lines:
+#
+#    FIND_PACKAGE(OpenCV REQUIRED)
+#    TARGET_LINK_LIBRARIES(MY_TARGET_NAME ${OpenCV_LIBS})
+#
+#    Or you can search for specific OpenCV modules:
+#
+#    FIND_PACKAGE(OpenCV REQUIRED core highgui)
+#
+#    If the module is found then OPENCV_<MODULE>_FOUND is set to TRUE.
+#
+#    This file will define the following variables:
+#      - OpenCV_LIBS                     : The list of libraries to link against.
+#      - OpenCV_LIB_DIR                  : The directories where lib files are. Calling LINK_DIRECTORIES
+#                                          with this path is NOT needed.
+#      - OpenCV_INCLUDE_DIRS             : The OpenCV include directories.
+#      - OpenCV_COMPUTE_CAPABILITIES     : The version of compute capability.
+#      - OpenCV_ANDROID_NATIVE_API_LEVEL : Minimum required level of Android API.
+#      - OpenCV_VERSION                  : The version of this OpenCV build. Example: "2.4.0"
+#      - OpenCV_VERSION_MAJOR            : Major version part of OpenCV_VERSION. Example: "2"
+#      - OpenCV_VERSION_MINOR            : Minor version part of OpenCV_VERSION. Example: "4"
+#      - OpenCV_VERSION_PATCH            : Patch version part of OpenCV_VERSION. Example: "0"
+#
+#    Advanced variables:
+#      - OpenCV_SHARED
+#      - OpenCV_CONFIG_PATH
+#      - OpenCV_INSTALL_PATH (not set on Windows)
+#      - OpenCV_LIB_COMPONENTS
+#      - OpenCV_USE_MANGLED_PATHS
+#      - OpenCV_HAVE_ANDROID_CAMERA
+#
+# ===================================================================================
+#
+#    Windows pack specific options:
+#      - OpenCV_STATIC
+#      - OpenCV_CUDA
+
+if(NOT DEFINED OpenCV_STATIC)
+  # look for global setting
+  if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
+    set(OpenCV_STATIC OFF)
+  else()
+    set(OpenCV_STATIC ON)
+  endif()
+endif()
+
+if(NOT DEFINED OpenCV_CUDA)
+  # if the user's app uses CUDA, then it probably wants CUDA-enabled OpenCV binaries
+  if(CUDA_FOUND)
+    set(OpenCV_CUDA ON)
+  endif()
+endif()
+
+if(MSVC)
+  if(CMAKE_CL_64)
+    set(OpenCV_ARCH x64)
+    set(OpenCV_TBB_ARCH intel64)
+  else()
+    set(OpenCV_ARCH x86)
+    set(OpenCV_TBB_ARCH ia32)
+  endif()
+  if(MSVC_VERSION EQUAL 1400)
+    set(OpenCV_RUNTIME vc8)
+  elseif(MSVC_VERSION EQUAL 1500)
+    set(OpenCV_RUNTIME vc9)
+  elseif(MSVC_VERSION EQUAL 1600)
+    set(OpenCV_RUNTIME vc10)
+  elseif(MSVC_VERSION EQUAL 1700)
+    set(OpenCV_RUNTIME vc11)
+  endif()
+elseif(MINGW)
+  set(OpenCV_RUNTIME mingw)
+
+  execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
+                  OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(OPENCV_GCC_TARGET_MACHINE MATCHES "64")
+    set(MINGW64 1)
+    set(OpenCV_ARCH x64)
+  else()
+    set(OpenCV_ARCH x86)
+  endif()
+endif()
+
+if(CMAKE_VERSION VERSION_GREATER 2.6.2)
+  unset(OpenCV_CONFIG_PATH CACHE)
+endif()
+
+get_filename_component(OpenCV_CONFIG_PATH "${CMAKE_CURRENT_LIST_FILE}" PATH CACHE)
+if(OpenCV_RUNTIME AND OpenCV_ARCH)
+  if(OpenCV_STATIC AND EXISTS "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib/OpenCVConfig.cmake")
+    if(OpenCV_CUDA AND EXISTS "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib/OpenCVConfig.cmake")
+      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
+    else()
+      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/staticlib")
+    endif()
+  elseif(EXISTS "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib/OpenCVConfig.cmake")
+    if(OpenCV_CUDA AND EXISTS "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib/OpenCVConfig.cmake")
+      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
+    else()
+      set(OpenCV_LIB_PATH "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/lib")
+    endif()
+  endif()
+endif()
+
+if(OpenCV_LIB_PATH AND EXISTS "${OpenCV_LIB_PATH}/OpenCVConfig.cmake")
+  set(OpenCV_LIB_DIR_OPT "${OpenCV_LIB_PATH}" CACHE PATH "Path where release OpenCV libraries are located" FORCE)
+  set(OpenCV_LIB_DIR_DBG "${OpenCV_LIB_PATH}" CACHE PATH "Path where debug OpenCV libraries are located" FORCE)
+  set(OpenCV_3RDPARTY_LIB_DIR_OPT "${OpenCV_LIB_PATH}" CACHE PATH "Path where release 3rdparty OpenCV dependencies are located" FORCE)
+  set(OpenCV_3RDPARTY_LIB_DIR_DBG "${OpenCV_LIB_PATH}" CACHE PATH "Path where debug 3rdparty OpenCV dependencies are located" FORCE)
+
+  include("${OpenCV_LIB_PATH}/OpenCVConfig.cmake")
+
+  if(OpenCV_CUDA)
+    set(_OpenCV_LIBS "")
+    foreach(_lib ${OpenCV_LIBS})
+      string(REPLACE "${OpenCV_CONFIG_PATH}/gpu/${OpenCV_ARCH}/${OpenCV_RUNTIME}" "${OpenCV_CONFIG_PATH}/${OpenCV_ARCH}/${OpenCV_RUNTIME}" _lib2 "${_lib}")
+      if(NOT EXISTS "${_lib}" AND EXISTS "${_lib2}")
+        list(APPEND _OpenCV_LIBS "${_lib2}")
+      else()
+        list(APPEND _OpenCV_LIBS "${_lib}")
+      endif()
+    endforeach()
+    set(OpenCV_LIBS ${_OpenCV_LIBS})
+  endif()
+  set(OpenCV_FOUND TRUE CACHE BOOL "" FORCE)
+  set(OPENCV_FOUND TRUE CACHE BOOL "" FORCE)
+
+  if(NOT OpenCV_FIND_QUIETLY)
+    message(STATUS "Found OpenCV ${OpenCV_VERSION} in ${OpenCV_LIB_PATH}")
+    if(NOT OpenCV_LIB_PATH MATCHES "/staticlib")
+      get_filename_component(_OpenCV_LIB_PATH "${OpenCV_LIB_PATH}/../bin" ABSOLUTE)
+      file(TO_NATIVE_PATH "${_OpenCV_LIB_PATH}" _OpenCV_LIB_PATH)
+      message(STATUS "You might need to add ${_OpenCV_LIB_PATH} to your PATH to be able to run your applications.")
+      if(OpenCV_LIB_PATH MATCHES "/gpu/")
+        string(REPLACE "\\gpu" "" _OpenCV_LIB_PATH2 "${_OpenCV_LIB_PATH}")
+        message(STATUS "GPU support is enabled so you might also need ${_OpenCV_LIB_PATH2} in your PATH (it must go after the ${_OpenCV_LIB_PATH}).")
+      endif()
+    endif()
+  endif()
+else()
+  if(NOT OpenCV_FIND_QUIETLY)
+    message(WARNING "Found OpenCV 2.4.0 Windows Super Pack but it has no binaries compatible with your configuration.
+    You should manually point the CMake variable OpenCV_DIR to your build of the OpenCV library.")
+  endif()
+  set(OpenCV_FOUND FALSE CACHE BOOL "" FORCE)
+  set(OPENCV_FOUND FALSE CACHE BOOL "" FORCE)
+endif()
+
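The usage block at the top of the generated file translates into a consumer project along these lines (a minimal sketch with hypothetical project and target names; any install that ships this OpenCVConfig.cmake should behave the same):

    cmake_minimum_required(VERSION 2.8)
    project(my_opencv_app)

    # If the config file is not found automatically, set OpenCV_DIR to the
    # directory containing OpenCVConfig.cmake before calling find_package().
    find_package(OpenCV REQUIRED)   # or: find_package(OpenCV REQUIRED core highgui)

    include_directories(${OpenCV_INCLUDE_DIRS})
    add_executable(my_opencv_app main.cpp)
    target_link_libraries(my_opencv_app ${OpenCV_LIBS})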
diff --git a/cmake/OpenCVDetectTBB.cmake b/cmake/OpenCVDetectTBB.cmake
index b15f9f7..c37a8c3 100644
--- a/cmake/OpenCVDetectTBB.cmake
+++ b/cmake/OpenCVDetectTBB.cmake
@@ -2,7 +2,10 @@ if(ANDROID)
     add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/tbb")
     include_directories(SYSTEM ${TBB_INCLUDE_DIRS})
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} tbb)
-    add_definitions(-DTBB_USE_GCC_BUILTINS=1 -D__TBB_GCC_BUILTIN_ATOMICS_PRESENT=1 -D__TBB_USE_GENERIC_DWORD_LOAD_STORE=1)
+    add_definitions(-DTBB_USE_GCC_BUILTINS=1 -D__TBB_GCC_BUILTIN_ATOMICS_PRESENT=1)
+    if(tbb_need_GENERIC_DWORD_LOAD_STORE)
+      add_definitions(-D__TBB_USE_GENERIC_DWORD_LOAD_STORE=1)
+    endif()
     set(HAVE_TBB 1)
 elseif(UNIX AND NOT APPLE)
     PKG_CHECK_MODULES(TBB tbb)

diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 317ea4c..573d7eb 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -72,17 +72,13 @@ macro(ocv_add_dependencies full_modname)
       list(APPEND ${__depsvar} "${d}")
     endif()
   endforeach()
+  unset(__depsvar)
+
+  ocv_list_unique(OPENCV_MODULE_${full_modname}_REQ_DEPS)
+  ocv_list_unique(OPENCV_MODULE_${full_modname}_OPT_DEPS)

-  if(OPENCV_MODULE_${full_modname}_REQ_DEPS)
-    list(REMOVE_DUPLICATES OPENCV_MODULE_${full_modname}_REQ_DEPS)
-  endif()
-  if(OPENCV_MODULE_${full_modname}_OPT_DEPS)
-    list(REMOVE_DUPLICATES OPENCV_MODULE_${full_modname}_OPT_DEPS)
-  endif()
   set(OPENCV_MODULE_${full_modname}_REQ_DEPS ${OPENCV_MODULE_${full_modname}_REQ_DEPS} CACHE INTERNAL "Required dependencies of ${full_modname} module")
   set(OPENCV_MODULE_${full_modname}_OPT_DEPS ${OPENCV_MODULE_${full_modname}_OPT_DEPS} CACHE INTERNAL "Optional dependencies of ${full_modname} module")
-
-  unset(__depsvar)
 endmacro()
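ocv_add_dependencies() is normally reached through the module declaration macros rather than called directly. A sketch of a typical module CMakeLists.txt (hypothetical module name; ocv_define_module and the REQUIRED/OPTIONAL keywords come from parts of OpenCVModule.cmake that are not shown in this diff):

    set(the_description "The example OpenCV module")
    ocv_define_module(example opencv_core OPTIONAL opencv_highgui)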
@@ -105,19 +101,22 @@ macro(ocv_add_module _name)
     ")
   endif()

-  #remember module details
   if(NOT DEFINED the_description)
     set(the_description "The ${name} OpenCV module")
   endif()
-  set(OPENCV_MODULE_${the_module}_DESCRIPTION "${the_description}" CACHE INTERNAL "Brief description of ${the_module} module")
-  set(OPENCV_MODULE_${the_module}_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "Location of ${the_module} module sources")

-  #create option to enable/disable this module
   if(NOT DEFINED BUILD_${the_module}_INIT)
     set(BUILD_${the_module}_INIT ON)
   endif()
+
+  # create option to enable/disable this module
+  option(BUILD_${the_module} "Include ${the_module} module into the OpenCV build" ${BUILD_${the_module}_INIT})
+
+  # remember the module details
+  set(OPENCV_MODULE_${the_module}_DESCRIPTION "${the_description}" CACHE INTERNAL "Brief description of ${the_module} module")
+  set(OPENCV_MODULE_${the_module}_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "Location of ${the_module} module sources")
+
+  # parse list of dependencies
   if("${ARGV1}" STREQUAL "INTERNAL" OR "${ARGV1}" STREQUAL "BINDINGS")
     set(OPENCV_MODULE_${the_module}_CLASS "${ARGV1}" CACHE INTERNAL "The category of the module")
     set(__ocv_argn__ ${ARGN})
@@ -143,28 +142,19 @@ macro(ocv_add_module _name)
     set(OPENCV_MODULES_DISABLED_USER ${OPENCV_MODULES_DISABLED_USER} "${the_module}" CACHE INTERNAL "List of OpenCV modules explicitly disabled by user")
   endif()

-  #TODO: add submodules if any
+  # TODO: add submodules if any

-  #stop processing of current file
+  # stop processing of current file
   return()
 else(OPENCV_INITIAL_PASS)
   if(NOT BUILD_${the_module})
-    #extra protection from redefinition
-    return()
+    return() # extra protection from redefinition
   endif()
   project(${the_module})
 endif(OPENCV_INITIAL_PASS)
 endmacro()

-# Internal macro; disables OpenCV module
-# ocv_module_turn_off()
-macro(__ocv_module_turn_off the_module)
-  list(APPEND OPENCV_MODULES_DISABLED_AUTO "${the_module}")
-  list(REMOVE_ITEM OPENCV_MODULES_BUILD "${the_module}")
-  list(REMOVE_ITEM OPENCV_MODULES_PUBLIC "${the_module}")
-  set(HAVE_${the_module} OFF CACHE INTERNAL "Module ${the_module} can not be built in current configuration")
-endmacro()
-
+# excludes module from current configuration
 macro(ocv_module_disable module)
   set(__modname ${module})
   if(NOT __modname MATCHES "^opencv_")
@@ -175,41 +165,46 @@ macro(ocv_module_disable module)
   set(OPENCV_MODULE_${__modname}_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "Location of ${__modname} module sources")
   set(OPENCV_MODULES_DISABLED_FORCE "${OPENCV_MODULES_DISABLED_FORCE}" CACHE INTERNAL "List of OpenCV modules which can not be built in current configuration")
   unset(__modname)
-  return()#leave the current folder
+  return() # leave the current folder
 endmacro()

+# Internal macro; partly disables OpenCV module
+macro(__ocv_module_turn_off the_module)
+  list(APPEND OPENCV_MODULES_DISABLED_AUTO "${the_module}")
+  list(REMOVE_ITEM OPENCV_MODULES_BUILD "${the_module}")
+  list(REMOVE_ITEM OPENCV_MODULES_PUBLIC "${the_module}")
+  set(HAVE_${the_module} OFF CACHE INTERNAL "Module ${the_module} can not be built in current configuration")
+endmacro()
+
+# Internal macro for dependencies tracking
 macro(__ocv_flatten_module_required_dependencies the_module)
   set(__flattened_deps "")
   set(__resolved_deps "")
   set(__req_depends ${OPENCV_MODULE_${the_module}_REQ_DEPS})

   while(__req_depends)
-    list(GET __req_depends 0 __dep)
-    list(REMOVE_AT __req_depends 0)
+    ocv_list_pop_front(__req_depends __dep)
     if(__dep STREQUAL the_module)
-      #TODO: think how to deal with cyclic dependency
-      __ocv_module_turn_off(${the_module})
+      __ocv_module_turn_off(${the_module}) # TODO: think how to deal with cyclic dependency
       break()
-    elseif("${OPENCV_MODULES_DISABLED_USER};${OPENCV_MODULES_DISABLED_AUTO}" MATCHES "(^|;)${__dep}(;|$)")
-      #depends on disabled module
-      __ocv_module_turn_off(${the_module})
+    elseif(";${OPENCV_MODULES_DISABLED_USER};${OPENCV_MODULES_DISABLED_AUTO};" MATCHES ";${__dep};")
+      __ocv_module_turn_off(${the_module}) # depends on disabled module
       break()
-    elseif("${OPENCV_MODULES_BUILD}" MATCHES "(^|;)${__dep}(;|$)")
-      if(__resolved_deps MATCHES "(^|;)${__dep}(;|$)")
-        #all dependencies of this module are already resolved
-        list(APPEND __flattened_deps "${__dep}")
+    elseif(";${OPENCV_MODULES_BUILD};" MATCHES ";${__dep};")
+      if(";${__resolved_deps};" MATCHES ";${__dep};")
+        list(APPEND __flattened_deps "${__dep}") # all dependencies of this module are already resolved
       else()
-        #put all required subdependencies before this dependency and mark it as resolved
+        # put all required subdependencies before this dependency and mark it as resolved
         list(APPEND __resolved_deps "${__dep}")
         list(INSERT __req_depends 0 ${OPENCV_MODULE_${__dep}_REQ_DEPS} ${__dep})
       endif()
     elseif(__dep MATCHES "^opencv_")
-      #depends on missing module
-      __ocv_module_turn_off(${the_module})
+      __ocv_module_turn_off(${the_module}) # depends on missing module
+      message(WARNING "Unknown \"${__dep}\" module is listed in the dependencies of \"${the_module}\" module")
       break()
     else()
-      #skip non-modules
+      # skip non-modules
     endif()
   endwhile()

@@ -220,37 +215,33 @@ macro(__ocv_flatten_module_required_dependencies the_module)
     set(OPENCV_MODULE_${the_module}_DEPS "")
   endif()

-  unset(__resolved_deps)
-  unset(__flattened_deps)
-  unset(__req_depends)
-  unset(__dep)
+  ocv_clear_vars(__resolved_deps __flattened_deps __req_depends __dep)
 endmacro()

+# Internal macro for dependencies tracking
 macro(__ocv_flatten_module_optional_dependencies the_module)
-  set(__flattened_deps ${OPENCV_MODULE_${the_module}_DEPS})
-  set(__resolved_deps ${OPENCV_MODULE_${the_module}_DEPS})
-  set(__opt_depends ${OPENCV_MODULE_${the_module}_OPT_DEPS})
+  set(__flattened_deps "")
+  set(__resolved_deps "")
+  set(__opt_depends ${OPENCV_MODULE_${the_module}_REQ_DEPS} ${OPENCV_MODULE_${the_module}_OPT_DEPS})

   while(__opt_depends)
-    list(GET __opt_depends 0 __dep)
-    list(REMOVE_AT __opt_depends 0)
+    ocv_list_pop_front(__opt_depends __dep)
     if(__dep STREQUAL the_module)
-      #TODO: think how to deal with cyclic dependency
-      __ocv_module_turn_off(${the_module})
+      __ocv_module_turn_off(${the_module}) # TODO: think how to deal with cyclic dependency
       break()
-    elseif("${OPENCV_MODULES_BUILD}" MATCHES "(^|;)${__dep}(;|$)")
-      if(__resolved_deps MATCHES "(^|;)${__dep}(;|$)")
-        #all dependencies of this module are already resolved
-        list(APPEND __flattened_deps "${__dep}")
+    elseif(";${OPENCV_MODULES_BUILD};" MATCHES ";${__dep};")
+      if(";${__resolved_deps};" MATCHES ";${__dep};")
+        list(APPEND __flattened_deps "${__dep}") # all dependencies of this module are already resolved
       else()
-        #put all subdependencies before this dependency and mark it as resolved
+        # put all subdependencies before this dependency and mark it as resolved
         list(APPEND __resolved_deps "${__dep}")
         list(INSERT __opt_depends 0 ${OPENCV_MODULE_${__dep}_REQ_DEPS} ${OPENCV_MODULE_${__dep}_OPT_DEPS} ${__dep})
       endif()
     else()
-      #skip non-modules or missing modules
+      # skip non-modules or missing modules
     endif()
   endwhile()

+
   if(__flattened_deps)
     list(REMOVE_DUPLICATES __flattened_deps)
     set(OPENCV_MODULE_${the_module}_DEPS ${__flattened_deps})
@@ -258,10 +249,7 @@ macro(__ocv_flatten_module_optional_dependencies the_module)
     set(OPENCV_MODULE_${the_module}_DEPS "")
   endif()

-  unset(__resolved_deps)
-  unset(__flattened_deps)
-  unset(__opt_depends)
-  unset(__dep)
+  ocv_clear_vars(__resolved_deps __flattened_deps __opt_depends __dep)
 endmacro()
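Both flattening macros above are the same iterative depth-first expansion: pop the head of the work list; if it was already resolved once, emit it, otherwise push its own dependencies in front of it and revisit. A self-contained toy version of the loop (runnable with `cmake -P deps_demo.cmake`; the dependency data is made up, and plain `set()` is used in place of `list(INSERT ... 0 ...)` for the same effect):

    # deps_demo.cmake -- toy version of the dependency flattening loop
    macro(pop_front LST VAR) # same contract as ocv_list_pop_front
      list(GET ${LST} 0 ${VAR})
      list(REMOVE_AT ${LST} 0)
    endmacro()

    set(DEP_b "a")    # module b depends on a
    set(DEP_c "b;a")  # module c depends on b and a
    set(work "c")     # flatten the dependencies of c
    set(resolved "")
    set(flattened "")

    while(work)
      pop_front(work d)
      if(";${resolved};" MATCHES ";${d};")
        list(APPEND flattened "${d}") # d's own deps were already expanded
      else()
        list(APPEND resolved "${d}")
        set(work ${DEP_${d}} ${d} ${work}) # subdependencies first, then d again
      endif()
    endwhile()

    list(REMOVE_DUPLICATES flattened)
    message(STATUS "flattened: ${flattened}") # prints: flattened: a;b;c

This ordering guarantee (every dependency precedes its dependent) is what the build-order computation in __ocv_flatten_module_dependencies() below relies on.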
@@ -269,16 +257,16 @@ macro(__ocv_flatten_module_dependencies)
     set(HAVE_${m} OFF CACHE INTERNAL "Module ${m} will not be built in current configuration")
   endforeach()

   foreach(m ${OPENCV_MODULES_BUILD})
-    set(HAVE_${m} ON CACHE INTERNAL "Module ${m} will not be built in current configuration")
+    set(HAVE_${m} ON CACHE INTERNAL "Module ${m} will be built in current configuration")

     __ocv_flatten_module_required_dependencies(${m})
   endforeach()

   foreach(m ${OPENCV_MODULES_BUILD})
     __ocv_flatten_module_optional_dependencies(${m})

-    #dependencies from other modules
+    # save dependencies from other modules
     set(OPENCV_MODULE_${m}_DEPS ${OPENCV_MODULE_${m}_DEPS} CACHE INTERNAL "Flattened dependencies of ${m} module")
-    #extra dependencies
+    # save extra dependencies
     set(OPENCV_MODULE_${m}_DEPS_EXT ${OPENCV_MODULE_${m}_REQ_DEPS} ${OPENCV_MODULE_${m}_OPT_DEPS})
     if(OPENCV_MODULE_${m}_DEPS_EXT AND OPENCV_MODULE_${m}_DEPS)
       list(REMOVE_ITEM OPENCV_MODULE_${m}_DEPS_EXT ${OPENCV_MODULE_${m}_DEPS})
@@ -287,8 +275,15 @@ macro(__ocv_flatten_module_dependencies)
     set(OPENCV_MODULE_${m}_DEPS_EXT ${OPENCV_MODULE_${m}_DEPS_EXT} CACHE INTERNAL "Extra dependencies of ${m} module")
   endforeach()

+  # order modules by dependencies
+  set(OPENCV_MODULES_BUILD_ "")
+  foreach(m ${OPENCV_MODULES_BUILD})
+    list(APPEND OPENCV_MODULES_BUILD_ ${OPENCV_MODULE_${m}_DEPS} ${m})
+  endforeach()
+  ocv_list_unique(OPENCV_MODULES_BUILD_)
+
   set(OPENCV_MODULES_PUBLIC ${OPENCV_MODULES_PUBLIC} CACHE INTERNAL "List of OpenCV modules marked for export")
-  set(OPENCV_MODULES_BUILD ${OPENCV_MODULES_BUILD} CACHE INTERNAL "List of OpenCV modules included into the build")
+  set(OPENCV_MODULES_BUILD ${OPENCV_MODULES_BUILD_} CACHE INTERNAL "List of OpenCV modules included into the build")
   set(OPENCV_MODULES_DISABLED_AUTO ${OPENCV_MODULES_DISABLED_AUTO} CACHE INTERNAL "List of OpenCV modules implicitly disabled due to dependencies")
 endmacro()

@@ -300,10 +295,11 @@ macro(ocv_glob_modules)
   endif()
   set(__directories_observed "")

-  #collect modules
+  # collect modules
   set(OPENCV_INITIAL_PASS ON)
   foreach(__path ${ARGN})
     ocv_get_real_path(__path "${__path}")
+
     list(FIND __directories_observed "${__path}" __pathIdx)
     if(__pathIdx GREATER -1)
       message(FATAL_ERROR "The directory ${__path} is observed for OpenCV modules second time.")
@@ -316,37 +312,36 @@ macro(ocv_glob_modules)
       foreach(mod ${__ocvmodules})
         ocv_get_real_path(__modpath "${__path}/${mod}")
         if(EXISTS "${__modpath}/CMakeLists.txt")
+
           list(FIND __directories_observed "${__modpath}" __pathIdx)
           if(__pathIdx GREATER -1)
             message(FATAL_ERROR "The module from ${__modpath} is already loaded.")
           endif()
           list(APPEND __directories_observed "${__modpath}")

-          add_subdirectory("${__modpath}" "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+          if(OCV_MODULE_RELOCATE_ON_INITIAL_PASS)
+            file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+            file(COPY "${__modpath}/CMakeLists.txt" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+            add_subdirectory("${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}" "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+            if("${OPENCV_MODULE_opencv_${mod}_LOCATION}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+              set(OPENCV_MODULE_opencv_${mod}_LOCATION "${__modpath}" CACHE PATH "" FORCE)
+            endif()
+          else()
+            add_subdirectory("${__modpath}" "${CMAKE_CURRENT_BINARY_DIR}/${mod}/.${mod}")
+          endif()
         endif()
       endforeach()
     endif()
   endforeach()
-  unset(__ocvmodules)
-  unset(__directories_observed)
-  unset(__path)
-  unset(__modpath)
-  unset(__pathIdx)
+  ocv_clear_vars(__ocvmodules __directories_observed __path __modpath __pathIdx)

-  #resolve dependencies
+  # resolve dependencies
   __ocv_flatten_module_dependencies()

-  #order modules by
dependencies - set(OPENCV_MODULES_BUILD_ "") - foreach(m ${OPENCV_MODULES_BUILD}) - list(APPEND OPENCV_MODULES_BUILD_ ${OPENCV_MODULE_${m}_DEPS} ${m}) - endforeach() - ocv_list_unique(OPENCV_MODULES_BUILD_) - - #create modules + # create modules set(OPENCV_INITIAL_PASS OFF PARENT_SCOPE) set(OPENCV_INITIAL_PASS OFF) - foreach(m ${OPENCV_MODULES_BUILD_}) + foreach(m ${OPENCV_MODULES_BUILD}) if(m MATCHES "^opencv_") string(REGEX REPLACE "^opencv_" "" __shortname "${m}") add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${__shortname}") @@ -389,7 +384,7 @@ endmacro() macro(ocv_module_include_directories) ocv_include_directories("${OPENCV_MODULE_${the_module}_LOCATION}/include" "${OPENCV_MODULE_${the_module}_LOCATION}/src" - "${CMAKE_CURRENT_BINARY_DIR}"#for precompiled headers + "${CMAKE_CURRENT_BINARY_DIR}" # for precompiled headers ) ocv_include_modules(${OPENCV_MODULE_${the_module}_DEPS} ${ARGN}) endmacro() @@ -417,7 +412,7 @@ macro(ocv_set_module_sources) endif() # use full paths for module to be independent from the module location - ocv_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS) + ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS) set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}") @@ -446,9 +441,11 @@ endmacro() # ocv_create_module(SKIP_LINK) macro(ocv_create_module) add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES}) + if(NOT "${ARGN}" STREQUAL "SKIP_LINK") target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) endif() + add_dependencies(opencv_modules ${the_module}) if(ENABLE_SOLUTION_FOLDERS) @@ -485,7 +482,7 @@ macro(ocv_create_module) if(CMAKE_CROSSCOMPILING) set_target_properties(${the_module} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk") endif() - set_target_properties(${the_module} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc /DEBUG") + set_target_properties(${the_module} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc /DEBUG") endif() install(TARGETS ${the_module} @@ -495,7 +492,7 @@ macro(ocv_create_module) ) # only "public" headers need to be installed - if(OPENCV_MODULE_${the_module}_HEADERS AND OPENCV_MODULES_PUBLIC MATCHES "(^|;)${the_module}(;|$)") + if(OPENCV_MODULE_${the_module}_HEADERS AND ";${OPENCV_MODULES_PUBLIC};" MATCHES ";${the_module};") foreach(hdr ${OPENCV_MODULE_${the_module}_HEADERS}) string(REGEX REPLACE "^.*opencv2/" "opencv2/" hdr2 "${hdr}") if(hdr2 MATCHES "^(opencv2/.*)/[^/]+.h(..)?$") @@ -510,30 +507,17 @@ endmacro() # Usage: # ocv_add_precompiled_headers(${the_module}) macro(ocv_add_precompiled_headers the_target) - if("${the_target}" MATCHES "^opencv_test_.*$") - SET(pch_path "test/test_") + if("${the_target}" MATCHES "^opencv_test_.*$") + SET(pch_path "test/test_") elseif("${the_target}" MATCHES "opencv_perf_gpu_cpu") - SET(pch_path "perf_cpu/perf_cpu_") - elseif("${the_target}" MATCHES "^opencv_perf_.*$") - SET(pch_path "perf/perf_") - else() - SET(pch_path "src/") - endif() - set(pch_header "${CMAKE_CURRENT_SOURCE_DIR}/${pch_path}precomp.hpp") - - if(PCHSupport_FOUND AND ENABLE_PRECOMPILED_HEADERS AND EXISTS "${pch_header}") - if(CMAKE_GENERATOR MATCHES Visual) - set(${the_target}_pch 
"${CMAKE_CURRENT_SOURCE_DIR}/${pch_path}precomp.cpp") - add_native_precompiled_header(${the_target} ${pch_header}) - elseif(CMAKE_GENERATOR MATCHES Xcode) - add_native_precompiled_header(${the_target} ${pch_header}) - elseif(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_GENERATOR MATCHES "Makefiles|Ninja") - add_precompiled_header(${the_target} ${pch_header}) - endif() - endif() - unset(pch_header) - unset(pch_path) - unset(${the_target}_pch) + SET(pch_path "perf_cpu/perf_cpu_") + elseif("${the_target}" MATCHES "^opencv_perf_.*$") + SET(pch_path "perf/perf_") + else() + SET(pch_path "src/") + endif() + ocv_add_precompiled_header_to_target(${the_target} "${CMAKE_CURRENT_SOURCE_DIR}/${pch_path}precomp.hpp") + unset(pch_path) endmacro() # short command for adding simple OpenCV module @@ -563,7 +547,7 @@ macro(ocv_check_dependencies) endforeach() endmacro() -#auxiliary macro to parse arguments of ocv_add_accuracy_tests and ocv_add_perf_tests commands +# auxiliary macro to parse arguments of ocv_add_accuracy_tests and ocv_add_perf_tests commands macro(__ocv_parse_test_sources tests_type) set(OPENCV_${tests_type}_${the_module}_SOURCES "") set(OPENCV_${tests_type}_${the_module}_DEPS "") @@ -593,7 +577,7 @@ endmacro() # this is a command for adding OpenCV performance tests to the module # ocv_add_perf_tests() -macro(ocv_add_perf_tests) +function(ocv_add_perf_tests) set(perf_path "${CMAKE_CURRENT_SOURCE_DIR}/perf") if(BUILD_PERF_TESTS AND EXISTS "${perf_path}") __ocv_parse_test_sources(PERF ${ARGN}) @@ -604,7 +588,7 @@ macro(ocv_add_perf_tests) if(OCV_DEPENDENCIES_FOUND) set(the_target "opencv_perf_${name}") - #project(${the_target}) + # project(${the_target}) ocv_module_include_directories(${perf_deps} "${perf_path}") @@ -636,14 +620,14 @@ macro(ocv_add_perf_tests) add_dependencies(perf ${the_target}) endif() else(OCV_DEPENDENCIES_FOUND) - #TODO: warn about unsatisfied dependencies + # TODO: warn about unsatisfied dependencies endif(OCV_DEPENDENCIES_FOUND) endif() -endmacro() +endfunction() # this is a command for adding OpenCV accuracy/regression tests to the module # ocv_add_accuracy_tests([FILES ] [DEPENDS_ON] ) -macro(ocv_add_accuracy_tests) +function(ocv_add_accuracy_tests) set(test_path "${CMAKE_CURRENT_SOURCE_DIR}/test") ocv_check_dependencies(${test_deps}) if(BUILD_TESTS AND EXISTS "${test_path}") @@ -655,7 +639,7 @@ macro(ocv_add_accuracy_tests) if(OCV_DEPENDENCIES_FOUND) set(the_target "opencv_test_${name}") - #project(${the_target}) + # project(${the_target}) ocv_module_include_directories(${test_deps} "${test_path}") @@ -687,12 +671,12 @@ macro(ocv_add_accuracy_tests) ocv_add_precompiled_headers(${the_target}) else(OCV_DEPENDENCIES_FOUND) - #TODO: warn about unsatisfied dependencies + # TODO: warn about unsatisfied dependencies endif(OCV_DEPENDENCIES_FOUND) endif() -endmacro() +endfunction() -# internal macro; finds all link dependencies of module +# internal macro; finds all link dependencies of the module # should be used at the end of CMake processing macro(__ocv_track_module_link_dependencies the_module optkind) set(${the_module}_MODULE_DEPS_${optkind} "") @@ -742,7 +726,7 @@ macro(__ocv_track_module_link_dependencies the_module optkind) #ocv_list_reverse(${the_module}_EXTRA_DEPS_${optkind}) if(__has_cycle) - #not sure if it can work + # not sure if it can work list(APPEND ${the_module}_MODULE_DEPS_${optkind} "${the_module}") endif() @@ -754,13 +738,13 @@ macro(__ocv_track_module_link_dependencies the_module optkind) endif()#STATIC_LIBRARY unset(__module_type) 
-#message("${the_module}_MODULE_DEPS_${optkind}") -#message(" ${${the_module}_MODULE_DEPS_${optkind}}") -#message(" ${OPENCV_MODULE_${the_module}_DEPS}") -#message("") -#message("${the_module}_EXTRA_DEPS_${optkind}") -#message(" ${${the_module}_EXTRA_DEPS_${optkind}}") -#message("") + #message("${the_module}_MODULE_DEPS_${optkind}") + #message(" ${${the_module}_MODULE_DEPS_${optkind}}") + #message(" ${OPENCV_MODULE_${the_module}_DEPS}") + #message("") + #message("${the_module}_EXTRA_DEPS_${optkind}") + #message(" ${${the_module}_EXTRA_DEPS_${optkind}}") + #message("") endmacro() # creates lists of build dependencies needed for external projects diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake index 55b712c..16dbb06 100644 --- a/cmake/OpenCVPCHSupport.cmake +++ b/cmake/OpenCVPCHSupport.cmake @@ -1,4 +1,4 @@ -# taken from http://www.vtk.org/Bug/view.php?id=1260 and slightly adjusted +# taken from http://public.kitware.com/Bug/view.php?id=1260 and slightly adjusted # - Try to find precompiled headers support for GCC 3.4 and 4.x # Once done this will define: @@ -26,8 +26,8 @@ IF(CMAKE_COMPILER_IS_GNUCXX) SET(_PCH_include_prefix "-I") SET(_PCH_isystem_prefix "-isystem") -ELSEIF(WIN32) - SET(PCHSupport_FOUND TRUE) # for experimental msvc support +ELSEIF(CMAKE_GENERATOR MATCHES "^Visual.*$") + SET(PCHSupport_FOUND TRUE) SET(_PCH_include_prefix "/I") SET(_PCH_isystem_prefix "/I") ELSE() @@ -324,3 +324,17 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input) endif() ENDMACRO(ADD_NATIVE_PRECOMPILED_HEADER) + +macro(ocv_add_precompiled_header_to_target the_target pch_header) + if(PCHSupport_FOUND AND ENABLE_PRECOMPILED_HEADERS AND EXISTS "${pch_header}") + if(CMAKE_GENERATOR MATCHES Visual) + string(REGEX REPLACE "hpp$" "cpp" ${the_target}_pch "${pch_header}") + add_native_precompiled_header(${the_target} ${pch_header}) + unset(${the_target}_pch) + elseif(CMAKE_GENERATOR MATCHES Xcode) + add_native_precompiled_header(${the_target} ${pch_header}) + elseif(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_GENERATOR MATCHES "Makefiles|Ninja") + add_precompiled_header(${the_target} ${pch_header}) + endif() + endif() +endmacro() diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index aef7525..8c20906 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -384,6 +384,17 @@ macro(ocv_list_add_suffix LST SUFFIX) endmacro() +# gets and removes the first element from list +macro(ocv_list_pop_front LST VAR) + if(${LST}) + list(GET ${LST} 0 ${VAR}) + list(REMOVE_AT ${LST} 0) + else() + set(${VAR} "") + endif() +endmacro() + + # simple regex escaping routine (does not cover all cases!!!) macro(ocv_regex_escape var regex) string(REGEX REPLACE "([+.*^$])" "\\\\1" ${var} "${regex}") @@ -401,7 +412,7 @@ endmacro() # convert list of paths to full paths -macro(ocv_to_full_paths VAR) +macro(ocv_convert_to_full_paths VAR) if(${VAR}) set(__tmp "") foreach(path ${${VAR}}) diff --git a/doc/tutorials/introduction/android_binary_package/android_binary_package.rst b/doc/tutorials/introduction/android_binary_package/android_binary_package.rst index 65592ff..c25f32e 100644 --- a/doc/tutorials/introduction/android_binary_package/android_binary_package.rst +++ b/doc/tutorials/introduction/android_binary_package/android_binary_package.rst @@ -102,7 +102,7 @@ You need the following tools to be installed: #. **Eclipse IDE** Check the `Android SDK System Requirements `_ document for a list of Eclipse versions that are compatible with the Android SDK. 
- For OpenCV 2.4.0 we recommend Eclipse 3.7 (Indigo) or later versions. They work well for OpenCV under both Windows and Linux. + For OpenCV 2.4.x we recommend Eclipse 3.7 (Indigo) or later versions. They work well for OpenCV under both Windows and Linux. If you have no Eclipse installed, you can get it from the `download page <http://www.eclipse.org/downloads/>`_. @@ -154,12 +154,12 @@ Get the OpenCV package for Android development .. code-block:: bash - tar -jxvf ~/Downloads/OpenCV-2.4.0-android-bin.tar.bz2 + tar -jxvf ~/Downloads/OpenCV-2.4.1-android-bin2.tar.bz2 For this tutorial I have unpacked OpenCV to the :file:`C:\\Work\\android-opencv\\` directory. -.. |opencv_android_bin_pack| replace:: OpenCV-2.4.0-android-bin.tar.bz2 -.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.0/OpenCV-2.4.0-android-bin.tar.bz2/download +.. |opencv_android_bin_pack| replace:: OpenCV-2.4.1-android-bin2.tar.bz2 +.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.1/OpenCV-2.4.1-android-bin2.tar.bz2/download .. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack| .. |seven_zip| replace:: 7-Zip .. _seven_zip: http://www.7-zip.org/ @@ -214,7 +214,7 @@ Open OpenCV library and samples in Eclipse :align: center * Click :guilabel:`OK` to close preferences dialog. - + #. Import OpenCV and samples into workspace. OpenCV library is packed as a ready-for-use `Android Library Project diff --git a/modules/contrib/src/rgbdodometry.cpp b/modules/contrib/src/rgbdodometry.cpp index 81525f5..59f3254 100644 --- a/modules/contrib/src/rgbdodometry.cpp +++ b/modules/contrib/src/rgbdodometry.cpp @@ -48,10 +48,12 @@ #include #if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 3 -#include <Eigen/Core> -#include <unsupported/Eigen/MatrixFunctions> - -#include <Eigen/Dense> +# include <Eigen/Core> +# ifdef ANDROID + template <typename Scalar> Scalar log2(Scalar v) { using std::log; return log(v)/log(Scalar(2)); } +# endif +# include <unsupported/Eigen/MatrixFunctions> +# include <Eigen/Dense> #endif #include @@ -581,7 +583,7 @@ bool cv::RGBDOdometry( cv::Mat& Rt, const Mat& initRt, const double fy = levelCameraMatrix.at<double>(1,1); const double determinantThreshold = 1e-6; - Mat corresps( levelImage0.size(), levelImage0.type(), CV_32SC1 ); + Mat corresps( levelImage0.size(), levelImage0.type() ); // Run transformation search on current level iteratively. for( int iter = 0; iter < (*iterCountsPtr)[level]; iter ++ ) diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 469186a..f4b60fc 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -85,7 +85,6 @@ template<typename _Tp, int cn> class CV_EXPORTS Vec; template<typename _Tp, int m, int n> class CV_EXPORTS Matx; typedef std::string String; -typedef std::basic_string<wchar_t> WString; class Mat; class SparseMat; @@ -110,8 +109,12 @@ template<typename _Tp> class CV_EXPORTS MatIterator_; template<typename _Tp> class CV_EXPORTS MatConstIterator_; template<typename _Tp> class CV_EXPORTS MatCommaInitializer_; +#if !defined(ANDROID) || (defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_WCHAR_T) +typedef std::basic_string<wchar_t> WString; + CV_EXPORTS string fromUtf16(const WString& str); CV_EXPORTS WString toUtf16(const string& str); +#endif CV_EXPORTS string format( const char* fmt, ...
); CV_EXPORTS string tempfile( const char* suffix CV_DEFAULT(0)); diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp index d27fc07..edaadc8 100644 --- a/modules/core/src/persistence.cpp +++ b/modules/core/src/persistence.cpp @@ -151,7 +151,7 @@ cv::string cv::FileStorage::getDefaultObjectName(const string& _filename) namespace cv { -#if !defined(ANDROID) || defined(_GLIBCXX_USE_WCHAR_T) +#if !defined(ANDROID) || (defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_WCHAR_T) string fromUtf16(const WString& str) { cv::AutoBuffer<char> _buf(str.size()*4 + 1); diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp new file mode 100644 index 0000000..f413432 --- /dev/null +++ b/modules/gpu/perf/perf_core.cpp @@ -0,0 +1,1883 @@ +#include "perf_precomp.hpp" + +#ifdef HAVE_CUDA + +////////////////////////////////////////////////////////////////////// +// Merge + +GPU_PERF_TEST(Merge, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + std::vector<cv::gpu::GpuMat> src(channels); + for (int i = 0; i < channels; ++i) + src[i] = cv::gpu::GpuMat(size, depth, cv::Scalar::all(i)); + + cv::gpu::GpuMat dst; + + cv::gpu::merge(src, dst); + + TEST_CYCLE() + { + cv::gpu::merge(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Merge, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + testing::Values(2, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// Split + +GPU_PERF_TEST(Split, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + cv::gpu::GpuMat src(size, CV_MAKE_TYPE(depth, channels), cv::Scalar(1, 2, 3, 4)); + + std::vector<cv::gpu::GpuMat> dst; + + cv::gpu::split(src, dst); + + TEST_CYCLE() + { + cv::gpu::split(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Split, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + testing::Values(2, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// Add_Mat + +GPU_PERF_TEST(Add_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0.0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::add(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::add(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Add_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Add_Scalar + +GPU_PERF_TEST(Add_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s(1, 2, 3, 4); + cv::gpu::GpuMat dst; + +
cv::gpu::add(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::add(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Add_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Subtract_Mat + +GPU_PERF_TEST(Subtract_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0.0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::subtract(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::subtract(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Subtract_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Subtract_Scalar + +GPU_PERF_TEST(Subtract_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s(1, 2, 3, 4); + cv::gpu::GpuMat dst; + + cv::gpu::subtract(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::subtract(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Subtract_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Multiply_Mat + +GPU_PERF_TEST(Multiply_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0.0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::multiply(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::multiply(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Multiply_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Multiply_Scalar + +GPU_PERF_TEST(Multiply_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s(1, 2, 3, 4); + cv::gpu::GpuMat dst; + + cv::gpu::multiply(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::multiply(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Multiply_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Divide_Mat + +GPU_PERF_TEST(Divide_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + 
cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0.0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::divide(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::divide(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Divide_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Divide_Scalar + +GPU_PERF_TEST(Divide_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s(1, 2, 3, 4); + cv::gpu::GpuMat dst; + + cv::gpu::divide(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::divide(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Divide_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Divide_Scalar_Inv + +GPU_PERF_TEST(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + double scale = 100.0; + cv::gpu::GpuMat dst; + + cv::gpu::divide(scale, src, dst); + + TEST_CYCLE() + { + cv::gpu::divide(scale, src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Divide_Scalar_Inv, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// AbsDiff_Mat + +GPU_PERF_TEST(AbsDiff_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0.0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::absdiff(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::absdiff(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, AbsDiff_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// AbsDiff_Scalar + +GPU_PERF_TEST(AbsDiff_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s(1, 2, 3, 4); + cv::gpu::GpuMat dst; + + cv::gpu::absdiff(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::absdiff(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, AbsDiff_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, 
CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// Abs + +GPU_PERF_TEST(Abs, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::abs(src, dst); + + TEST_CYCLE() + { + cv::gpu::abs(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Abs, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Sqr + +GPU_PERF_TEST(Sqr, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::sqr(src, dst); + + TEST_CYCLE() + { + cv::gpu::sqr(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Sqr, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Sqrt + +GPU_PERF_TEST(Sqrt, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::sqrt(src, dst); + + TEST_CYCLE() + { + cv::gpu::sqrt(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Sqrt, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Log + +GPU_PERF_TEST(Log, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 1.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::log(src, dst); + + TEST_CYCLE() + { + cv::gpu::log(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Log, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Exp + +GPU_PERF_TEST(Exp, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 1.0, 10.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::exp(src, dst); + + TEST_CYCLE() + { + cv::gpu::exp(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Exp, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Pow + +GPU_PERF_TEST(Pow, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat 
src_host(size, depth); + fill(src_host, 1.0, 10.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::pow(src, 2.3, dst); + + TEST_CYCLE() + { + cv::gpu::pow(src, 2.3, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Pow, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16S, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Compare_Mat + +CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE) +#define ALL_CMP_CODES testing::Values(CmpCode(cv::CMP_EQ), CmpCode(cv::CMP_NE), CmpCode(cv::CMP_GT), CmpCode(cv::CMP_GE), CmpCode(cv::CMP_LT), CmpCode(cv::CMP_LE)) + +GPU_PERF_TEST(Compare_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int cmp_code = GET_PARAM(3); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::compare(src1, src2, dst, cmp_code); + + TEST_CYCLE() + { + cv::gpu::compare(src1, src2, dst, cmp_code); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Compare_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + ALL_CMP_CODES)); + +////////////////////////////////////////////////////////////////////// +// Compare_Scalar + +GPU_PERF_TEST(Compare_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int cmp_code = GET_PARAM(3); + + cv::Mat src_host(size, depth); + fill(src_host, 0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s = cv::Scalar::all(50); + cv::gpu::GpuMat dst; + + cv::gpu::compare(src, s, dst, cmp_code); + + TEST_CYCLE() + { + cv::gpu::compare(src, s, dst, cmp_code); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Compare_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + ALL_CMP_CODES)); + +////////////////////////////////////////////////////////////////////// +// Bitwise_Not + +GPU_PERF_TEST(Bitwise_Not, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_not(src, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_not(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_Not, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_And_Mat + +GPU_PERF_TEST(Bitwise_And_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_and(src1, src2, 
dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_and(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_And_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_And_Scalar + +GPU_PERF_TEST(Bitwise_And_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src_host(size, type); + fill(src_host, 0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s = cv::Scalar(50, 50, 50, 50); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_and(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_and(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_And_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S), + testing::Values(1, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_Or_Mat + +GPU_PERF_TEST(Bitwise_Or_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_or(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_or(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_Or_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_Or_Scalar + +GPU_PERF_TEST(Bitwise_Or_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src_host(size, type); + fill(src_host, 0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s = cv::Scalar(50, 50, 50, 50); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_or(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_or(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_Or_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S), + testing::Values(1, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_Xor_Mat + +GPU_PERF_TEST(Bitwise_Xor_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 100.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_xor(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_xor(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_Xor_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, 
CV_16U, CV_32S))); + +////////////////////////////////////////////////////////////////////// +// Bitwise_Xor_Scalar + +GPU_PERF_TEST(Bitwise_Xor_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src_host(size, type); + fill(src_host, 0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar s = cv::Scalar(50, 50, 50, 50); + cv::gpu::GpuMat dst; + + cv::gpu::bitwise_xor(src, s, dst); + + TEST_CYCLE() + { + cv::gpu::bitwise_xor(src, s, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Bitwise_Xor_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S), + testing::Values(1, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// RShift + +GPU_PERF_TEST(RShift, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src_host(size, type); + fill(src_host, 0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar_<int> val = cv::Scalar_<int>::all(4); + cv::gpu::GpuMat dst; + + cv::gpu::rshift(src, val, dst); + + TEST_CYCLE() + { + cv::gpu::rshift(src, val, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, RShift, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S), + testing::Values(1, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// LShift + +GPU_PERF_TEST(LShift, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int channels = GET_PARAM(3); + + int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src_host(size, type); + fill(src_host, 0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar_<int> val = cv::Scalar_<int>::all(4); + cv::gpu::GpuMat dst; + + cv::gpu::lshift(src, val, dst); + + TEST_CYCLE() + { + cv::gpu::lshift(src, val, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, LShift, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S), + testing::Values(1, 3, 4))); + +////////////////////////////////////////////////////////////////////// +// Min_Mat + +GPU_PERF_TEST(Min_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 255.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 255.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::min(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::min(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Min_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Min_Scalar + +GPU_PERF_TEST(Min_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + +
cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0, 255.0); + + cv::gpu::GpuMat src(src_host); + double val = 50.0; + cv::gpu::GpuMat dst; + + cv::gpu::min(src, val, dst); + + TEST_CYCLE() + { + cv::gpu::min(src, val, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Min_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Max_Mat + +GPU_PERF_TEST(Max_Mat, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src1_host(size, depth); + fill(src1_host, 0, 255.0); + + cv::Mat src2_host(size, depth); + fill(src2_host, 0, 255.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::max(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::max(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Max_Mat, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// Max_Scalar + +GPU_PERF_TEST(Max_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0, 255.0); + + cv::gpu::GpuMat src(src_host); + double val = 50.0; + cv::gpu::GpuMat dst; + + cv::gpu::max(src, val, dst); + + TEST_CYCLE() + { + cv::gpu::max(src, val, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Max_Scalar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F))); + +////////////////////////////////////////////////////////////////////// +// AddWeighted + +GPU_PERF_TEST(AddWeighted, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth1 = GET_PARAM(2); + int depth2 = GET_PARAM(3); + int dst_depth = GET_PARAM(4); + + cv::Mat src1_host(size, depth1); + fill(src1_host, 0, 100.0); + + cv::Mat src2_host(size, depth2); + fill(src2_host, 0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); + + TEST_CYCLE() + { + cv::gpu::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); + } +} + +INSTANTIATE_TEST_CASE_P(Core, AddWeighted, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F), + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// GEMM +#ifdef HAVE_CUBLAS + +CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T) +#define ALL_GEMM_FLAGS testing::Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T)) + +GPU_PERF_TEST(GEMM, cv::gpu::DeviceInfo, cv::Size, MatType, GemmFlags) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + 
cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + int flags = GET_PARAM(3); + + cv::Mat src1_host(size, type); + fill(src1_host, 0.0, 10.0); + + cv::Mat src2_host(size, type); + fill(src2_host, 0.0, 10.0); + + cv::Mat src3_host(size, type); + fill(src3_host, 0.0, 10.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat src3(src3_host); + cv::gpu::GpuMat dst; + + cv::gpu::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); + + declare.time(5.0); + + TEST_CYCLE() + { + cv::gpu::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); + } +} + +INSTANTIATE_TEST_CASE_P(Core, GEMM, testing::Combine( + ALL_DEVICES, + testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)), + testing::Values(CV_32FC1, CV_32FC2, CV_64FC1, CV_64FC2), + ALL_GEMM_FLAGS)); + +#endif +////////////////////////////////////////////////////////////////////// +// Transpose + +GPU_PERF_TEST(Transpose, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::transpose(src, dst); + + TEST_CYCLE() + { + cv::gpu::transpose(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Transpose, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_64FC1))); + +////////////////////////////////////////////////////////////////////// +// Flip + +enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1}; +CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y) +#define ALL_FLIP_CODES testing::Values(FlipCode(FLIP_BOTH), FlipCode(FLIP_X), FlipCode(FLIP_Y)) + +GPU_PERF_TEST(Flip, cv::gpu::DeviceInfo, cv::Size, MatType, FlipCode) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + int flipCode = GET_PARAM(3); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::flip(src, dst, flipCode); + + TEST_CYCLE() + { + cv::gpu::flip(src, dst, flipCode); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Flip, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4), + ALL_FLIP_CODES)); + +////////////////////////////////////////////////////////////////////// +// LUT_OneChannel + +GPU_PERF_TEST(LUT_OneChannel, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 100.0); + + cv::Mat lut(1, 256, CV_8UC1); + fill(lut, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::LUT(src, lut, dst); + + TEST_CYCLE() + { + cv::gpu::LUT(src, lut, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, LUT_OneChannel, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3))); + +////////////////////////////////////////////////////////////////////// +// LUT_MultiChannel + +GPU_PERF_TEST(LUT_MultiChannel, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + 
+ cv::Mat src_host(size, type); + fill(src_host, 0.0, 100.0); + + cv::Mat lut(1, 256, CV_MAKE_TYPE(CV_8U, src_host.channels())); + fill(lut, 0.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::LUT(src, lut, dst); + + TEST_CYCLE() + { + cv::gpu::LUT(src, lut, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, LUT_MultiChannel, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC3))); + +////////////////////////////////////////////////////////////////////// +// Magnitude_Complex + +GPU_PERF_TEST(Magnitude_Complex, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + + cv::Mat src_host(size, CV_32FC2); + fill(src_host, -100.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::magnitude(src, dst); + + TEST_CYCLE() + { + cv::gpu::magnitude(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Magnitude_Complex, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// +// Magnitude_Sqr_Complex + +GPU_PERF_TEST(Magnitude_Sqr_Complex, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + + cv::Mat src_host(size, CV_32FC2); + fill(src_host, -100.0, 100.0); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + + cv::gpu::magnitudeSqr(src, dst); + + TEST_CYCLE() + { + cv::gpu::magnitudeSqr(src, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Magnitude_Sqr_Complex, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// +// Magnitude + +GPU_PERF_TEST(Magnitude, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + + cv::Mat src1_host(size, CV_32FC1); + fill(src1_host, -100.0, 100.0); + + cv::Mat src2_host(size, CV_32FC1); + fill(src2_host, -100.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::magnitude(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::magnitude(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Magnitude, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// +// Magnitude_Sqr + +GPU_PERF_TEST(Magnitude_Sqr, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + + cv::Mat src1_host(size, CV_32FC1); + fill(src1_host, -100.0, 100.0); + + cv::Mat src2_host(size, CV_32FC1); + fill(src2_host, -100.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::magnitudeSqr(src1, src2, dst); + + TEST_CYCLE() + { + cv::gpu::magnitudeSqr(src1, src2, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Magnitude_Sqr, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// +// Phase + +IMPLEMENT_PARAM_CLASS(AngleInDegrees, bool) + +GPU_PERF_TEST(Phase, cv::gpu::DeviceInfo, cv::Size, AngleInDegrees) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + bool angleInDegrees = GET_PARAM(2); + + cv::Mat 
src1_host(size, CV_32FC1); + fill(src1_host, -100.0, 100.0); + + cv::Mat src2_host(size, CV_32FC1); + fill(src2_host, -100.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat dst; + + cv::gpu::phase(src1, src2, dst, angleInDegrees); + + TEST_CYCLE() + { + cv::gpu::phase(src1, src2, dst, angleInDegrees); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Phase, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(false, true))); + +////////////////////////////////////////////////////////////////////// +// CartToPolar + +GPU_PERF_TEST(CartToPolar, cv::gpu::DeviceInfo, cv::Size, AngleInDegrees) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + bool angleInDegrees = GET_PARAM(2); + + cv::Mat src1_host(size, CV_32FC1); + fill(src1_host, -100.0, 100.0); + + cv::Mat src2_host(size, CV_32FC1); + fill(src2_host, -100.0, 100.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + cv::gpu::GpuMat magnitude; + cv::gpu::GpuMat angle; + + cv::gpu::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); + + TEST_CYCLE() + { + cv::gpu::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); + } +} + +INSTANTIATE_TEST_CASE_P(Core, CartToPolar, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(false, true))); + +////////////////////////////////////////////////////////////////////// +// PolarToCart + +GPU_PERF_TEST(PolarToCart, cv::gpu::DeviceInfo, cv::Size, AngleInDegrees) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + bool angleInDegrees = GET_PARAM(2); + + cv::Mat magnitude_host(size, CV_32FC1); + fill(magnitude_host, 0.0, 100.0); + + cv::Mat angle_host(size, CV_32FC1); + fill(angle_host, 0.0, angleInDegrees ? 
360.0 : 2 * CV_PI); + + cv::gpu::GpuMat magnitude(magnitude_host); + cv::gpu::GpuMat angle(angle_host); + cv::gpu::GpuMat x; + cv::gpu::GpuMat y; + + cv::gpu::polarToCart(magnitude, angle, x, y, angleInDegrees); + + TEST_CYCLE() + { + cv::gpu::polarToCart(magnitude, angle, x, y, angleInDegrees); + } +} + +INSTANTIATE_TEST_CASE_P(Core, PolarToCart, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(false, true))); + +////////////////////////////////////////////////////////////////////// +// MeanStdDev + +GPU_PERF_TEST(MeanStdDev, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + + cv::Mat src_host(size, CV_8UC1); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar mean; + cv::Scalar stddev; + cv::gpu::GpuMat buf; + + cv::gpu::meanStdDev(src, mean, stddev, buf); + + TEST_CYCLE() + { + cv::gpu::meanStdDev(src, mean, stddev, buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, MeanStdDev, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// +// Norm + +GPU_PERF_TEST(Norm, cv::gpu::DeviceInfo, cv::Size, MatDepth, NormType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + int normType = GET_PARAM(3); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + double dst; + cv::gpu::GpuMat buf; + + dst = cv::gpu::norm(src, normType, buf); + + TEST_CYCLE() + { + dst = cv::gpu::norm(src, normType, buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Norm, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32S, CV_32F), + testing::Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2)))); + +////////////////////////////////////////////////////////////////////// +// NormDiff + +GPU_PERF_TEST(NormDiff, cv::gpu::DeviceInfo, cv::Size, NormType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int normType = GET_PARAM(2); + + cv::Mat src1_host(size, CV_8UC1); + fill(src1_host, 0.0, 255.0); + + cv::Mat src2_host(size, CV_8UC1); + fill(src2_host, 0.0, 255.0); + + cv::gpu::GpuMat src1(src1_host); + cv::gpu::GpuMat src2(src2_host); + double dst; + + dst = cv::gpu::norm(src1, src2, normType); + + TEST_CYCLE() + { + dst = cv::gpu::norm(src1, src2, normType); + } +} + +INSTANTIATE_TEST_CASE_P(Core, NormDiff, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2)))); + +////////////////////////////////////////////////////////////////////// +// Sum + +GPU_PERF_TEST(Sum, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar dst; + cv::gpu::GpuMat buf; + + dst = cv::gpu::sum(src, buf); + + TEST_CYCLE() + { + dst = cv::gpu::sum(src, buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Sum, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4))); + 
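All of the Core benchmarks in this file share one shape: fill a host cv::Mat with random values, upload it to a cv::gpu::GpuMat, run the operation once as an untimed warm-up (so dst allocation and first-use CUDA costs stay out of the measurement), then time only the repeated calls inside TEST_CYCLE. A minimal sketch of that recurring pattern, for reference while reading the remaining tests (MyOp and cv::gpu::myOp are illustrative placeholders, not functions from this patch):

// Sketch of the recurring perf-test shape; MyOp / cv::gpu::myOp are
// placeholder names used only to illustrate the pattern.
GPU_PERF_TEST(MyOp, cv::gpu::DeviceInfo, cv::Size, MatDepth)
{
    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
    cv::gpu::setDevice(devInfo.deviceID());   // bind the CUDA device under test

    cv::Size size = GET_PARAM(1);
    int depth = GET_PARAM(2);

    cv::Mat src_host(size, depth);
    fill(src_host, 0.0, 255.0);               // randomize the input on the host

    cv::gpu::GpuMat src(src_host);            // upload once, outside the timed loop
    cv::gpu::GpuMat dst;

    cv::gpu::myOp(src, dst);                  // warm-up call: dst allocation and
                                              // first-use overhead are not timed
    TEST_CYCLE()
    {
        cv::gpu::myOp(src, dst);              // only these iterations are measured
    }
}

INSTANTIATE_TEST_CASE_P(Core, MyOp, testing::Combine(
    ALL_DEVICES,                              // every CUDA device found at run time
    GPU_TYPICAL_MAT_SIZES,                    // the standard benchmark resolutions
    testing::Values(CV_8U, CV_32F)));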
+////////////////////////////////////////////////////////////////////// +// Sum_Abs + +GPU_PERF_TEST(Sum_Abs, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar dst; + cv::gpu::GpuMat buf; + + dst = cv::gpu::absSum(src, buf); + + TEST_CYCLE() + { + dst = cv::gpu::absSum(src, buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Sum_Abs, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4))); + +////////////////////////////////////////////////////////////////////// +// Sum_Sqr + +GPU_PERF_TEST(Sum_Sqr, cv::gpu::DeviceInfo, cv::Size, MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::Mat src_host(size, type); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + cv::Scalar dst; + cv::gpu::GpuMat buf; + + dst = cv::gpu::sqrSum(src, buf); + + TEST_CYCLE() + { + dst = cv::gpu::sqrSum(src, buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, Sum_Sqr, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4))); + +////////////////////////////////////////////////////////////////////// +// MinMax + +GPU_PERF_TEST(MinMax, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + double minVal, maxVal; + cv::gpu::GpuMat buf; + + cv::gpu::minMax(src, &minVal, &maxVal, cv::gpu::GpuMat(), buf); + + TEST_CYCLE() + { + cv::gpu::minMax(src, &minVal, &maxVal, cv::gpu::GpuMat(), buf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, MinMax, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// MinMaxLoc + +GPU_PERF_TEST(MinMaxLoc, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 255.0); + + cv::gpu::GpuMat src(src_host); + double minVal, maxVal; + cv::Point minLoc, maxLoc; + cv::gpu::GpuMat valbuf, locbuf; + + cv::gpu::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), valbuf, locbuf); + + TEST_CYCLE() + { + cv::gpu::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), valbuf, locbuf); + } +} + +INSTANTIATE_TEST_CASE_P(Core, MinMaxLoc, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8U, CV_16U, CV_32F, CV_64F))); + +////////////////////////////////////////////////////////////////////// +// CountNonZero + +GPU_PERF_TEST(CountNonZero, cv::gpu::DeviceInfo, cv::Size, MatDepth) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Size size = GET_PARAM(1); + int depth = GET_PARAM(2); + + cv::Mat src_host(size, depth); + fill(src_host, 0.0, 1.5); + + cv::gpu::GpuMat 
src(src_host);
+    int dst;
+    cv::gpu::GpuMat buf;
+
+    dst = cv::gpu::countNonZero(src, buf);
+
+    TEST_CYCLE()
+    {
+        dst = cv::gpu::countNonZero(src, buf);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Core, CountNonZero, testing::Combine(
+    ALL_DEVICES,
+    GPU_TYPICAL_MAT_SIZES,
+    testing::Values(CV_8U, CV_16U, CV_32F, CV_64F)));
+
+//////////////////////////////////////////////////////////////////////
+// Reduce
+
+CV_ENUM(ReduceCode, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+#define ALL_REDUCE_CODES testing::Values(CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+
+enum {Rows = 0, Cols = 1};
+CV_ENUM(ReduceDim, Rows, Cols)
+
+GPU_PERF_TEST(Reduce, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, ReduceCode, ReduceDim)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Size size = GET_PARAM(1);
+    int depth = GET_PARAM(2);
+    int channels = GET_PARAM(3);
+    int reduceOp = GET_PARAM(4);
+    int dim = GET_PARAM(5);
+
+    int type = CV_MAKE_TYPE(depth, channels);
+
+    cv::Mat src_host(size, type);
+    fill(src_host, 0.0, 10.0);
+
+    cv::gpu::GpuMat src(src_host);
+    cv::gpu::GpuMat dst;
+
+    cv::gpu::reduce(src, dst, dim, reduceOp);
+
+    TEST_CYCLE()
+    {
+        cv::gpu::reduce(src, dst, dim, reduceOp);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Core, Reduce, testing::Combine(
+    ALL_DEVICES,
+    GPU_TYPICAL_MAT_SIZES,
+    testing::Values(CV_8U, CV_16U, CV_16S, CV_32F),
+    testing::Values(1, 2, 3, 4),
+    ALL_REDUCE_CODES,
+    testing::Values(ReduceDim(Rows), ReduceDim(Cols))));
+
+#endif
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 8270dfe..17fa6b1 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -1005,11 +1005,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
         for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
-            int imgIdx = *imgIdx_ptr;
-            float distance = *distance_ptr;
+            int _trainIdx = *trainIdx_ptr;
+            int _imgIdx = *imgIdx_ptr;
+            float _distance = *distance_ptr;

-            DMatch m(queryIdx, trainIdx, imgIdx, distance);
+            DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

             curMatches.push_back(m);
         }
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu
index f59cef0..19dc9df 100644
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -45,19 +45,19 @@
 #include "opencv2/gpu/device/vec_distance.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"

-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
-    namespace bf_knnmatch
+    namespace bf_knnmatch
     {
         ///////////////////////////////////////////////////////////////////////////////
         // Reduction

-        template <int BLOCK_SIZE>
-        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
-                                      int& bestTrainIdx1, int& bestTrainIdx2,
+        template <int BLOCK_SIZE>
+        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                       float* s_distance, int* s_trainIdx)
         {
-            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance1 = numeric_limits<float>::max();
             float myBestDistance2 = numeric_limits<float>::max();
             int myBestTrainIdx1 = -1;
             int myBestTrainIdx2 = -1;
@@ -122,13 +122,13 @@ namespace cv { namespace gpu { namespace device
             bestTrainIdx2 = myBestTrainIdx2;
         }

-        template <int BLOCK_SIZE>
-        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
-                                      int& bestTrainIdx1, int& bestTrainIdx2,
-
int& bestImgIdx1, int& bestImgIdx2, + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { - float myBestDistance1 = numeric_limits::max(); + float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; int myBestTrainIdx2 = -1; @@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match Unrolled Cached - template + template __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) { #pragma unroll @@ -219,11 +219,11 @@ namespace cv { namespace gpu { namespace device } } - template - __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, int& bestImgIdx1, int& bestImgIdx2) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device } } - template + template __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -313,9 +313,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -330,7 +330,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - template + template __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -374,9 +374,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -394,11 +394,11 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match Unrolled - template - __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& 
bestTrainIdx1, int& bestTrainIdx2, + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, int& bestImgIdx1, int& bestImgIdx2) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -459,7 +459,7 @@ namespace cv { namespace gpu { namespace device } } - template + template __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -490,9 +490,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -507,7 +507,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - template + template __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -549,9 +549,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -569,11 +569,11 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match - template - __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, + template + __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, int& bestImgIdx1, int& bestImgIdx2) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -633,7 +633,7 @@ namespace cv { namespace gpu { namespace device } } - template + template __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -664,9 +664,9 @@ namespace cv { namespace gpu { namespace device } } - template - void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -681,7 +681,7 @@ 
namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - template + template __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -723,9 +723,9 @@ namespace cv { namespace gpu { namespace device } } - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -743,9 +743,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // knnMatch 2 dispatcher - template - void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -761,11 +761,11 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); }*/ else @@ -774,9 +774,9 @@ namespace cv { namespace gpu { namespace device } } - template - void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -792,11 +792,11 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); }*/ else @@ -832,7 +832,7 @@ namespace cv { namespace gpu { namespace device s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; } else - { + { s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; } @@ -857,7 +857,7 @@ namespace cv { namespace gpu { namespace device } } - template + template void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& 
mask, const DevMem2Df& allDist, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace device s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; } else - { + { s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; } @@ -920,7 +920,7 @@ namespace cv { namespace gpu { namespace device } } - template + template void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -938,9 +938,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Calc Distance dispatcher - template - void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Df& allDist, + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Df& allDist, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -956,11 +956,11 @@ namespace cv { namespace gpu { namespace device calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); } else if (query.cols <= 512) - { + { calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); } else if (query.cols <= 1024) - { + { calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); }*/ else @@ -972,7 +972,7 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // find knn match kernel - template + template __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) { const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; @@ -985,7 +985,7 @@ namespace cv { namespace gpu { namespace device float dist = numeric_limits::max(); int bestIdx = -1; - + for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) { float reg = allDistRow[i]; @@ -1013,7 +1013,7 @@ namespace cv { namespace gpu { namespace device } } - template + template void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, 1, 1); @@ -1038,8 +1038,8 @@ namespace cv { namespace gpu { namespace device // knn match Dispatcher template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { if (k == 2) @@ -1051,13 +1051,13 @@ namespace cv { namespace gpu { namespace device calcDistanceDispatcher(query, train, mask, allDist, cc, stream); findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); } - } + } /////////////////////////////////////////////////////////////////////////////// // knn match caller - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { if (mask.data) @@ -1073,7 +1073,7 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { @@ -1091,7 +1091,7 @@ namespace cv { namespace gpu { namespace device template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { if (mask.data) @@ -1106,8 +1106,8 @@ namespace cv { namespace gpu { namespace device //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu(const DevMem2Db& queryDescs, const 
DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream) { if (masks.data) @@ -1123,8 +1123,8 @@ namespace cv { namespace gpu { namespace device template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream) { if (masks.data) @@ -1140,8 +1140,8 @@ namespace cv { namespace gpu { namespace device //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream) { if (masks.data) diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index ad8cc88..0248dbf 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -45,14 +45,14 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace bf_match + namespace bf_match { /////////////////////////////////////////////////////////////////////////////// // Reduction - template + template __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) { s_distance += threadIdx.y * BLOCK_SIZE; @@ -66,7 +66,7 @@ namespace cv { namespace gpu { namespace device reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } - template + template __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) { s_distance += threadIdx.y * BLOCK_SIZE; @@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device 
/////////////////////////////////////////////////////////////////////////////// // Match Unrolled Cached - template + template __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) { #pragma unroll @@ -96,9 +96,9 @@ namespace cv { namespace gpu { namespace device } } - template + template __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query,volatile int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device } } - template + template __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -173,9 +173,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -190,8 +190,8 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - template - __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -232,9 +232,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -252,9 +252,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match Unrolled - template + template __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query,volatile int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -314,7 +314,7 @@ namespace cv { namespace gpu { namespace device typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); @@ -331,9 +331,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& 
train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -349,7 +349,7 @@ namespace cv { namespace gpu { namespace device } template - __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -364,7 +364,7 @@ namespace cv { namespace gpu { namespace device typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); Mask m = mask; - + for (int imgIdx = 0; imgIdx < n; ++imgIdx) { const DevMem2D_ train = trains[imgIdx]; @@ -388,9 +388,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -408,9 +408,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match - template + template __device__ void loop(int queryIdx, const DevMem2D_& query, volatile int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) @@ -469,7 +469,7 @@ namespace cv { namespace gpu { namespace device typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); @@ -486,9 +486,9 @@ namespace cv { namespace gpu { namespace device } } - template - void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -504,7 +504,7 @@ namespace cv { namespace gpu { namespace device } template - __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -542,9 +542,9 @@ namespace cv { namespace gpu { namespace device } } - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + 
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -562,9 +562,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match dispatcher - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -580,11 +580,11 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); }*/ else @@ -593,9 +593,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -611,11 +611,11 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); }*/ else @@ -627,20 +627,20 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match caller - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (mask.data) { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, cc, stream); } else { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, cc, stream); } } @@ -652,20 +652,20 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& 
train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (mask.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, cc, stream); } } @@ -677,20 +677,20 @@ namespace cv { namespace gpu { namespace device //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (mask.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, cc, stream); } } @@ -701,20 +701,20 @@ namespace cv { namespace gpu { namespace device //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (masks.data) { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, cc, stream); } else { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, + 
matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, cc, stream); } } @@ -726,20 +726,20 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (masks.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, cc, stream); } } @@ -751,20 +751,20 @@ namespace cv { namespace gpu { namespace device //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream) { if (masks.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, cc, stream); } } diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 39b721a..015ac0b 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -45,9 +45,9 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include 
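The runs of near-identical "template void matchL1_gpu(...)" lines above are explicit instantiations whose distinguishing type arguments were also lost; upstream each line names a different element type (uchar, ushort, short, int, float, with some variants left commented out, as the leading // markers show). One line restored for illustration; the DevMem2D_<PtrStepb> mask-collection type is an assumption:

    template void matchL1_gpu<uchar>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
                                     int cc, cudaStream_t stream);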
"opencv2/gpu/device/datamov_utils.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace bf_radius_match + namespace bf_radius_match { /////////////////////////////////////////////////////////////////////////////// // Match Unrolled @@ -112,8 +112,8 @@ namespace cv { namespace gpu { namespace device #endif } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -121,17 +121,17 @@ namespace cv { namespace gpu { namespace device const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, 0, train, maxDistance, mask, + matchUnrolled<<>>(query, 0, train, maxDistance, mask, trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); - } + } - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -146,12 +146,12 @@ namespace cv { namespace gpu { namespace device if (masks != 0 && masks[i].data) { - matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } else { - matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), + matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } cudaSafeCall( cudaGetLastError() ); @@ -223,9 +223,9 @@ namespace cv { namespace gpu { namespace device #endif } - template - void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template + void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - match<<>>(query, 0, train, maxDistance, mask, + match<<>>(query, 0, train, maxDistance, mask, trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); cudaSafeCall( cudaGetLastError() ); @@ -241,9 +241,9 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, 
const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); @@ -258,12 +258,12 @@ namespace cv { namespace gpu { namespace device if (masks != 0 && masks[i].data) { - match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } else { - match<<>>(query, i, train, maxDistance, WithOutMask(), + match<<>>(query, i, train, maxDistance, WithOutMask(), trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } cudaSafeCall( cudaGetLastError() ); @@ -276,9 +276,9 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Match dispatcher - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -294,11 +294,11 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); }*/ else @@ -307,9 +307,9 @@ namespace cv { namespace gpu { namespace device } } - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { if (query.cols <= 64) @@ -325,36 +325,36 @@ namespace cv { namespace gpu { namespace device matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); } else if (query.cols <= 512) - { + { matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); } else if (query.cols <= 1024) - { + { matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); }*/ else { match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); } - } + } /////////////////////////////////////////////////////////////////////////////// // Radius Match caller - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { if 
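The multi-image variants loop over the train descriptors and choose a masked or unmasked launch per image; with the stripped template arguments restored, the loop body reads roughly as below (grid, block and smemSize as in the single-train wrapper; the launch's argument list mirrors the fragments visible above):

    for (int i = 0; i < n; ++i)
    {
        const DevMem2D_<T> train = trains[i];

        if (masks != 0 && masks[i].data)
        {
            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(
                query, i, train, maxDistance, SingleMask(masks[i]),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        else
        {
            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(
                query, i, train, maxDistance, WithOutMask(),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        cudaSafeCall( cudaGetLastError() ); // check each launch before the next image
    }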
(mask.data) { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, cc, stream); } else { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, cc, stream); } } @@ -366,20 +366,20 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { if (mask.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, cc, stream); } } @@ -391,20 +391,20 @@ namespace cv { namespace gpu { namespace device //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { if (mask.data) { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, 
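Throughout the radius-match callers the nMatches parameter appears as a bare DevMem2D_&; it is the per-query counter of neighbours found within maxDistance, and its lost element type is almost certainly unsigned int, matching the unsigned int counts read back elsewhere in this commit. One caller signature restored under that assumption:

    template <typename T>
    void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
                     const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                     int cc, cudaStream_t stream);

The body then wraps query and train into DevMem2D_<T> and branches on mask.data exactly as the k = 1 matchers do.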
SingleMask(mask), + trainIdx, distance, nMatches, cc, stream); } else { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, cc, stream); } } @@ -415,12 +415,12 @@ namespace cv { namespace gpu { namespace device //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, cc, stream); } @@ -431,12 +431,12 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, cc, stream); } @@ -447,12 +447,12 @@ namespace cv { namespace gpu { namespace device //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, 
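The multi-train radius callers forward straight to matchDispatcher with no mask branch, since the per-image masks travel as a plain pointer. The L1 caller with its eaten <T> arguments restored; every other token survives above, so only the angle-bracketed parts are reconstructed:

    template <typename T>
    void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                     int cc, cudaStream_t stream)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
                                     trainIdx, imgIdx, distance, nMatches,
                                     cc, stream);
    }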
float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream) { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, cc, stream); } diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu index b6d13e1..d09268a 100644 --- a/modules/gpu/src/cuda/bilateral_filter.cu +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -43,9 +43,9 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace bilateral_filter + namespace bilateral_filter { __constant__ float* ctable_color; __constant__ float* ctable_space; @@ -108,7 +108,7 @@ namespace cv { namespace gpu { namespace device dp[3] = *(disp + (y+1) * disp_step + x + 0); dp[4] = *(disp + (y ) * disp_step + x + 1); - if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) + if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) { const int ymin = ::max(0, y - cradius); const int xmin = ::max(0, x - cradius); @@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace device } } - template + template void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) { dim3 threads(32, 8, 1); diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu index 02e9649..e716edc 100644 --- a/modules/gpu/src/cuda/blend.cu +++ b/modules/gpu/src/cuda/blend.cu @@ -42,9 +42,9 @@ #include "internal_shared.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace blend + namespace blend { template __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, @@ -62,14 +62,14 @@ namespace cv { namespace gpu { namespace device T p2 = img2.ptr(y)[x]; result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); } - } + } template void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) { dim3 threads(16, 16); dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); - + blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); cudaSafeCall( cudaGetLastError() ); @@ -105,12 +105,12 @@ namespace cv { namespace gpu { namespace device { dim3 threads(16, 16); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - + blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); 
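The blend.cu hunk keeps the arithmetic intact: each output element is the weight-normalised sum (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f), where the 1e-5f guards against division by zero where both weight maps vanish. The kernel with its <T> restored; the x / cn step for indexing the per-pixel (not per-channel) weight maps is an assumption consistent with the launch passing cols * cn as the column count:

    template <typename T>
    __global__ void blendLinearKernel(int rows, int cols, int cn,
                                      const PtrStep<T> img1, const PtrStep<T> img2,
                                      const PtrStepf weights1, const PtrStepf weights2,
                                      PtrStep<T> result)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x; // channel-interleaved column
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            const int x_ = x / cn;                  // weights are stored per pixel
            const float w1 = weights1.ptr(y)[x_];
            const float w2 = weights2.ptr(y)[x_];
            const T p1 = img1.ptr(y)[x];
            const T p2 = img2.ptr(y)[x];
            result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
        }
    }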
cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall(cudaDeviceSynchronize()); } - } // namespace blend + } // namespace blend }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index e296aeb..4776c55 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -44,7 +44,7 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 4c7c5c7..f9dd490 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -44,9 +44,9 @@ #include #include "internal_shared.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace canny + namespace canny { __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) { @@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device } }; - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, + template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) { __shared__ int sdx[18][16]; @@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace device } ////////////////////////////////////////////////////////////////////////////////////////// - + #define CANNY_SHIFT 15 #define TG22 (int)(0.4142135623730950488016887242097*(1< high_thresh); } } - + map.ptr(i + 1)[j + 1] = edge_type; } } @@ -270,7 +270,7 @@ namespace cv { namespace gpu { namespace device const int tid = threadIdx.y * 16 + threadIdx.x; const int lx = tid % 18; - const int ly = tid / 18; + const int ly = tid / 18; if (ly < 14) smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; @@ -294,10 +294,10 @@ namespace cv { namespace gpu { namespace device n += smem[threadIdx.y ][threadIdx.x ] == 2; n += smem[threadIdx.y ][threadIdx.x + 1] == 2; n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; @@ -318,10 +318,10 @@ namespace cv { namespace gpu { namespace device n += smem[threadIdx.y ][threadIdx.x ] == 1; n += smem[threadIdx.y ][threadIdx.x + 1] == 1; n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; @@ -361,7 +361,7 @@ namespace cv { namespace gpu { namespace device #if __CUDA_ARCH__ >= 120 const int stack_size = 512; - + __shared__ unsigned int s_counter; __shared__ unsigned int s_ind; __shared__ ushort2 s_st[stack_size]; @@ -404,11 +404,11 @@ namespace cv { namespace gpu { namespace device if (subTaskIdx < portion) pos = s_st[s_counter - 1 - subTaskIdx]; __syncthreads(); - + if (threadIdx.x == 0) s_counter -= portion; __syncthreads(); - + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) { pos.x += c_dx[threadIdx.x & 7]; @@ -452,7 +452,7 @@ namespace cv { namespace gpu { 
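The TG22 definition in the canny.cu hunk above is truncated at "(1<". The constant 0.4142135623730950488016887242097 is tan(22.5 deg), and the macro scales it into fixed point with CANNY_SHIFT = 15 fractional bits so the gradient-direction sector tests run in integer arithmetic, avoiding an atan2 per pixel. Restored form (the trailing + 0.5 rounding term is an assumption):

    #define CANNY_SHIFT 15
    #define TG22 (int)(0.4142135623730950488016887242097 * (1 << CANNY_SHIFT) + 0.5)

A comparison such as (dy << CANNY_SHIFT) < dx * TG22 then asks whether the gradient angle lies below 22.5 degrees, i.e. whether non-maximum suppression for that pixel should compare along the horizontal neighbours.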
namespace device { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - + unsigned int count; cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 5184222..562f3ff 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -45,7 +45,7 @@ #include #include -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) { @@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace device { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; - }; + }; OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) { diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index d00bec8..624bd3f 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -48,9 +48,9 @@ #include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/static_check.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace column_filter + namespace column_filter { #define MAX_KERNEL_SIZE 32 @@ -146,7 +146,7 @@ namespace cv { namespace gpu { namespace device const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - + B brd(src.rows); linearColumnFilter<<>>(src, dst, anchor, brd); @@ -162,7 +162,7 @@ namespace cv { namespace gpu { namespace device { typedef void (*caller_t)(DevMem2D_ src, DevMem2D_ dst, int anchor, int cc, cudaStream_t stream); - static const caller_t callers[5][33] = + static const caller_t callers[5][33] = { { 0, @@ -338,9 +338,9 @@ namespace cv { namespace gpu { namespace device linearColumnFilter_caller<30, T, D, BrdColWrap>, linearColumnFilter_caller<31, T, D, BrdColWrap>, linearColumnFilter_caller<32, T, D, BrdColWrap> - } + } }; - + loadKernel(kernel, ksize); callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, cc, stream); diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu index 3397672..a54a9b7 100644 --- a/modules/gpu/src/cuda/copy_make_border.cu +++ b/modules/gpu/src/cuda/copy_make_border.cu @@ -43,9 +43,9 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace imgproc + namespace imgproc { template __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_ dst, int top, int left) { @@ -58,9 +58,9 @@ namespace cv { namespace gpu { namespace device template
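The column_filter.cu hunk above shows the tail of a 5 x 33 function-pointer table that turns two runtime values, border mode and kernel size, into a call to one fully specialized template instance. A sketch of the surrounding dispatch function, assuming the upstream name linearColumnFilter_gpu and that loadKernel copies the filter taps into __constant__ memory:

    template <typename T, typename D>
    void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float* kernel, int ksize,
                                int anchor, int brd_type, int cc, cudaStream_t stream)
    {
        typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);

        // 5 border modes (Reflect101, Replicate, Constant, Reflect, Wrap) by
        // kernel sizes 1..32; index 0 of each row is the null sentinel seen above
        static const caller_t callers[5][33] =
        {
            /* rows elided; each entry is linearColumnFilter_caller<KSIZE, T, D, Brd> */
        };

        loadKernel(kernel, ksize);
        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
    }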