From 926a6bba0025c88ba4db155a45ce3912730314ee Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 31 May 2011 08:31:10 +0000 Subject: [PATCH] modified according to CUDA 4.0 API updates --- modules/gpu/CMakeLists.txt_cuda32 | 236 ----------------- modules/gpu/CMakeLists.txt_cuda4.0 | 240 ------------------ modules/gpu/FindNPP.cmake | 125 --------- modules/gpu/include/opencv2/gpu/gpu.hpp | 247 +++++++----------- modules/gpu/src/arithm.cpp | 128 +++++----- modules/gpu/src/bilateral_filter.cpp | 23 +- modules/gpu/src/blend.cpp | 15 +- modules/gpu/src/brute_force_matcher.cpp | 76 +++--- modules/gpu/src/calib3d.cpp | 50 +--- modules/gpu/src/color.cpp | 10 +- modules/gpu/src/cuda/blend.cu | 22 +- modules/gpu/src/cuda/brute_force_matcher.cu | 281 +++++++++++---------- modules/gpu/src/cuda/calib3d.cu | 4 +- modules/gpu/src/cuda/element_operations.cu | 20 +- modules/gpu/src/cuda/filters.cu | 78 +++--- modules/gpu/src/cuda/hog.cu | 20 +- modules/gpu/src/cuda/imgproc.cu | 36 +-- modules/gpu/src/cuda/internal_shared.hpp | 37 +++ modules/gpu/src/cuda/match_template.cu | 32 +-- modules/gpu/src/cuda/mathfunc.cu | 4 +- modules/gpu/src/cuda/matrix_operations.cu | 6 +- modules/gpu/src/cuda/matrix_reductions.cu | 56 ++-- modules/gpu/src/cuda/split_merge.cu | 12 +- modules/gpu/src/cuda/stereobm.cu | 26 +- modules/gpu/src/cuda/stereobp.cu | 20 +- modules/gpu/src/cuda/stereocsbp.cu | 12 +- modules/gpu/src/cuda/surf.cu | 16 +- modules/gpu/src/cudastream.cpp | 37 ++- modules/gpu/src/element_operations.cpp | 309 ++++++++--------------- modules/gpu/src/filtering.cpp | 219 ++++++++++------ modules/gpu/src/graphcuts.cpp | 16 +- modules/gpu/src/imgproc_gpu.cpp | 259 +++++++++---------- modules/gpu/src/matrix_operations.cpp | 18 +- modules/gpu/src/matrix_reductions.cpp | 24 +- modules/gpu/src/opencv2/gpu/device/transform.hpp | 8 +- modules/gpu/src/precomp.hpp | 4 +- modules/gpu/src/split_merge.cpp | 46 +--- modules/gpu/src/stereobm.cpp | 11 +- modules/gpu/src/stereobp.cpp | 91 ++++--- 
modules/gpu/src/stereocsbp.cpp | 80 +++--- 40 files changed, 1135 insertions(+), 1819 deletions(-) delete mode 100644 modules/gpu/CMakeLists.txt_cuda32 delete mode 100644 modules/gpu/CMakeLists.txt_cuda4.0 delete mode 100644 modules/gpu/FindNPP.cmake diff --git a/modules/gpu/CMakeLists.txt_cuda32 b/modules/gpu/CMakeLists.txt_cuda32 deleted file mode 100644 index 3cddd41..0000000 --- a/modules/gpu/CMakeLists.txt_cuda32 +++ /dev/null @@ -1,236 +0,0 @@ -set(name "gpu") - -set(the_target "opencv_${name}") -project(${the_target}) - -set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed -set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu) - -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" - "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_BINARY_DIR}") - -file(GLOB lib_srcs "src/*.cpp") -file(GLOB lib_int_hdrs "src/*.h*") -file(GLOB lib_cuda "src/cuda/*.cu*") -file(GLOB lib_cuda_hdrs "src/cuda/*.h*") -source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs}) -source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) - -file(GLOB lib_hdrs "include/opencv2/${name}/*.h*") -source_group("Include" FILES ${lib_hdrs}) - -#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*") -file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*") -source_group("Device" FILES ${lib_device_hdrs}) - -if (HAVE_CUDA) - file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") - file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") - file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h") - - source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) - include_directories("src/nvidia/core" "src/nvidia/NPP_staging") -endif() - -if (HAVE_CUDA) - get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp}) - 
find_package(NPP 3.2.16 REQUIRED) - message(STATUS "NPP detected: " ${NPP_VERSION}) - - include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES}) - - if (UNIX OR APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC") - endif() - - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep") - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;") - - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - - if(MSVC) - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408") - - string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - endif() - - if (OPENCV_BUILD_SHARED_LIB) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS") - endif() - - CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda}) - #CUDA_BUILD_CLEAN_TARGET() -endif() - -foreach(d ${DEPS}) - if(${d} MATCHES "opencv_") - string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) - include_directories("${d_dir}/include") - endif() -endforeach() - -add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} 
${ncv_cuda} ${cuda_objs}) - -if(PCHSupport_FOUND) - set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp) - if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*") - if(${CMAKE_GENERATOR} MATCHES "Visual*") - set(${the_target}_pch "src/precomp.cpp") - endif() - add_native_precompiled_header(${the_target} ${pch_header}) - elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles") - add_precompiled_header(${the_target} ${pch_header}) - endif() -endif() - -# For dynamic link numbering convenions -set_target_properties(${the_target} PROPERTIES - VERSION ${OPENCV_VERSION} - SOVERSION ${OPENCV_SOVERSION} - OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}" - ) - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "modules") -endif() - -if (OPENCV_BUILD_SHARED_LIB) - if (MSVC) - set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS) - else() - add_definitions(-DCVAPI_EXPORTS) - endif() -endif() - -# Additional target properties -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/" - INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib" - ) - -# Add the required libraries for linking: -target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) - -if (HAVE_CUDA) - target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES}) - CUDA_ADD_CUFFT_TO_TARGET(${the_target}) -endif() - -if(MSVC) - if(CMAKE_CROSSCOMPILING) - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk") - endif() - - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc") -endif() - -# Dependencies of this target: -add_dependencies(${the_target} ${DEPS}) - -install(TARGETS ${the_target} - RUNTIME DESTINATION bin COMPONENT main - LIBRARY DESTINATION lib COMPONENT main - ARCHIVE DESTINATION 
lib COMPONENT main) - -install(FILES ${lib_hdrs} - DESTINATION include/opencv2/${name} - COMPONENT main) - -install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp - DESTINATION include/opencv2/${name} - COMPONENT main) - -#install(FILES ${lib_device_hdrs} -# DESTINATION include/opencv2/${name}/device -# COMPONENT main) - - -################################################################################################################ -################################ GPU Module Tests ##################################################### -################################################################################################################ - -# Test files processing is in the separated directory to avoid 'Src' source -# filter creation in Visual Studio -if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test) - set(the_test_target "opencv_test_${name}") - - include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/test" - "${CMAKE_CURRENT_BINARY_DIR}") - - set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS}) - foreach(d ${test_deps}) - if(${d} MATCHES "opencv_") - if(${d} MATCHES "opencv_lapack") - else() - string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) - include_directories("${d_dir}/include") - endif() - endif() - endforeach() - - file(GLOB test_srcs "test/*.cpp") - file(GLOB test_hdrs "test/*.h*") - - source_group("Src" FILES ${test_hdrs} ${test_srcs}) - if(HAVE_CUDA) - include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging) - - file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*") - source_group("Src\\NVidia" FILES ${nvidia}) - endif() - - - add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia}) - - if(PCHSupport_FOUND) - set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp) - 
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*") - if(${CMAKE_GENERATOR} MATCHES "Visual*") - set(${the_test_target}_pch "test/test_precomp.cpp") - endif() - add_native_precompiled_header(${the_test_target} ${pch_header}) - elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles") - add_precompiled_header(${the_test_target} ${pch_header}) - endif() - endif() - - # Additional target properties - set_target_properties(${the_test_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/" - ) - - if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_test_target} PROPERTIES FOLDER "tests") - endif() - - add_dependencies(${the_test_target} ${test_deps}) - - # Add the required libraries for linking: - target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps}) - - enable_testing() - get_target_property(LOC ${the_test_target} LOCATION) - add_test(${the_test_target} "${LOC}") - - if(WIN32) - install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main) - endif() -endif() diff --git a/modules/gpu/CMakeLists.txt_cuda4.0 b/modules/gpu/CMakeLists.txt_cuda4.0 deleted file mode 100644 index 9d67437..0000000 --- a/modules/gpu/CMakeLists.txt_cuda4.0 +++ /dev/null @@ -1,240 +0,0 @@ -set(name "gpu") - -set(the_target "opencv_${name}") -project(${the_target}) - -set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed -set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu) - -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" - "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_BINARY_DIR}") - -file(GLOB lib_srcs "src/*.cpp") -file(GLOB lib_int_hdrs "src/*.h*") -file(GLOB lib_cuda "src/cuda/*.cu*") -file(GLOB lib_cuda_hdrs "src/cuda/*.h*") -source_group("Src\\Host" 
FILES ${lib_srcs} ${lib_int_hdrs}) -source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) - -file(GLOB lib_hdrs "include/opencv2/${name}/*.h*") -source_group("Include" FILES ${lib_hdrs}) - -#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*") -file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*") -source_group("Device" FILES ${lib_device_hdrs}) - -if (HAVE_CUDA) - file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") - file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") - file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h") - - source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) - include_directories("src/nvidia/core" "src/nvidia/NPP_staging") -endif() - -if (HAVE_CUDA) - #get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH) - #set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp}) - #find_package(NPP 3.2.16 REQUIRED) - #message(STATUS "NPP detected: " ${NPP_VERSION}) - - include_directories(${CUDA_INCLUDE_DIRS}) - - if (UNIX OR APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC") - endif() - - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep") - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;") - - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - - if(MSVC) - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408") - - string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") - string(REPLACE 
"/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - endif() - - if (OPENCV_BUILD_SHARED_LIB) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS") - endif() - - CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda}) - #CUDA_BUILD_CLEAN_TARGET() -endif() - -foreach(d ${DEPS}) - if(${d} MATCHES "opencv_") - string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) - include_directories("${d_dir}/include") - endif() -endforeach() - -add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs}) - -if(PCHSupport_FOUND) - set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp) - if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*") - if(${CMAKE_GENERATOR} MATCHES "Visual*") - set(${the_target}_pch "src/precomp.cpp") - endif() - add_native_precompiled_header(${the_target} ${pch_header}) - elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles") - add_precompiled_header(${the_target} ${pch_header}) - endif() -endif() - -# For dynamic link numbering convenions -set_target_properties(${the_target} PROPERTIES - VERSION ${OPENCV_VERSION} - SOVERSION ${OPENCV_SOVERSION} - OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}" - ) - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "modules") -endif() - -if (OPENCV_BUILD_SHARED_LIB) - if (MSVC) - set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS) - else() - add_definitions(-DCVAPI_EXPORTS) - endif() -endif() - -# Additional target properties -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY 
"${CMAKE_BINARY_DIR}/lib/" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/" - INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib" - ) - -# Add the required libraries for linking: -target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) - -if (HAVE_CUDA) - target_link_libraries(${the_target} ${CUDA_LIBRARIES}) - CUDA_ADD_CUFFT_TO_TARGET(${the_target}) - - unset(CUDA_npp_LIBRARY CACHE) - find_cuda_helper_libs(npp) - target_link_libraries(${the_target} ${CUDA_npp_LIBRARY}) -endif() - -if(MSVC) - if(CMAKE_CROSSCOMPILING) - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk") - endif() - - set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc") -endif() - -# Dependencies of this target: -add_dependencies(${the_target} ${DEPS}) - -install(TARGETS ${the_target} - RUNTIME DESTINATION bin COMPONENT main - LIBRARY DESTINATION lib COMPONENT main - ARCHIVE DESTINATION lib COMPONENT main) - -install(FILES ${lib_hdrs} - DESTINATION include/opencv2/${name} - COMPONENT main) - -install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp - DESTINATION include/opencv2/${name} - COMPONENT main) - -#install(FILES ${lib_device_hdrs} -# DESTINATION include/opencv2/${name}/device -# COMPONENT main) - - -################################################################################################################ -################################ GPU Module Tests ##################################################### -################################################################################################################ - -# Test files processing is in the separated directory to avoid 'Src' source -# filter creation in Visual Studio -if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test) - set(the_test_target "opencv_test_${name}") - - include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/test" - "${CMAKE_CURRENT_BINARY_DIR}") - - set(test_deps 
opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS}) - foreach(d ${test_deps}) - if(${d} MATCHES "opencv_") - if(${d} MATCHES "opencv_lapack") - else() - string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) - include_directories("${d_dir}/include") - endif() - endif() - endforeach() - - file(GLOB test_srcs "test/*.cpp") - file(GLOB test_hdrs "test/*.h*") - - source_group("Src" FILES ${test_hdrs} ${test_srcs}) - if(HAVE_CUDA) - include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging) - - file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*") - source_group("Src\\NVidia" FILES ${nvidia}) - endif() - - - add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia}) - - if(PCHSupport_FOUND) - set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp) - if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*") - if(${CMAKE_GENERATOR} MATCHES "Visual*") - set(${the_test_target}_pch "test/test_precomp.cpp") - endif() - add_native_precompiled_header(${the_test_target} ${pch_header}) - elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles") - add_precompiled_header(${the_test_target} ${pch_header}) - endif() - endif() - - # Additional target properties - set_target_properties(${the_test_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/" - ) - - if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_test_target} PROPERTIES FOLDER "tests") - endif() - - add_dependencies(${the_test_target} ${test_deps}) - - # Add the required libraries for linking: - target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps}) - - enable_testing() - get_target_property(LOC ${the_test_target} LOCATION) - add_test(${the_test_target} "${LOC}") - - if(WIN32) - install(TARGETS ${the_test_target} 
RUNTIME DESTINATION bin COMPONENT main) - endif() -endif() diff --git a/modules/gpu/FindNPP.cmake b/modules/gpu/FindNPP.cmake deleted file mode 100644 index efde017..0000000 --- a/modules/gpu/FindNPP.cmake +++ /dev/null @@ -1,125 +0,0 @@ -############################################################################### -# -# FindNPP.cmake -# -# CUDA_NPP_LIBRARY_ROOT_DIR -- Path to the NPP dorectory. -# CUDA_NPP_INCLUDES -- NPP Include directories. -# CUDA_NPP_LIBRARIES -- NPP libraries. -# NPP_VERSION -- NPP version in format "major.minor.build". -# -# If not found automatically, please set CUDA_NPP_LIBRARY_ROOT_DIR -# in CMake or set enviroment varivabe $CUDA_NPP_ROOT -# -# Author: Anatoly Baksheev, Itseez Ltd. -# -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
-# -############################################################################### - -cmake_policy(PUSH) -cmake_minimum_required(VERSION 2.8.0) -cmake_policy(POP) - -if(NOT "${CUDA_NPP_LIBRARY_ROOT_DIR}" STREQUAL "${CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL}") - unset(CUDA_NPP_INCLUDES CACHE) - unset(CUDA_NPP_LIBRARIES CACHE) -endif() - -if(CMAKE_SIZEOF_VOID_P EQUAL 4) - if (UNIX OR APPLE) - set(NPP_SUFFIX "32") - else() - set(NPP_SUFFIX "-mt") - endif() -else(CMAKE_SIZEOF_VOID_P EQUAL 4) - if (UNIX OR APPLE) - set(NPP_SUFFIX "64") - else() - set(NPP_SUFFIX "-mt-x64") - endif() -endif(CMAKE_SIZEOF_VOID_P EQUAL 4) - -if(NOT CUDA_NPP_LIBRARY_ROOT_DIR OR CUDA_NPP_LIBRARY_ROOT_DIR STREQUAL "") - unset(CUDA_NPP_LIBRARY_ROOT_DIR CACHE) - find_path(CUDA_NPP_LIBRARY_ROOT_DIR "common/npp/include/npp.h" PATHS ENV CUDA_NPP_ROOT DOC "NPP root directory.") - MESSAGE(STATUS "NPP root directory: " ${CUDA_NPP_LIBRARY_ROOT_DIR}) -endif() - -# Search includes in our own paths. -find_path(CUDA_NPP_INCLUDES npp.h PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}/common/npp/include") -# Search default search paths, after we search our own set of paths. -find_path(CUDA_NPP_INCLUDES device_functions.h) -mark_as_advanced(CUDA_NPP_INCLUDES) - -# Find NPP library -find_library(CUDA_NPP_LIBRARIES - NAMES "npp" "npp${NPP_SUFFIX}" "libnpp${NPP_SUFFIX}" - PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}" - PATH_SUFFIXES "common/lib" "common/npp/lib" - DOC "NPP library" - ) - -# Search default search paths, after we search our own set of paths. 
-find_library(CUDA_NPP_LIBRARIES NAMES npp${NPP_SUFFIX} libnpp${NPP_SUFFIX} DOC "NPP library") -mark_as_advanced(CUDA_NPP_LIBRARIES) - -if(EXISTS ${CUDA_NPP_INCLUDES}/nppversion.h) - file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_major REGEX "#define NPP_VERSION_MAJOR.*") - file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_minor REGEX "#define NPP_VERSION_MINOR.*") - file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_build REGEX "#define NPP_VERSION_BUILD.*") - - string( REGEX REPLACE "#define NPP_VERSION_MAJOR[ \t]+|//.*" "" npp_major ${npp_major}) - string( REGEX REPLACE "#define NPP_VERSION_MINOR[ \t]+|//.*" "" npp_minor ${npp_minor}) - string( REGEX REPLACE "#define NPP_VERSION_BUILD[ \t]+|//.*" "" npp_build ${npp_build}) - - string( REGEX MATCH "[0-9]+" npp_major ${npp_major} ) - string( REGEX MATCH "[0-9]+" npp_minor ${npp_minor} ) - string( REGEX MATCH "[0-9]+" npp_build ${npp_build} ) - set( NPP_VERSION "${npp_major}.${npp_minor}.${npp_build}") -endif() - -if(NOT EXISTS ${CUDA_NPP_LIBRARIES} OR NOT EXISTS ${CUDA_NPP_INCLUDES}/npp.h) - set(CUDA_NPP_FOUND FALSE) - message(FATAL_ERROR "NPP headers/libraries are not found. Please specify CUDA_NPP_LIBRARY_ROOT_DIR in CMake or set $CUDA_NPP_ROOT.") -endif() - -include( FindPackageHandleStandardArgs ) -find_package_handle_standard_args( NPP - REQUIRED_VARS - CUDA_NPP_INCLUDES - CUDA_NPP_LIBRARIES - #Need cmake 2.8.3 to uncomment this. - #VERSION_VAR - NPP_VERSION) - -if(APPLE) - # We need to add the path to cudart to the linker using rpath, since the library name for the cuda libraries is prepended with @rpath. - get_filename_component(_cuda_path_to_npp "${CUDA_NPP_LIBRARIES}" PATH) - if(_cuda_path_to_npp) - list(APPEND CUDA_NPP_LIBRARIES "-Wl,-rpath,${_cuda_path_to_npp}") - endif() -endif() - -set(CUDA_NPP_FOUND TRUE) -set(CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL "${CUDA_NPP_LIBRARY_ROOT_DIR}" CACHE INTERNAL "This is the value of the last time CUDA_NPP_LIBRARY_ROOT_DIR was set successfully." 
FORCE) - - diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index cbb0f37..d3f7fd6 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -447,12 +447,21 @@ namespace cv // converts matrix type, ex from float to uchar depending on type void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); + + static Stream& Null(); + + operator bool() const; + private: void create(); void release(); + struct Impl; Impl *impl; + friend struct StreamAccessor; + + explicit Stream(Impl* impl); }; @@ -460,168 +469,130 @@ namespace cv //! transposes the matrix //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc) - CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst); + CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null()); //! reverses the order of the rows, columns or both in a matrix //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode); + CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null()); //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) //! destination array will have the depth type as lut and the same channels number as source //! supports CV_8UC1, CV_8UC3 types - CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst); + CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null()); //! makes multi-channel array out of several single-channel arrays - CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst); + CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null()); //! makes multi-channel array out of several single-channel arrays - CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst); - - //! 
makes multi-channel array out of several single-channel arrays (async version) - CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream); - - //! makes multi-channel array out of several single-channel arrays (async version) - CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream); + CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null()); //! copies each plane of a multi-channel array to a dedicated array - CV_EXPORTS void split(const GpuMat& src, GpuMat* dst); + CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null()); //! copies each plane of a multi-channel array to a dedicated array - CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst); - - //! copies each plane of a multi-channel array to a dedicated array (async version) - CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, const Stream& stream); - - //! copies each plane of a multi-channel array to a dedicated array (async version) - CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream); + CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null()); //! computes magnitude of complex (x(i).re, x(i).im) vector //! supports only CV_32FC2 type - CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude); + CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); //! computes squared magnitude of complex (x(i).re, x(i).im) vector //! supports only CV_32FC2 type - CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude); + CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); //! computes magnitude of each (x(i), y(i)) vector //! supports only floating-point source - CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude); - //! 
async version - CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, const Stream& stream); + CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); //! computes squared magnitude of each (x(i), y(i)) vector //! supports only floating-point source - CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude); - //! async version - CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, const Stream& stream); + CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); //! computes angle (angle(i)) of each (x(i), y(i)) vector //! supports only floating-point source - CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false); - //! async version - CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, const Stream& stream); + CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); //! converts Cartesian coordinates to polar //! supports only floating-point source - CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false); - //! async version - CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees, const Stream& stream); + CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); //! converts polar coordinates to Cartesian //! supports only floating-point source - CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false); - //! 
async version - CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream); + CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null()); //////////////////////////// Per-element operations //////////////////////////////////// //! adds one matrix to another (c = a + b) //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c); + CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); //! adds scalar to a matrix (c = a + s) //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c); + CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); //! subtracts one matrix from another (c = a - b) //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c); + CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); //! subtracts scalar from a matrix (c = a - s) //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c); + CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); //! computes element-wise product of the two arrays (c = a * b) //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c); + CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); //! multiplies matrix to a scalar (c = a * s) //! 
supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c); + CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); //! computes element-wise quotient of the two arrays (c = a / b) //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c); + CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); //! computes element-wise quotient of matrix and scalar (c = a / s) //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c); + CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); //! computes exponent of each matrix element (b = e**a) //! supports only CV_32FC1 type - CV_EXPORTS void exp(const GpuMat& a, GpuMat& b); + CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) //! supports only CV_32FC1 type - CV_EXPORTS void log(const GpuMat& a, GpuMat& b); + CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); //! computes element-wise absolute difference of two arrays (c = abs(a - b)) //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c); + CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) //! supports only CV_32FC1 type - CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c); + CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); //! compares elements of two arrays (c = a b) //! 
supports CV_8UC4, CV_32FC1 types - CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop); + CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null()); //! performs per-elements bit-wise inversion - CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream); + CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); //! calculates per-element bit-wise disjunction of two arrays - CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); //! calculates per-element bit-wise conjunction of two arrays - CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); //! calculates per-element bit-wise "exclusive or" operation - CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! 
async version - CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); //! computes per-element minimum of two arrays (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); - //! Async version - CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); + CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); //! computes per-element minimum of array and scalar (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst); - //! Async version - CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); + CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); //! computes per-element maximum of two arrays (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); - //! Async version - CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); + CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); //! computes per-element maximum of array and scalar (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst); - //! Async version - CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); + CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); ////////////////////////////// Image processing ////////////////////////////// @@ -645,68 +616,60 @@ namespace cv //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV. //! Supported types of input disparity: CV_8U, CV_16S. //! 
Output disparity has CV_8UC4 type in BGRA format (alpha = 255). - CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp); - //! async version - CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, const Stream& stream); + CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null()); //! Reprojects disparity image to 3D space. //! Supports CV_8U and CV_16S types of input disparity. //! The output is a 4-channel floating-point (CV_32FC4) matrix. //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map. //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify. - CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q); - //! async version - CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const Stream& stream); + CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null()); //! converts image from one color space to another - CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0); - //! async version - CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream); + CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); //! applies fixed threshold to the image - CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type); - //! async version - CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, const Stream& stream); + CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null()); //! resizes the image //! Supports INTER_NEAREST, INTER_LINEAR //! 
supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR); + CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); //! warps the image using affine transformation //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR); + CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); //! warps the image using perspective transformation //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR); + CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); //! rotate 8bit single or four channel image //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR); + CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); //! copies 2D array to a larger destination array and pads borders with user-specifiable constant //! 
supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types - CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar()); + CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar(), Stream& stream = Stream::Null()); //! computes the integral image //! sum will have CV_32S type, but will contain unsigned int values //! supports only CV_8UC1 source type - CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum); + CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null()); //! buffered version - CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer); + CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null()); //! computes the integral image and integral for the squared image //! sum will have CV_32S type, sqsum - CV32F type //! supports only CV_8UC1 source type - CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum); + CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null()); //! computes squared integral image //! result matrix will have 64F type, but will contain 64U values //! supports source images of 8UC1 type only - CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum); + CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null()); //! computes vertical sum, supports only CV_32FC1 images CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum); @@ -714,14 +677,7 @@ namespace cv //! computes the standard deviation of integral images //! supports only CV_32SC1 source type and CV_32FC1 sqr type //! 
output will have CV_32FC1 type - CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect); - - // applies Canny edge detector and produces the edge map - // disabled until fix crash - //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3); - //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3); - //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3); - //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3); + CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null()); //! computes Harris cornerness criteria at each image pixel CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); @@ -792,7 +748,7 @@ namespace cv //! performs linear blending of two images //! 
to avoid accuracy errors sum of weigths shouldn't be very close to zero CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, - GpuMat& result); + GpuMat& result, Stream& stream = Stream::Null()); ////////////////////////////// Matrix reductions ////////////////////////////// @@ -863,17 +819,11 @@ namespace cv ///////////////////////////// Calibration 3D ////////////////////////////////// CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst); - - CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst, const Stream& stream); + GpuMat& dst, Stream& stream = Stream::Null()); CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst); - - CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, - const Stream& stream); + const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, + Stream& stream = Stream::Null()); CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, @@ -893,7 +843,7 @@ namespace cv public: BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} virtual ~BaseRowFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst) = 0; + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; int ksize, anchor; }; @@ -908,7 +858,7 @@ namespace cv public: BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} virtual ~BaseColumnFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst) = 0; + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; int ksize, anchor; }; @@ -922,7 +872,7 
@@ namespace cv public: BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {} virtual ~BaseFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst) = 0; + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; Size ksize; Point anchor; }; @@ -938,7 +888,7 @@ namespace cv public: virtual ~FilterEngine_GPU() {} - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) = 0; + virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0; }; //! returns the non-separable filter engine with the specified filter @@ -1027,47 +977,47 @@ namespace cv //! smooths the image using the normalized box filter //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1)); + CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()); //! a synonym for normalized box filter - static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1)) { boxFilter(src, dst, -1, ksize, anchor); } + static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); } //! erodes the image (applies the local minimum operator) - CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); //! 
dilates the image (applies the local maximum operator) - CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); //! applies an advanced morphological operation to the image - CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); //! applies non-separable 2D linear filter to the image - CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1)); + CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null()); //! applies separable 2D linear filter to the image CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, - Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); //! applies generalized Sobel operator to the image CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); //! 
applies the vertical or horizontal Scharr operator to the image CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); //! smooths the image using Gaussian filter. CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); //! applies Laplacian operator to the image //! supports only ksize = 1 and ksize = 3 - CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1); + CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null()); //////////////////////////////// Image Labeling //////////////////////////////// //!performs labeling via graph cuts - CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf); + CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& stream = Stream::Null()); ////////////////////////////////// Histograms ////////////////////////////////// @@ -1076,23 +1026,23 @@ namespace cv //! Calculates histogram with evenly distributed bins for signle channel source. //! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types. //! Output hist will have one row and histSize cols and CV_32SC1 type. - CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel); + CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); //! 
Calculates histogram with evenly distributed bins for four-channel source. //! All channels of source are processed separately. //! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types. //! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type. - CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4]); + CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); //! Calculates histogram with bins determined by levels array. //! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. //! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types. //! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type. - CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels); + CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null()); //! Calculates histogram with bins determined by levels array. //! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. //! All channels of source are processed separately. //! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types. //! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type. - CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4]); + CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// @@ -1110,10 +1060,7 @@ namespace cv //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair //! Output disparity has CV_8U type. - void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity); - - //! 
async version - void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream & stream); + void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); //! Some heuristics that tries to estmate // if current GPU will be faster than CPU in this algorithm. @@ -1165,15 +1112,11 @@ namespace cv //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity); - - //! async version - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream); + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); //! version for user specified data term - void operator()(const GpuMat& data, GpuMat& disparity); - void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream); + void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null()); int ndisp; @@ -1194,7 +1137,7 @@ namespace cv /////////////////////////// StereoConstantSpaceBP /////////////////////////// // "A Constant-Space Belief Propagation Algorithm for Stereo Matching" - // Qingxiong Yang, Liang Wang�, Narendra Ahuja + // Qingxiong Yang, Liang Wang, Narendra Ahuja // http://vision.ai.uiuc.edu/~qyang6/ class CV_EXPORTS StereoConstantSpaceBP @@ -1224,10 +1167,7 @@ namespace cv //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity); - - //! 
async version - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream); + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); int ndisp; @@ -1280,10 +1220,7 @@ namespace cv //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image. //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type. - void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst); - - //! async version - void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream); + void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null()); private: int ndisp; @@ -1406,7 +1343,7 @@ namespace cv // distance.at(0, queryIdx) will contain distance void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, GpuMat& trainIdx, GpuMat& distance, - const GpuMat& mask = GpuMat()); + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx and distance to CPU vector with DMatch static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); @@ -1425,7 +1362,7 @@ namespace cv // distance.at(0, queryIdx) will contain distance void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& maskCollection); + const GpuMat& maskCollection, Stream& stream = Stream::Null()); // Download trainIdx, imgIdx and distance to CPU vector with DMatch static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, @@ -1443,7 +1380,7 @@ namespace cv // allDist.at(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best, // otherwise it will contain distance between queryIdx and trainIdx descriptors void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, 
- GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat()); + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx and distance to CPU vector with DMatch // compactResult is used when mask is not empty. If compactResult is false matches @@ -1478,7 +1415,7 @@ namespace cv // Matches doesn't sorted. void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, - const GpuMat& mask = GpuMat()); + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx, nMatches and distance to CPU vector with DMatch. // matches will be sorted in increasing order of distances. diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index c6a23de..176eeb3 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -48,37 +48,36 @@ using namespace std; #if !defined (HAVE_CUDA) -void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); } -void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::magnitude(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::phase(const GpuMat&, const 
GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } -void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } -void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } +void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } +void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } +void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } +void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ //////////////////////////////////////////////////////////////////////// // transpose -void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) +void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) { CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8); dst.create( src.cols, src.rows, src.type() ); + cudaStream_t stream = StreamAccessor::getStream(s); + if (src.elemSize() == 1) { + NppStreamHandler 
h(stream); + NppiSize sz; sz.width = src.cols; sz.height = src.rows; @@ -87,6 +86,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) } else if (src.elemSize() == 4) { + NppStStreamHandler h(stream); + NcvSize32u sz; sz.width = src.cols; sz.height = src.rows; @@ -96,6 +97,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) } else // if (src.elemSize() == 8) { + NppStStreamHandler h(stream); + NcvSize32u sz; sz.width = src.cols; sz.height = src.rows; @@ -104,13 +107,14 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) dst.ptr(), dst.step, sz) ); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // flip -void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) +void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); @@ -120,6 +124,10 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + if (src.type() == CV_8UC1) { nppSafeCall( nppiMirror_8u_C1R(src.ptr(), src.step, @@ -133,13 +141,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? 
NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // LUT -void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) +void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) { class LevelsInit { @@ -172,6 +181,10 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) Mat nppLut; lut.convertTo(nppLut, CV_32S); + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + if (src.type() == CV_8UC1) { nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, nppLut.ptr(), lvls.pLevels, 256) ); @@ -192,13 +205,14 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr(), src.step, dst.ptr(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) ); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // exp -void cv::gpu::exp(const GpuMat& src, GpuMat& dst) +void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s) { CV_Assert(src.type() == CV_32FC1); @@ -208,15 +222,20 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst) sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiExp_32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // log -void cv::gpu::log(const GpuMat& src, GpuMat& dst) +void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s) { CV_Assert(src.type() == CV_32FC1); @@ -226,9 +245,14 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst) 
sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiLn_32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -238,7 +262,7 @@ namespace { typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); - inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func) + inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream) { CV_Assert(src.type() == CV_32FC2); @@ -248,20 +272,23 @@ namespace sz.width = src.cols; sz.height = src.rows; + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } } -void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst) +void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream) { - ::npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R); + ::npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream)); } -void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst) +void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream) { - ::npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R); + ::npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -310,52 +337,27 @@ namespace } } -void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst) -{ - ::cartToPolar_caller(x, y, &dst, false, 0, false, 0); -} - -void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, const Stream& stream) +void cv::gpu::magnitude(const GpuMat& x, 
const GpuMat& y, GpuMat& dst, Stream& stream) { ::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); } -void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst) -{ - ::cartToPolar_caller(x, y, &dst, true, 0, false, 0); -} - -void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, const Stream& stream) +void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream) { ::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); } -void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees) -{ - ::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, 0); -} - -void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, const Stream& stream) +void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream) { ::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); } -void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees) -{ - ::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, 0); -} - -void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, const Stream& stream) +void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream) { ::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); } -void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees) -{ - ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, 0); -} - -void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream) +void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& 
x, GpuMat& y, bool angleInDegrees, Stream& stream) { ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); } diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp index ddc8745..79ff0f0 100644 --- a/modules/gpu/src/bilateral_filter.cpp +++ b/modules/gpu/src/bilateral_filter.cpp @@ -51,7 +51,6 @@ using namespace std; cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); } cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); } -void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -101,7 +100,7 @@ namespace template void bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold, GpuMat& table_color, GpuMat& table_space, - const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t stream) + const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream) { short edge_disc = max(short(1), short(ndisp * edge_threshold + 0.5)); short max_disc = short(ndisp * max_disc_threshold + 0.5); @@ -109,14 +108,19 @@ namespace bf::load_constants(table_color.ptr(), table_space, ndisp, radius, edge_disc, max_disc); if (&dst != &disp) - disp.copyTo(dst); + { + if (stream) + stream.enqueueCopy(disp, dst); + else + disp.copyTo(dst); + } - bf::bilateral_filter_gpu((DevMem2D_)dst, img, img.channels(), iters, stream); + bf::bilateral_filter_gpu((DevMem2D_)dst, img, img.channels(), iters, StreamAccessor::getStream(stream)); } typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, GpuMat& table_color, GpuMat& table_space, - const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t 
stream); + const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream); const bilateral_filter_operator_t operators[] = {bilateral_filter_operator, 0, 0, bilateral_filter_operator, 0, 0, 0, 0}; @@ -139,18 +143,11 @@ cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radi calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f); } -void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst) -{ - CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters); - CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3)); - operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, 0); -} - void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream) { CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters); CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3)); - operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, StreamAccessor::getStream(stream)); + operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp index bf56507..8eb578b 100644 --- a/modules/gpu/src/blend.cpp +++ b/modules/gpu/src/blend.cpp @@ -48,8 +48,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, - GpuMat&) { throw_nogpu(); } +void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else @@ -57,14 +56,14 @@ 
namespace cv { namespace gpu { template void blendLinearCaller(int rows, int cols, int cn, const PtrStep_ img1, const PtrStep_ img2, - const PtrStep_ weights1, const PtrStep_ weights2, PtrStep_ result); + const PtrStep_ weights1, const PtrStep_ weights2, PtrStep_ result, cudaStream_t stream); void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep result); + const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream); }} void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, - GpuMat& result) + GpuMat& result, Stream& stream) { CV_Assert(img1.size() == img2.size()); CV_Assert(img1.type() == img2.type()); @@ -83,12 +82,12 @@ void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& { case CV_8U: if (cn != 4) - blendLinearCaller(size.height, size.width, cn, img1, img2, weights1, weights2, result); + blendLinearCaller(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream)); else - blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result); + blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream)); break; case CV_32F: - blendLinearCaller(size.height, size.width, cn, img1, img2, weights1, weights2, result); + blendLinearCaller(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream)); break; default: CV_Error(CV_StsUnsupportedFormat, "bad image depth in linear blending function"); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 9166bcb..d8805a7 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -54,18 +54,18 @@ const vector& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() void 
cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); } bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; } bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; } -void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector&, const GpuMat&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector&, const std::vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, 
int, const GpuMat&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_nogpu(); } @@ -77,47 +77,47 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + 
bool cc_12, cudaStream_t stream); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& 
mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); }}} namespace @@ -168,7 +168,7 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const // Match void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask) + GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask, Stream& stream) { if (queryDescs.empty() || trainDescs.empty()) return; @@ -177,7 +177,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12); + bool cc_12, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -213,7 +213,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx. // trainIdx store after imgIdx, so we doesn't lose it value. 
- func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12); + func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -301,7 +301,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect } void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection) + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection, Stream& stream) { if (queryDescs.empty() || trainCollection.empty()) return; @@ -310,7 +310,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12); + const DevMem2Df& distance, bool cc_12, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -344,7 +344,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); - func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12); + func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, @@ -403,7 +403,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector // KnnMatch void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const 
GpuMat& mask) + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream) { if (queryDescs.empty() || trainDescs.empty()) return; @@ -411,7 +411,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con using namespace cv::gpu::bfmatcher; typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -436,16 +436,24 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con const int nTrain = trainDescs.rows; trainIdx.create(nQuery, k, CV_32S); - trainIdx.setTo(Scalar::all(-1)); distance.create(nQuery, k, CV_32F); - ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); - allDist.setTo(Scalar::all(numeric_limits::max())); + + if (stream) + { + stream.enqueueMemSet(trainIdx, Scalar::all(-1)); + stream.enqueueMemSet(allDist, Scalar::all(numeric_limits::max())); + } + else + { + trainIdx.setTo(Scalar::all(-1)); + allDist.setTo(Scalar::all(numeric_limits::max())); + } match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist); + func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -547,7 +555,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, // RadiusMatch void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask) + GpuMat& trainIdx, 
GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask, Stream& stream) { if (queryDescs.empty() || trainDescs.empty()) return; @@ -555,7 +563,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, using namespace cv::gpu::bfmatcher; typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); static const radiusMatch_caller_t radiusMatch_callers[3][8] = { @@ -583,17 +591,21 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); - nMatches.setTo(Scalar::all(0)); if (trainIdx.empty()) { trainIdx.create(nQuery, nTrain, CV_32SC1); distance.create(nQuery, nTrain, CV_32FC1); } + if (stream) + stream.enqueueMemSet(nMatches, Scalar::all(0)); + else + nMatches.setTo(Scalar::all(0)); + radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr(), distance); + func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr(), distance, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp index 32bbab6..84db041 100644 --- a/modules/gpu/src/calib3d.cpp +++ b/modules/gpu/src/calib3d.cpp @@ -44,20 +44,11 @@ #if !defined(HAVE_CUDA) -void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, - GpuMat&) { throw_nogpu(); } +void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, 
GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, - GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, - const Mat&, const Mat&, GpuMat&) { throw_nogpu(); } - -void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, - const Mat&, const Mat&, GpuMat&, const Stream&) { throw_nogpu(); } - -void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, - Mat&, Mat&, bool, int, float, int, vector*) { throw_nogpu(); } +void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, vector*) { throw_nogpu(); } #else @@ -66,14 +57,12 @@ using namespace cv::gpu; namespace cv { namespace gpu { namespace transform_points { - void call(const DevMem2D_ src, const float* rot, const float* transl, - DevMem2D_ dst, cudaStream_t stream); + void call(const DevMem2D_ src, const float* rot, const float* transl, DevMem2D_ dst, cudaStream_t stream); }}} namespace { - void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst, cudaStream_t stream) + void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream) { CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3); CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F); @@ -88,30 +77,20 @@ namespace } } -void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst) -{ - ::transformPointsCaller(src, rvec, tvec, dst, 0); -} - -void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst, const Stream& stream) +void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream) { ::transformPointsCaller(src, 
rvec, tvec, dst, StreamAccessor::getStream(stream)); } namespace cv { namespace gpu { namespace project_points { - void call(const DevMem2D_ src, const float* rot, const float* transl, - const float* proj, DevMem2D_ dst, cudaStream_t stream); + void call(const DevMem2D_ src, const float* rot, const float* transl, const float* proj, DevMem2D_ dst, cudaStream_t stream); }}} namespace { - void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, - cudaStream_t stream) + void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream) { CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3); CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F); @@ -124,20 +103,11 @@ namespace Rodrigues(rvec, rot); dst.create(src.size(), CV_32FC2); - project_points::call(src, rot.ptr(), tvec.ptr(), - camera_mat.ptr(), dst,stream); + project_points::call(src, rot.ptr(), tvec.ptr(), camera_mat.ptr(), dst,stream); } } -void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst) -{ - ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, 0); -} - -void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, - const Stream& stream) +void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream) { ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream)); } diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index 3188297..e95ce86 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -47,8 +47,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -void 
cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); } -void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, const Stream&) { throw_nogpu(); } +void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -455,12 +454,7 @@ namespace } } -void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn) -{ - cvtColor_caller(src, dst, code, dcn, 0); -} - -void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream) +void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream) { cvtColor_caller(src, dst, code, dcn, StreamAccessor::getStream(stream)); } diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu index 42b5a63..bed0ee0 100644 --- a/modules/gpu/src/cuda/blend.cu +++ b/modules/gpu/src/cuda/blend.cu @@ -68,19 +68,22 @@ namespace cv { namespace gpu template void blendLinearCaller(int rows, int cols, int cn, const PtrStep_ img1, const PtrStep_ img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep_ result) + const PtrStepf weights1, const PtrStepf weights2, PtrStep_ result, cudaStream_t stream) { dim3 threads(16, 16); dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); - blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); - cudaSafeCall(cudaThreadSynchronize()); + blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } template void blendLinearCaller(int, int, int, const PtrStep, const PtrStep, - const PtrStepf, const PtrStepf, PtrStep); + const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream); template void blendLinearCaller(int, int, int, const PtrStepf, const PtrStepf, - const PtrStepf, const PtrStepf, PtrStepf); + const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream); __global__ void 
blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2, @@ -105,13 +108,16 @@ namespace cv { namespace gpu void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep result) + const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream) { dim3 threads(16, 16); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); - cudaSafeCall(cudaThreadSynchronize()); + blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } }} \ No newline at end of file diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index f0a8995..cf58178 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -589,7 +589,7 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchSimple_caller(const DevMem2D_& queryDescs, const Train& train, - const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) { StaticAssert::check(); // blockDimY vals must reduce by warp @@ -597,14 +597,15 @@ namespace cv { namespace gpu { namespace bfmatcher dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); match, Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } template void matchCached_caller(const DevMem2D_& queryDescs, const Train& train, - const Mask& mask, const 
DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) { StaticAssert::check(); // blockDimY vals must reduce by warp StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length @@ -614,10 +615,11 @@ namespace cv { namespace gpu { namespace bfmatcher dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); match, Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// @@ -626,167 +628,165 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchDispatcher(const DevMem2D_& queryDescs, const Train& train, const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12) + bool cc_12, cudaStream_t stream) { if (queryDescs.cols < 64) - matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); + matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else if (queryDescs.cols == 64) - matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); + matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else if (queryDescs.cols < 128) - matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); + matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else if (queryDescs.cols == 128) - matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, 
distance); + matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else if (queryDescs.cols < 256) - matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); + matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else if (queryDescs.cols == 256 && cc_12) - matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); + matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); else - matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); - - cudaSafeCall( cudaThreadSynchronize() ); + matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); } template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12) + bool cc_12, cudaStream_t stream) { SingleTrain train((DevMem2D_)trainDescs); if (mask.data) { SingleMask m(mask); - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream); } else { - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& 
distance, bool cc_12); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void 
matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12) + bool cc_12, cudaStream_t stream) { SingleTrain train((DevMem2D_)trainDescs); if (mask.data) { SingleMask m(mask); - matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleL2_gpu(const DevMem2D& 
queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12) + bool cc_12, cudaStream_t stream) { SingleTrain train((DevMem2D_)trainDescs); if (mask.data) { SingleMask m(mask); - matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, 
cc_12, stream); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void 
matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12) + const DevMem2Df& distance, bool cc_12, cudaStream_t stream) { TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); if (maskCollection.data) { MaskCollection mask(maskCollection.data); - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); } else { - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& 
distance, bool cc_12); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const 
DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12) + const DevMem2Df& distance, bool cc_12, cudaStream_t stream) { TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); if (maskCollection.data) { MaskCollection mask(maskCollection.data); - matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, 
const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const 
DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12) + const DevMem2Df& distance, bool cc_12, cudaStream_t stream) { TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); if (maskCollection.data) { MaskCollection mask(maskCollection.data); - matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); } } - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& 
maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// Knn Match //////////////////////////////////// @@ -833,16 +833,17 @@ namespace cv { namespace gpu { namespace bfmatcher template void calcDistance_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, - const Mask& mask, const DevMem2Df& distance) + const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) { dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1); - calcDistance<<>>( + calcDistance<<>>( queryDescs, trainDescs, mask, distance); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// @@ 
-1010,105 +1011,106 @@ namespace cv { namespace gpu { namespace bfmatcher // find knn match kernel caller template - void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { dim3 threads(BLOCK_SIZE, 1, 1); dim3 grid(trainIdx.rows, 1, 1); for (int i = 0; i < knn; ++i) { - findBestMatch<<>>(allDist, i, trainIdx, distance); + findBestMatch<<>>(allDist, i, trainIdx, distance); cudaSafeCall( cudaGetLastError() ); } - - cudaSafeCall( cudaThreadSynchronize() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// // knn match caller template - void calcDistanceDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, const DevMem2Df& allDist) + void calcDistanceDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) { - calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist); + calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream); } - void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { - findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); + findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream); } template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { 
if (mask.data) { - calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist); + calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist, stream); } else { - calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist); + calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist, stream); } - findKnnMatchDispatcher(knn, trainIdx, distance, allDist); + findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); } - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, 
cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { if (mask.data) { calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - SingleMask(mask), allDist); + SingleMask(mask), allDist, stream); } else { calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - WithOutMask(), allDist); + WithOutMask(), allDist, stream); } - findKnnMatchDispatcher(knn, trainIdx, distance, allDist); + findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); } - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const 
DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& 
queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { if (mask.data) { calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - SingleMask(mask), allDist); + SingleMask(mask), allDist, stream); } else { calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - WithOutMask(), allDist); + WithOutMask(), allDist, stream); } - findKnnMatchDispatcher(knn, trainIdx, distance, allDist); + findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream); } - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void 
knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// Radius Match ////////////////////////////////// @@ -1166,16 +1168,17 @@ namespace cv { namespace gpu { namespace bfmatcher template void radiusMatch_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, - const DevMem2Df& distance) + const DevMem2Df& distance, cudaStream_t stream) { dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1); - radiusMatch<<>>( + radiusMatch<<>>( queryDescs, trainDescs, 
maxDistance, mask, trainIdx, nMatches, distance); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// @@ -1184,77 +1187,77 @@ namespace cv { namespace gpu { namespace bfmatcher template void radiusMatchDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, - const DevMem2Df& distance) + const DevMem2Df& distance, cudaStream_t stream) { radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, - trainIdx, nMatches, distance); + trainIdx, nMatches, distance, stream); } template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance) + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) { if (mask.data) { radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance); + maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); } else { radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance); + maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); } } - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void 
radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void 
radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance) + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) { if (mask.data) { radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance); + maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); } else { radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance); + maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); } } - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* 
nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance) + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) { if (mask.data) { 
radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance); + maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); } else { radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance); + maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); } } - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void 
radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); }}} diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 200b82a..456cf76 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -184,7 +184,9 @@ namespace cv { namespace gpu computeHypothesisScoresKernel<<>>( num_points, object, image, dist_threshold, hypothesis_scores); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); } } // namespace solvepnp_ransac diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 30b6e05..e403706 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc }; template - inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) { NotEqual op; - transform(static_cast< DevMem2D_ >(src1), static_cast< DevMem2D_ >(src2), dst, op, 0); + transform(static_cast< DevMem2D_ >(src1), static_cast< DevMem2D_ >(src2), dst, op, stream); } - void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + void 
compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) { - compare_ne(src1, src2, dst); + compare_ne(src1, src2, dst, stream); } - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) { - compare_ne(src1, src2, dst); + compare_ne(src1, src2, dst, stream); } @@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu index c597b54..a9225c2 100644 --- a/modules/gpu/src/cuda/filters.cu +++ b/modules/gpu/src/cuda/filters.cu @@ -93,9 +93,9 @@ namespace filter_krnls typedef typename SmemType::smem_t smem_t; __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; - - const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; - const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; + + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3; @@ -129,7 +129,7 @@ namespace filter_krnls namespace cv { namespace gpu { namespace filters { template class 
B> - void linearRowFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor) + void linearRowFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) { dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); @@ -143,16 +143,17 @@ namespace cv { namespace gpu { namespace filters "try bigger image or another border extrapolation mode", __FILE__, __LINE__); } - filter_krnls::linearRowFilter<<>>(src, dst, anchor, b); + filter_krnls::linearRowFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } template - void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type) + void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor); + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); static const caller_t callers[3][17] = { { @@ -173,7 +174,7 @@ namespace cv { namespace gpu { namespace filters linearRowFilter_caller<14, T, D, BrdRowReflect101>, linearRowFilter_caller<15, T, D, BrdRowReflect101>, linearRowFilter_caller<16, T, D, BrdRowReflect101>, - }, + }, { 0, linearRowFilter_caller<1 , T, D, BrdRowReplicate>, @@ -192,7 +193,7 @@ namespace cv { namespace gpu { namespace filters linearRowFilter_caller<14, T, D, BrdRowReplicate>, linearRowFilter_caller<15, T, D, BrdRowReplicate>, linearRowFilter_caller<16, T, D, BrdRowReplicate>, - }, + }, { 0, linearRowFilter_caller<1 , T, D, BrdRowConstant>, @@ -216,15 +217,15 @@ namespace cv { namespace gpu { namespace filters loadLinearKernel(kernel, ksize); - callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); + 
callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);; - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); }}} namespace filter_krnls @@ -233,9 +234,9 @@ namespace filter_krnls __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep_ dst, 
int anchor, const B b) { __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; - - const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; - const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; + + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; T* sDataColumn = smem + threadIdx.x; @@ -269,7 +270,7 @@ namespace filter_krnls namespace cv { namespace gpu { namespace filters { template class B> - void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor) + void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) { dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); @@ -282,16 +283,17 @@ namespace cv { namespace gpu { namespace filters "try bigger image or another border extrapolation mode", __FILE__, __LINE__); } - filter_krnls::linearColumnFilter<<>>(src, dst, anchor, b); + filter_krnls::linearColumnFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } template - void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type) + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor); + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); static const caller_t callers[3][17] = { { @@ -312,7 +314,7 @@ namespace cv { namespace gpu { namespace filters linearColumnFilter_caller<14, T, D, BrdColReflect101>, linearColumnFilter_caller<15, T, D, BrdColReflect101>, linearColumnFilter_caller<16, T, D, BrdColReflect101>, - }, + }, { 0, linearColumnFilter_caller<1 , T, D, 
BrdColReplicate>, @@ -331,7 +333,7 @@ namespace cv { namespace gpu { namespace filters linearColumnFilter_caller<14, T, D, BrdColReplicate>, linearColumnFilter_caller<15, T, D, BrdColReplicate>, linearColumnFilter_caller<16, T, D, BrdColReplicate>, - }, + }, { 0, linearColumnFilter_caller<1 , T, D, BrdColConstant>, @@ -355,15 +357,15 @@ namespace cv { namespace gpu { namespace filters loadLinearKernel(kernel, ksize); - callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); - template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float 
kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); }}} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -390,10 +392,10 @@ namespace cv { namespace gpu { namespace bf cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); size_t table_space_step = table_space.step / sizeof(float); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); - + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); - + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); } @@ -538,10 +540,10 @@ namespace cv { namespace gpu { namespace bf break; default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); - } + } if (stream != 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu index f42cbbc..5afb3df 100644 --- a/modules/gpu/src/cuda/hog.cu +++ b/modules/gpu/src/cuda/hog.cu @@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y, img_block_width, grad, qangle, scale, block_hists); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -324,7 +324,7 @@ void 
normalize_hists(int nbins, int block_stride_x, int block_stride_y, cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block block_hists, coefs, free_coef, threshold, labels); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } //---------------------------------------------------------------------------- @@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } //---------------------------------------------------------------------------- @@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + + cudaSafeCall( cudaDeviceSynchronize() ); } template @@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex) resize_for_hog_kernel<<>>(sx, sy, (DevMem2D_)dst, 
colOfs); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaUnbindTexture(tex) ); } diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index dad5335..82f578a 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc remap_1c<<>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaUnbindTexture(tex_remap) ); } @@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc remap_3c<<>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// @@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps) @@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } @@ -397,7 +397,7 @@ namespace cv { namespace gpu { 
namespace imgproc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void drawColorDisp_gpu(const DevMem2D_& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) @@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// @@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) @@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// @@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall(cudaUnbindTexture(harrisDxTex)); cudaSafeCall(cudaUnbindTexture(harrisDyTex)); } @@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); + cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); } @@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc column_sumKernel_32F<<>>(src.cols, src.rows, src, dst); cudaSafeCall( cudaGetLastError() ); 
- cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc mulSpectrumsKernel<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc mulSpectrumsKernel_CONJ<<>>(a, b, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////// @@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } ///////////////////////////////////////////////////////////////////////// @@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); downsampleKernel<<>>(src, rows, cols, k, dst); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); } template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst); diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp index 8c49563..6e3f54e 100644 --- a/modules/gpu/src/cuda/internal_shared.hpp +++ 
b/modules/gpu/src/cuda/internal_shared.hpp @@ -46,6 +46,8 @@ #include "opencv2/gpu/devmem2d.hpp" #include "safe_call.hpp" #include "cuda_runtime.h" +#include "npp.h" +#include "NPP_staging.hpp" namespace cv { @@ -106,6 +108,41 @@ namespace cv cudaSafeCall( cudaGetTextureReference(&tex, name) ); cudaSafeCall( cudaUnbindTexture(tex) ); } + + class NppStreamHandler + { + public: + inline explicit NppStreamHandler(cudaStream_t newStream = 0) + { + oldStream = nppGetStream(); + nppSetStream(newStream); + } + + inline ~NppStreamHandler() + { + nppSetStream(oldStream); + } + + private: + cudaStream_t oldStream; + }; + + class NppStStreamHandler + { + public: + inline explicit NppStStreamHandler(cudaStream_t newStream = 0) + { + oldStream = nppStSetActiveCUDAstream(newStream); + } + + inline ~NppStStreamHandler() + { + nppStSetActiveCUDAstream(oldStream); + } + + private: + cudaStream_t oldStream; + }; } } diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index a62acac..6822ad7 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ, } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ, } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ, } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ, } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( 
cudaDeviceSynchronize() ); } @@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U( } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U( } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U( w, h, (float)templ_sum / (w * h), image_sum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2( image_sum_r, image_sum_g, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3( image_sum_r, image_sum_g, image_sum_b, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U( image_sum, image_sqsum, result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3( result); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4( result); cudaSafeCall( cudaGetLastError() ); - 
cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn) } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index 35adafe..bb75f3f 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream) @@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream) diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu index 0a4fa40..28ba2e9 100644 --- a/modules/gpu/src/cuda/matrix_operations.cu +++ b/modules/gpu/src/cuda/matrix_operations.cu @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall ( cudaThreadSynchronize() ); + cudaSafeCall ( cudaDeviceSynchronize() ); } void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const 
cudaStream_t & stream) @@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall ( cudaThreadSynchronize() ); + cudaSafeCall ( cudaDeviceSynchronize() ); } template void set_to_gpu(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream); @@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall ( cudaThreadSynchronize() ); + cudaSafeCall ( cudaDeviceSynchronize() ); } template void set_to_gpu(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream); diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index 43c64cf..59c71de 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -275,11 +275,11 @@ namespace cv { namespace gpu { namespace mathfunc minMaxKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; } @@ -306,11 +306,11 @@ namespace cv { namespace gpu { namespace mathfunc minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - 
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; } @@ -363,11 +363,11 @@ namespace cv { namespace gpu { namespace mathfunc minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; } @@ -395,11 +395,11 @@ namespace cv { namespace gpu { namespace mathfunc minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; } @@ -609,17 +609,17 @@ namespace cv { namespace gpu { namespace mathfunc minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - 
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) ); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } @@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); @@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); @@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); @@ -895,7 +895,7 @@ namespace cv { namespace gpu { 
namespace mathfunc countNonZeroKernel<256, T><<>>(src, count_buf); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); @@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); @@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc break; } - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); @@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); @@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc break; } - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); @@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); @@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc break; } - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 
0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); @@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu index 671832f..361a617 100644 --- a/modules/gpu/src/cuda/split_merge.cu +++ b/modules/gpu/src/cuda/split_merge.cu @@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } @@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } @@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } @@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } @@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } @@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge { cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu index fd802ad..62d1e2b 100644 --- 
a/modules/gpu/src/cuda/stereobm.cu +++ b/modules/gpu/src/cuda/stereobm.cu @@ -102,19 +102,19 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS)); - __syncthreads(); + __syncthreads(); ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS)); int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7]))); @@ -327,8 +327,8 @@ template void kernel_caller(const DevMem2D& left, const DevMem2D& ri stereoKernel<<>>(left.data, right.data, left.step, disp, maxdisp); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); }; typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream); @@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaUnbindTexture (texForSobel ) ); } @@ -531,10 +531,10 @@ extern "C" void postfilter_textureness(const DevMem2D& 
input, int winsz, float a textureness_kernel<<>>(disp, winsz, avgTexturenessThreshold); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture (texForTF) ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaUnbindTexture (texForTF) ); } }}} diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu index 33301a5..04e81db 100644 --- a/modules/gpu/src/cuda/stereobp.cu +++ b/modules/gpu/src/cuda/stereobp.cu @@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template <> void comp_data_gpu(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream) { @@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template <> void comp_data_gpu(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream) @@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template <> void comp_data_gpu(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream) { @@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template <> void comp_data_gpu(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream) @@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); 
+ cudaSafeCall( cudaDeviceSynchronize() ); } template <> void comp_data_gpu(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream) { @@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////// @@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); @@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream); @@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void output_gpu(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_& disp, cudaStream_t stream); diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu index e608cfa..03e160f 100644 --- a/modules/gpu/src/cuda/stereocsbp.cu +++ b/modules/gpu/src/cuda/stereocsbp.cu @@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp 
cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); dim3 threads(32, 8, 1); dim3 grid(1, 1, 1); @@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step, @@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2, @@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index d6e825e..bb697f4 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf icvCalcLayerDetAndTrace<<>>(det, trace); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + 
cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf icvInterpolateKeypoint<<>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf icvCalcOrientation<<>>(featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf compute_descriptors64<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); normalize_descriptors<64><<>>(descriptors); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } else { compute_descriptors128<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); normalize_descriptors<128><<>>(descriptors); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } } }}} diff --git a/modules/gpu/src/cudastream.cpp 
b/modules/gpu/src/cudastream.cpp index d9e4aa8..b1dd03f 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -64,6 +64,8 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { thro void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); } void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); } void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); } +Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; } +cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; } #else /* !defined (HAVE_CUDA) */ @@ -117,7 +119,7 @@ namespace } } -CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; }; +CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? 
stream.impl->stream : 0; }; void cv::gpu::Stream::create() { @@ -188,18 +190,35 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); } void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); } -void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) +void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s) { CV_Assert((src.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, impl->stream) ); + return; + } + if (src.depth() == CV_8U) + { + int cn = src.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, impl->stream) ); + return; + } + } + typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream); static const set_caller_t set_callers[] = { kernelSet, kernelSet, kernelSet, kernelSet, kernelSet, kernelSet, kernelSet }; - set_callers[src.depth()](src, val, impl->stream); + set_callers[src.depth()](src, s, impl->stream); } void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) @@ -246,5 +265,17 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream); } +cv::gpu::Stream::operator bool() const +{ + return impl && impl->stream; +} + +cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {} + +cv::gpu::Stream& 
cv::gpu::Stream::Null() +{ + static Stream s((Impl*)0); + return s; +} #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 4b2737f..7123a83 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -47,35 +47,26 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); } -void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void 
cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } -double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int) {throw_nogpu(); return 0.0;} -double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, const Stream&) {throw_nogpu(); return 0.0;} +void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_not(const 
GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } +double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;} #else @@ -90,7 +81,7 @@ namespace void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4, - npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1) + npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1, cudaStream_t stream) { CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1); @@ -100,6 +91,8 @@ namespace sz.width = src1.cols; sz.height = src1.rows; + NppStreamHandler h(stream); + switch (src1.type()) { case CV_8UC1: @@ -118,7 +111,8 @@ namespace CV_Assert(!"Unsupported source type"); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } template struct NppArithmScalarFunc; @@ -135,7 +129,7 @@ namespace template::func_ptr func> struct NppArithmScalar<1, func> { - static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst) + static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) { dst.create(src.size(), src.type()); @@ -143,14 
+137,17 @@ namespace sz.width = src.cols; sz.height = src.rows; + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, (Npp32f)sc[0], dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppArithmScalar<2, func> { - static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst) + static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) { dst.create(src.size(), src.type()); @@ -162,78 +159,81 @@ namespace nValue.re = (Npp32f)sc[0]; nValue.im = (Npp32f)sc[1]; + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, nValue, dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; } -void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { - nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R); + nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream)); } -void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { - nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R); + nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream)); } -void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { - nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R); + nppArithmCaller(src1, src2, 
dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream)); } -void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { - nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R); + nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, StreamAccessor::getStream(stream)); } -void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst) +void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); + typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); static const caller_t callers[] = {0, NppArithmScalar<1, nppiAddC_32f_C1R>::calc, NppArithmScalar<2, nppiAddC_32fc_C1R>::calc}; CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - callers[src.channels()](src, sc, dst); + callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst) +void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); + typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); static const caller_t callers[] = {0, NppArithmScalar<1, nppiSubC_32f_C1R>::calc, NppArithmScalar<2, nppiSubC_32fc_C1R>::calc}; CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - callers[src.channels()](src, sc, dst); + callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst) +void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { - 
typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); + typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); static const caller_t callers[] = {0, NppArithmScalar<1, nppiMulC_32f_C1R>::calc, NppArithmScalar<2, nppiMulC_32fc_C1R>::calc}; CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - callers[src.channels()](src, sc, dst); + callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst) +void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); + typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); static const caller_t callers[] = {0, NppArithmScalar<1, nppiDivC_32f_C1R>::calc, NppArithmScalar<2, nppiDivC_32fc_C1R>::calc}; CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - callers[src.channels()](src, sc, dst); + callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // Absolute difference -void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) { CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); @@ -245,6 +245,10 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) sz.width = src1.cols; sz.height = src1.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + switch (src1.type()) { case CV_8UC1: @@ -263,22 +267,28 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) CV_Assert(!"Unsupported source type"); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } -void 
cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst) +void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s) { - CV_Assert(src.type() == CV_32FC1); + CV_Assert(src1.type() == CV_32FC1); - dst.create( src.size(), src.type() ); + dst.create( src1.size(), src1.type() ); NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + sz.width = src1.cols; + sz.height = src1.rows; + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiffC_32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, (Npp32f)s[0]) ); + nppSafeCall( nppiAbsDiffC_32f_C1R(src1.ptr(), src1.step, dst.ptr(), dst.step, sz, (Npp32f)src2[0]) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -287,11 +297,11 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst) namespace cv { namespace gpu { namespace mathfunc { - void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst); - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst); + void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); }}} -void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop) +void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s) { CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); @@ -305,34 +315,42 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c sz.width = src1.cols; sz.height = src1.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + if (src1.type() == CV_8UC4) { if (cmpop != CMP_NE) { + NppStreamHandler h(stream); + nppSafeCall( nppiCompare_8u_C4R(src1.ptr(), src1.step, src2.ptr(), src2.step, 
dst.ptr(), dst.step, sz, nppCmpOp[cmpop]) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } else { - mathfunc::compare_ne_8uc4(src1, src2, dst); + mathfunc::compare_ne_8uc4(src1, src2, dst, stream); } } else { if (cmpop != CMP_NE) { + NppStreamHandler h(stream); + nppSafeCall( nppiCompare_32f_C1R(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, nppCmpOp[cmpop]) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } else { - mathfunc::compare_ne_32f(src1, src2, dst); + mathfunc::compare_ne_32f(src1, src2, dst, stream); } } } @@ -383,16 +401,7 @@ namespace } -void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwiseNotCaller(src, dst, 0); - else - ::bitwiseNotCaller(src, dst, mask, 0); -} - - -void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream) +void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) ::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream)); @@ -519,16 +528,7 @@ namespace } -void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwiseOrCaller(src1, src2, dst, 0); - else - ::bitwiseOrCaller(src1, src2, dst, mask, 0); -} - - -void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) +void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) ::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream)); @@ -537,16 +537,7 @@ void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, co } -void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - 
::bitwiseAndCaller(src1, src2, dst, 0); - else - ::bitwiseAndCaller(src1, src2, dst, mask, 0); -} - - -void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) +void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) ::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream)); @@ -555,16 +546,7 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c } -void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwiseXorCaller(src1, src2, dst, 0); - else - ::bitwiseXorCaller(src1, src2, dst, mask, 0); -} - - -void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) +void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) ::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream)); @@ -624,22 +606,7 @@ namespace } } -void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert((src1.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) +void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert((src1.depth() != CV_64F) || @@ -653,22 +620,7 @@ void 
cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str }; funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } - -void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) -{ - CV_Assert((src1.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) +void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) { CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); @@ -682,22 +634,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert((src1.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) +void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) { CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert((src1.depth() != CV_64F) || @@ -712,21 +649,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& 
src2, GpuMat& dst, const Str funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) -{ - CV_Assert((src1.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) +void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) { CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); @@ -760,10 +683,14 @@ namespace } } -double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type) +double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s) { + cudaStream_t stream = StreamAccessor::getStream(s); + if (src.type() == CV_32FC1 && type == THRESH_TRUNC) { + NppStreamHandler h(stream); + dst.create(src.size(), src.type()); NppiSize sz; @@ -773,7 +700,8 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double nppSafeCall( nppiThreshold_32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, static_cast(thresh), NPP_CMP_GREATER) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } else { @@ -801,36 +729,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal = cvRound(maxVal); } - callers[src.depth()](src, dst, thresh, maxVal, type, 0); - } - - return thresh; -} - -double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream) -{ - typedef void 
(*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, - cudaStream_t stream); - - static const caller_t callers[] = - { - threshold_caller, threshold_caller, - threshold_caller, threshold_caller, - threshold_caller, threshold_caller, threshold_caller - }; - - CV_Assert(src.channels() == 1 && src.depth() <= CV_64F); - CV_Assert(type <= THRESH_TOZERO_INV); - - dst.create(src.size(), src.type()); - - if (src.depth() != CV_32F) - { - thresh = cvFloor(thresh); - maxVal = cvRound(maxVal); + callers[src.depth()](src, dst, thresh, maxVal, type, stream); } - callers[src.depth()](src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream)); return thresh; } diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index d8dbc9f..61e15e2 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -66,16 +66,16 @@ Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, doubl Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } -void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point) { throw_nogpu(); } -void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } -void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } -void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); } -void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); } -void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); } -void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); } -void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); } -void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { 
throw_nogpu(); } -void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); } +void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); } +void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, Stream&) { throw_nogpu(); } #else @@ -137,21 +137,25 @@ namespace filter2D(filter2D_), srcType(srcType_), dstType(dstType_) {} - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) + virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) { CV_Assert(src.type() == srcType); Size src_size = src.size(); dst.create(src_size, dstType); - dst = Scalar(0.0); + + if (stream) + stream.enqueueMemSet(dst, Scalar::all(0.0)); + else + dst.setTo(Scalar::all(0.0)); normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); GpuMat srcROI = src(roi); GpuMat dstROI = dst(roi); - (*filter2D)(srcROI, dstROI); + (*filter2D)(srcROI, dstROI, stream); } Ptr filter2D; @@ -181,16 +185,25 @@ namespace anchor = Point(rowFilter->anchor, columnFilter->anchor); } - virtual void apply(const GpuMat& src, 
GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) + virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) { CV_Assert(src.type() == srcType); Size src_size = src.size(); dst.create(src_size, dstType); - dst = Scalar(0.0); dstBuf.create(src_size, bufType); - dstBuf = Scalar(0.0); + + if (stream) + { + stream.enqueueMemSet(dst, Scalar::all(0)); + stream.enqueueMemSet(dstBuf, Scalar::all(0)); + } + else + { + dst = Scalar(0.0); + dstBuf = Scalar(0.0); + } normalizeROI(roi, ksize, anchor, src_size); @@ -198,8 +211,8 @@ namespace GpuMat dstROI = dst(roi); GpuMat dstBufROI = dstBuf(roi); - (*rowFilter)(srcROI, dstBufROI); - (*columnFilter)(dstBufROI, dstROI); + (*rowFilter)(srcROI, dstBufROI, stream); + (*columnFilter)(dstBufROI, dstROI, stream); } Ptr rowFilter; @@ -229,15 +242,20 @@ namespace public: NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, ksize, anchor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; } @@ -258,15 +276,20 @@ namespace public: NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, ksize, anchor) ); - cudaSafeCall( 
cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; } @@ -293,7 +316,7 @@ namespace public: NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; @@ -304,10 +327,15 @@ namespace NppiPoint oAnchor; oAnchor.x = anchor.x; oAnchor.y = anchor.y; + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, oKernelSize, oAnchor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } nppFilterBox_t func; @@ -331,7 +359,7 @@ Ptr cv::gpu::createBoxFilter_GPU(int srcType, int dstType, con return createFilter2D_GPU(boxFilter, srcType, dstType); } -void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor) +void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream) { int sdepth = src.depth(), cn = src.channels(); if( ddepth < 0 ) @@ -340,7 +368,7 @@ void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, dst.create(src.size(), CV_MAKETYPE(ddepth, cn)); Ptr f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor); - f->apply(src, dst); + f->apply(src, dst, Rect(0,0,-1,-1), stream); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -356,7 +384,7 @@ namespace NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) : BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize 
sz; sz.width = src.cols; @@ -368,9 +396,14 @@ namespace oAnchor.x = anchor.x; oAnchor.y = anchor.y; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, kernel.ptr(), oKernelSize, oAnchor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } GpuMat kernel; @@ -404,16 +437,16 @@ namespace MorphologyFilterEngine_GPU(const Ptr& filter2D_, int type, int iters_) : Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {} - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) + virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) { if (iters > 1) morfBuf.create(src.size(), src.type()); - Filter2DEngine_GPU::apply(src, dst); + Filter2DEngine_GPU::apply(src, dst, roi, stream); for(int i = 1; i < iters; ++i) { dst.swap(morfBuf); - Filter2DEngine_GPU::apply(morfBuf, dst); + Filter2DEngine_GPU::apply(morfBuf, dst, roi, stream); } } @@ -435,7 +468,7 @@ Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, cons namespace { - void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations) + void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations, Stream& stream) { Mat kernel; Size ksize = _kernel.data ? 
_kernel.size() : Size(3, 3); @@ -444,7 +477,10 @@ namespace if (iterations == 0 || _kernel.rows * _kernel.cols == 1) { - src.copyTo(dst); + if (stream) + stream.enqueueCopy(src, dst); + else + src.copyTo(dst); return; } @@ -468,49 +504,49 @@ namespace Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); - f->apply(src, dst); + f->apply(src, dst, Rect(0,0,-1,-1), stream); } } -void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations) +void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) { - morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations); + morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, stream); } -void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations) +void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) { - morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations); + morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, stream); } -void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations) +void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations, Stream& stream) { GpuMat temp; switch( op ) { - case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations); break; - case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations); break; + case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations, stream); break; + case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations, stream); break; case MORPH_OPEN: - erode( src, temp, kernel, anchor, iterations); - dilate( temp, dst, kernel, anchor, iterations); + erode( src, temp, kernel, anchor, iterations, stream); + dilate( temp, dst, kernel, anchor, iterations, stream); break; case CV_MOP_CLOSE: - dilate( src, temp, 
kernel, anchor, iterations); - erode( temp, dst, kernel, anchor, iterations); + dilate( src, temp, kernel, anchor, iterations, stream); + erode( temp, dst, kernel, anchor, iterations, stream); break; case CV_MOP_GRADIENT: - erode( src, temp, kernel, anchor, iterations); - dilate( src, dst, kernel, anchor, iterations); - subtract(dst, temp, dst); + erode( src, temp, kernel, anchor, iterations, stream); + dilate( src, dst, kernel, anchor, iterations, stream); + subtract(dst, temp, dst, stream); break; case CV_MOP_TOPHAT: - erode( src, dst, kernel, anchor, iterations); - dilate( dst, temp, kernel, anchor, iterations); - subtract(src, temp, dst); + erode( src, dst, kernel, anchor, iterations, stream); + dilate( dst, temp, kernel, anchor, iterations, stream); + subtract(src, temp, dst, stream); break; case CV_MOP_BLACKHAT: - dilate( src, dst, kernel, anchor, iterations); - erode( dst, temp, kernel, anchor, iterations); - subtract(temp, src, dst); + dilate( src, dst, kernel, anchor, iterations, stream); + erode( dst, temp, kernel, anchor, iterations, stream); + subtract(temp, src, dst, stream); break; default: CV_Error( CV_StsBadArg, "unknown morphological operation" ); @@ -531,7 +567,7 @@ namespace NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) : BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; @@ -542,11 +578,16 @@ namespace NppiPoint oAnchor; oAnchor.x = anchor.x; oAnchor.y = anchor.y; + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, kernel.ptr(), oKernelSize, oAnchor, nDivisor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() 
); } GpuMat kernel; @@ -578,7 +619,7 @@ Ptr cv::gpu::createLinearFilter_GPU(int srcType, int dstType, return createFilter2D_GPU(linearFilter, srcType, dstType); } -void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor) +void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, Stream& stream) { if( ddepth < 0 ) ddepth = src.depth(); @@ -586,7 +627,7 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor); - f->apply(src, dst); + f->apply(src, dst, Rect(0, 0, -1, -1), stream); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -595,10 +636,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke namespace cv { namespace gpu { namespace filters { template - void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template - void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); }}} namespace @@ -606,7 +647,7 @@ namespace typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); - typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + typedef void (*gpuFilter1D_t)(const DevMem2D& src, const 
DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); class NppLinearRowFilter : public BaseRowFilter_GPU { @@ -614,15 +655,20 @@ namespace NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, kernel.ptr(), ksize, anchor, nDivisor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } GpuMat kernel; @@ -636,9 +682,9 @@ namespace GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { - func(src, dst, kernel.ptr(), ksize, anchor, brd_type); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); } Mat kernel; @@ -720,15 +766,20 @@ namespace NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, dst.ptr(), 
dst.step, sz, kernel.ptr(), ksize, anchor, nDivisor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } GpuMat kernel; @@ -742,9 +793,9 @@ namespace GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { - func(src, dst, kernel.ptr(), ksize, anchor, brd_type); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); } Mat kernel; @@ -834,7 +885,8 @@ Ptr cv::gpu::createSeparableLinearFilter_GPU(int srcType, int return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType); } -void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType) +void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType, + Stream& stream) { if( ddepth < 0 ) ddepth = src.depth(); @@ -842,7 +894,7 @@ void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType); - f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -855,7 +907,7 @@ Ptr cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::Sobel(const 
GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType) +void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); @@ -870,10 +922,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); } -void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType) +void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); @@ -888,10 +940,10 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); } -void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale) +void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, Stream& stream) { CV_Assert(ksize == 1 || ksize == 3); @@ -904,7 +956,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d if (scale != 1) kernel *= scale; - filter2D(src, dst, ddepth, kernel, Point(-1,-1)); + filter2D(src, dst, ddepth, kernel, Point(-1,-1), stream); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -938,7 +990,7 @@ Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do return 
createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType) +void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream) { if (ksize.width == 1 && ksize.height == 1) { @@ -949,7 +1001,7 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si dst.create(src.size(), src.type()); Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType); - f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -965,7 +1017,7 @@ namespace public: NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} - virtual void operator()(const GpuMat& src, GpuMat& dst) + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) { NppiSize sz; sz.width = src.cols; @@ -976,10 +1028,15 @@ namespace NppiPoint oAnchor; oAnchor.x = anchor.x; oAnchor.y = anchor.y; + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, oKernelSize, oAnchor) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } nppFilterRank_t func; diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp index 87fbd56..00f734b 100644 --- a/modules/gpu/src/graphcuts.cpp +++ b/modules/gpu/src/graphcuts.cpp @@ -44,11 +44,11 @@ #if !defined (HAVE_CUDA) -void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } +void 
cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ -void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf) +void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s) { Size src_size = terminals.size(); CV_Assert(terminals.type() == CV_32S); @@ -73,17 +73,17 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize()) buf.create(1, bufsz, CV_8U); + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiGraphcut_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), top.ptr(), bottom.ptr(), terminals.step, leftTransp.step, sznpp, labels.ptr(), labels.step, buf.ptr()) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } #endif /* !defined (HAVE_CUDA) */ - - - - - diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 92a0cbf..38a2c35 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -50,30 +50,24 @@ using namespace cv::gpu; void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&){ throw_nogpu(); } void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); } -void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int) { throw_nogpu(); } -void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, const Stream&) { throw_nogpu(); } -void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&) { throw_nogpu(); } -void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, const Stream&) 
{ throw_nogpu(); } -void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int) { throw_nogpu(); } -void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&) { throw_nogpu(); } -void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); } -void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); } -void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); } -void cv::gpu::integral(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } +void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); } +void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); } +void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); } +void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); } +void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); } +void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); } +void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const 
Rect&) { throw_nogpu(); } -//void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); } -//void cv::gpu::Canny(const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); } -//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); } -//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); } +void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); } void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_nogpu(); } -void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } -void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*) { throw_nogpu(); } -void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu(); } +void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); } +void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); } void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); } @@ -203,14 +197,7 @@ namespace const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller, 0, 0, drawColorDisp_caller, 0, 0, 0, 0}; } -void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp) -{ - CV_Assert(src.type() == CV_8U || src.type() == CV_16S); - - drawColorDisp_callers[src.type()](src, dst, ndisp, 0); -} - -void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& 
dst, int ndisp, const Stream& stream) +void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream) { CV_Assert(src.type() == CV_8U || src.type() == CV_16S); @@ -234,14 +221,7 @@ namespace const reprojectImageTo3D_caller_t reprojectImageTo3D_callers[] = {reprojectImageTo3D_caller, 0, 0, reprojectImageTo3D_caller, 0, 0, 0, 0}; } -void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q) -{ - CV_Assert((disp.type() == CV_8U || disp.type() == CV_16S) && Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4); - - reprojectImageTo3D_callers[disp.type()](disp, xyzw, Q, 0); -} - -void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const Stream& stream) +void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream) { CV_Assert((disp.type() == CV_8U || disp.type() == CV_16S) && Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4); @@ -251,7 +231,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, //////////////////////////////////////////////////////////////////////// // resize -void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation) +void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s) { static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR/*, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS*/}; @@ -284,6 +264,10 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub dstsz.width = dst.cols; dstsz.height = dst.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + if (src.type() == CV_8UC1) { nppSafeCall( nppiResize_8u_C1R(src.ptr(), srcsz, src.step, srcrect, @@ -295,13 +279,14 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub dst.ptr(), dst.step, dstsz, fx, fy, npp_inter[interpolation]) ); } - cudaSafeCall( 
cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // copyMakeBorder -void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value) +void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value, Stream& s) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1); @@ -314,6 +299,10 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom dstsz.width = dst.cols; dstsz.height = dst.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + switch (src.type()) { case CV_8UC1: @@ -349,7 +338,8 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom CV_Assert(!"Unsupported source type"); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// @@ -372,7 +362,7 @@ namespace void nppWarpCaller(const GpuMat& src, GpuMat& dst, double coeffs[][3], const Size& dsize, int flags, npp_warp_8u_t npp_warp_8u[][2], npp_warp_16u_t npp_warp_16u[][2], - npp_warp_32s_t npp_warp_32s[][2], npp_warp_32f_t npp_warp_32f[][2]) + npp_warp_32s_t npp_warp_32s[][2], npp_warp_32f_t npp_warp_32f[][2], cudaStream_t stream) { static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; @@ -397,6 +387,8 @@ namespace int warpInd = (flags & WARP_INVERSE_MAP) >> 4; + NppStreamHandler h(stream); + switch (src.depth()) { case CV_8U: @@ -419,11 +411,12 @@ namespace CV_Assert(!"Unsupported source type"); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } } -void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, 
Size dsize, int flags) +void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s) { static npp_warp_8u_t npp_warpAffine_8u[][2] = { @@ -464,10 +457,10 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz Mat coeffsMat(2, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); - nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpAffine_8u, npp_warpAffine_16u, npp_warpAffine_32s, npp_warpAffine_32f); + nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpAffine_8u, npp_warpAffine_16u, npp_warpAffine_32s, npp_warpAffine_32f, StreamAccessor::getStream(s)); } -void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags) +void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s) { static npp_warp_8u_t npp_warpPerspective_8u[][2] = { @@ -508,13 +501,13 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); - nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f); + nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f, StreamAccessor::getStream(s)); } //////////////////////////////////////////////////////////////////////// // rotate -void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation) +void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& s) { static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; @@ -535,6 +528,10 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d dstroi.height = 
dst.rows; dstroi.width = dst.cols; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + if (src.type() == CV_8UC1) { nppSafeCall( nppiRotate_8u_C1R(src.ptr(), srcsz, src.step, srcroi, @@ -546,19 +543,20 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d dst.ptr(), dst.step, dstroi, angle, xShift, yShift, npp_inter[interpolation]) ); } - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } //////////////////////////////////////////////////////////////////////// // integral -void cv::gpu::integral(const GpuMat& src, GpuMat& sum) +void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s) { GpuMat buffer; - integralBuffered(src, sum, buffer); + integralBuffered(src, sum, buffer, s); } -void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer) +void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s) { CV_Assert(src.type() == CV_8UC1); @@ -575,35 +573,45 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer) nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStStreamHandler h(stream); + nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), src.step, sum.ptr(), sum.step, roiSize, buffer.ptr(), bufSize, prop) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } -void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum) +void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s) { CV_Assert(src.type() == CV_8UC1); - int w = src.cols + 1, h = src.rows + 1; + int width = src.cols + 1, height = src.rows + 1; - sum.create(h, w, CV_32S); - sqsum.create(h, w, CV_32F); + sum.create(height, width, CV_32S); + sqsum.create(height, width, CV_32F); NppiSize 
sz; sz.width = src.cols; sz.height = src.rows; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast(src.ptr()), src.step, sum.ptr(), - sum.step, sqsum.ptr(), sqsum.step, sz, 0, 0.0f, h) ); + sum.step, sqsum.ptr(), sqsum.step, sz, 0, 0.0f, height) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////////// // sqrIntegral -void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum) +void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) { CV_Assert(src.type() == CV_8U); @@ -618,11 +626,16 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum) nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); GpuMat buf(1, bufSize, CV_8U); + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStStreamHandler h(stream); + sqsum.create(src.rows + 1, src.cols + 1, CV_64F); nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), src.step, sqsum.ptr(0), sqsum.step, roiSize, buf.ptr(0), bufSize, prop)); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } ////////////////////////////////////////////////////////////////////////////// @@ -641,7 +654,7 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst) imgproc::columnSum_32F(src, dst); } -void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect) +void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s) { CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1); @@ -657,69 +670,17 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons nppRect.x = rect.x; nppRect.y = rect.y; + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + nppSafeCall( 
nppiRectStdDev_32s32f_C1R(src.ptr(), src.step, sqr.ptr(), sqr.step, dst.ptr(), dst.step, sz, nppRect) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } -//////////////////////////////////////////////////////////////////////// -// Canny - -//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize) -//{ -// CV_Assert(!"disabled until fix crash"); -// -// GpuMat srcDx, srcDy; -// -// Sobel(image, srcDx, CV_32F, 1, 0, apertureSize); -// Sobel(image, srcDy, CV_32F, 0, 1, apertureSize); -// -// GpuMat buf; -// -// Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize); -//} -// -//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize) -//{ -// CV_Assert(!"disabled until fix crash"); -// -// GpuMat srcDx, srcDy; -// -// Sobel(image, srcDx, CV_32F, 1, 0, apertureSize); -// Sobel(image, srcDy, CV_32F, 0, 1, apertureSize); -// -// Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize); -//} -// -//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize) -//{ -// CV_Assert(!"disabled until fix crash"); -// -// GpuMat buf; -// Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize); -//} -// -//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize) -//{ -// CV_Assert(!"disabled until fix crash"); -// CV_Assert(srcDx.type() == CV_32FC1 && srcDy.type() == CV_32FC1 && srcDx.size() == srcDy.size()); -// -// edges.create(srcDx.size(), CV_8UC1); -// -// NppiSize sz; -// sz.height = srcDx.rows; -// sz.width = srcDx.cols; -// -// int bufsz; -// nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) ); -// ensureSizeIsEnough(1, bufsz, CV_8UC1, buf); -// -// nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr(), 
srcDx.step, srcDy.ptr(), srcDy.step, -// edges.ptr(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr()) ); -// -// cudaSafeCall( cudaThreadSynchronize() ); -//} //////////////////////////////////////////////////////////////////////// // Histogram @@ -755,7 +716,7 @@ namespace { typedef typename NppHistogramEvenFuncC1::src_t src_t; - static void hist(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel) + static void hist(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream) { int levels = histSize + 1; hist.create(1, histSize, CV_32S); @@ -769,10 +730,14 @@ namespace get_buf_size(sz, levels, &buf_size); buffer.create(1, buf_size, CV_8U); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, sz, hist.ptr(), levels, lowerLevel, upperLevel, buffer.ptr()) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func, get_buf_size_c4_t get_buf_size> @@ -780,7 +745,7 @@ namespace { typedef typename NppHistogramEvenFuncC4::src_t src_t; - static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4]) + static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream) { int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1}; hist[0].create(1, histSize[0], CV_32S); @@ -799,9 +764,13 @@ namespace get_buf_size(sz, levels, &buf_size); buffer.create(1, buf_size, CV_8U); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr()) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -849,7 +818,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static 
void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels) + static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels, cudaStream_t stream) { CV_Assert(levels.type() == LEVEL_TYPE_CODE && levels.rows == 1); @@ -864,9 +833,13 @@ namespace get_buf_size(sz, levels.cols, &buf_size); buffer.create(1, buf_size, CV_8U); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, sz, hist.ptr(), levels.ptr(), levels.cols, buffer.ptr()) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func, get_buf_size_c4_t get_buf_size> @@ -876,7 +849,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4]) + static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], cudaStream_t stream) { CV_Assert(levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1); CV_Assert(levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1); @@ -901,9 +874,13 @@ namespace get_buf_size(sz, nLevels, &buf_size); buffer.create(1, buf_size, CV_8U); + + NppStreamHandler h(stream); + nppSafeCall( func(src.ptr(), src.step, sz, pHist, pLevels, nLevels, buffer.ptr()) ); - cudaSafeCall( cudaThreadSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } }; } @@ -915,11 +892,11 @@ void cv::gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperL levels.upload(host_levels); } -void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel) +void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 ); - typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, int levels, int lowerLevel, int upperLevel); + 
typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, int levels, int lowerLevel, int upperLevel, cudaStream_t stream); static const hist_t hist_callers[] = { NppHistogramEvenC1::hist, @@ -928,14 +905,14 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerL NppHistogramEvenC1::hist }; - hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); } -void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4]) +void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) { CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 ); - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4]); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream); static const hist_t hist_callers[] = { NppHistogramEvenC4::hist, @@ -944,14 +921,14 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int l NppHistogramEvenC4::hist }; - hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); } -void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels) +void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1); - typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels); + typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels, cudaStream_t stream); static const hist_t hist_callers[] = { 
NppHistogramRangeC1::hist, @@ -962,14 +939,14 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels) NppHistogramRangeC1::hist }; - hist_callers[src.depth()](src, hist, levels); + hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream)); } -void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4]) +void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream) { CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4); - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4]); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], cudaStream_t stream); static const hist_t hist_callers[] = { NppHistogramRangeC4::hist, @@ -980,7 +957,7 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4 NppHistogramRangeC4::hist }; - hist_callers[src.depth()](src, hist, levels); + hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp index c60fcdb..3d80280 100644 --- a/modules/gpu/src/matrix_operations.cpp +++ b/modules/gpu/src/matrix_operations.cpp @@ -128,7 +128,7 @@ void cv::gpu::GpuMat::copyTo( GpuMat& m ) const CV_DbgAssert(!this->empty()); m.create(size(), type()); cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void cv::gpu::GpuMat::copyTo( GpuMat& mat, const GpuMat& mask ) const @@ -179,7 +179,7 @@ namespace sz.height = src.rows; nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; 
template::func_ptr func> struct NppCvt @@ -193,7 +193,7 @@ namespace sz.height = src.rows; nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, NPP_RND_NEAR) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -349,7 +349,7 @@ namespace Scalar_ nppS = s; nppSafeCall( func(nppS.val, src.ptr(), src.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSet @@ -364,7 +364,7 @@ namespace Scalar_ nppS = s; nppSafeCall( func(nppS[0], src.ptr(), src.step, sz) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -400,7 +400,7 @@ namespace Scalar_ nppS = s; nppSafeCall( func(nppS.val, src.ptr(), src.step, sz, mask.ptr(), mask.step) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSetMask @@ -415,7 +415,7 @@ namespace Scalar_ nppS = s; nppSafeCall( func(nppS[0], src.ptr(), src.step, sz, mask.ptr(), mask.step) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -463,8 +463,8 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask) { {NppSet::set,kernelSet,kernelSet,NppSet::set}, {kernelSet,kernelSet,kernelSet,kernelSet}, - {NppSet::set,kernelSet,kernelSet,NppSet::set}, - {NppSet::set,kernelSet,kernelSet,NppSet::set}, + {NppSet::set,NppSet::set,kernelSet,NppSet::set}, + {NppSet::set,NppSet::set,kernelSet,NppSet::set}, {NppSet::set,kernelSet,kernelSet,NppSet::set}, {NppSet::set,kernelSet,kernelSet,NppSet::set}, {kernelSet,kernelSet,kernelSet,kernelSet}, diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index d182fd0..a2b4836 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -114,24 +114,14 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) sz.width = 
src.cols; sz.height = src.rows; -#if NPP_VERSION_MAJOR >= 4 - DeviceBuffer dbuf(2); nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), src.step, sz, dbuf, (double*)dbuf + 1) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); double* ptrs[2] = {mean.val, stddev.val}; dbuf.download(ptrs); - -#else - - nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), src.step, sz, mean.val, stddev.val) ); - - cudaSafeCall( cudaThreadSynchronize() ); - -#endif } @@ -184,25 +174,15 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType) int funcIdx = normType >> 1; double retVal; - -#if NPP_VERSION_MAJOR >= 4 DeviceBuffer dbuf; nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr(), src1.step, src2.ptr(), src2.step, sz, dbuf) ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); dbuf.download(&retVal); -#else - - nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr(), src1.step, src2.ptr(), src2.step, sz, &retVal) ); - - cudaSafeCall( cudaThreadSynchronize() ); - -#endif - return retVal; } diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index de24836..a954e1b 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -332,7 +332,7 @@ namespace cv cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template @@ -349,7 +349,7 @@ namespace cv cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; template<> struct TransformDispatcher @@ -370,7 +370,7 @@ namespace cv cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template @@ -389,7 +389,7 @@ namespace cv cudaSafeCall( cudaGetLastError() ); if (stream == 0) - 
cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index 7a5441b..022b26a 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -77,8 +77,8 @@ #include "nvidia/NPP_staging/NPP_staging.hpp" #include "nvidia/NCVHaarObjectDetection.hpp" -#define CUDART_MINIMUM_REQUIRED_VERSION 3020 -#define NPP_MINIMUM_REQUIRED_VERSION 3216 +#define CUDART_MINIMUM_REQUIRED_VERSION 4000 +#define NPP_MINIMUM_REQUIRED_VERSION 4000 #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) #error "Insufficient Cuda Runtime library version, please update it." diff --git a/modules/gpu/src/split_merge.cpp b/modules/gpu/src/split_merge.cpp index 03bd74c..3b3a312 100644 --- a/modules/gpu/src/split_merge.cpp +++ b/modules/gpu/src/split_merge.cpp @@ -46,14 +46,10 @@ using namespace std; #if !defined (HAVE_CUDA) -void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); } -void cv::gpu::merge(const vector& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); } -void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); } -void cv::gpu::merge(const vector& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); } -void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); } -void cv::gpu::split(const GpuMat& /*src*/, vector& /*dst*/) { throw_nogpu(); } -void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); } -void cv::gpu::split(const GpuMat& /*src*/, vector& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); } +void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); } +void cv::gpu::merge(const vector& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); } +void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) 
{ throw_nogpu(); } +void cv::gpu::split(const GpuMat& /*src*/, vector& /*dst*/, Stream& /*stream*/) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -148,51 +144,25 @@ namespace cv { namespace gpu { namespace split_merge }}} -void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst) -{ - split_merge::merge(src, n, dst, 0); -} - - -void cv::gpu::merge(const vector& src, GpuMat& dst) -{ - split_merge::merge(&src[0], src.size(), dst, 0); -} - - -void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream) +void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream) { split_merge::merge(src, n, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::merge(const vector& src, GpuMat& dst, const Stream& stream) +void cv::gpu::merge(const vector& src, GpuMat& dst, Stream& stream) { split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream)); } -void cv::gpu::split(const GpuMat& src, GpuMat* dst) -{ - split_merge::split(src, dst, 0); -} - - -void cv::gpu::split(const GpuMat& src, vector& dst) -{ - dst.resize(src.channels()); - if(src.channels() > 0) - split_merge::split(src, &dst[0], 0); -} - - -void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream) +void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream) { split_merge::split(src, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::split(const GpuMat& src, vector& dst, const Stream& stream) +void cv::gpu::split(const GpuMat& src, vector& dst, Stream& stream) { dst.resize(src.channels()); if(src.channels() > 0) diff --git a/modules/gpu/src/stereobm.cpp b/modules/gpu/src/stereobm.cpp index 67a0220..f29758b 100644 --- a/modules/gpu/src/stereobm.cpp +++ b/modules/gpu/src/stereobm.cpp @@ -51,8 +51,7 @@ cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_nogpu(); } cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_nogpu(); } bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_nogpu(); return false; } -void 
cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -124,13 +123,7 @@ static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& ri bm::postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream); } - -void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity) -{ - ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, 0); -} - -void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream& stream) +void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream) { ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream)); } diff --git a/modules/gpu/src/stereobp.cpp b/modules/gpu/src/stereobp.cpp index f4dd8f1..ea151fc 100644 --- a/modules/gpu/src/stereobp.cpp +++ b/modules/gpu/src/stereobp.cpp @@ -53,10 +53,8 @@ void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_nogpu(); } cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_nogpu(); } -void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); } void 
cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -133,7 +131,7 @@ namespace CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < numeric_limits::max()); } - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream) + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { typedef void (*comp_data_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream); static const comp_data_t comp_data_callers[2][5] = @@ -154,16 +152,16 @@ namespace const int min_image_dim_size = 2; CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size); - init(); + init(stream); datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type); - comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], stream); + comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream)); calcBP(disp, stream); } - void operator()(const GpuMat& data, GpuMat& disp, cudaStream_t stream) + void operator()(const GpuMat& data, GpuMat& disp, Stream& stream) { CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0)); @@ -176,14 +174,14 @@ namespace const int min_image_dim_size = 2; CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size); - init(); + init(stream); datas[0] = data; calcBP(disp, stream); } private: - void init() + void init(Stream& stream) { u.create(rows * rthis.ndisp, cols, rthis.msg_type); d.create(rows * rthis.ndisp, cols, rthis.msg_type); @@ -193,10 +191,20 @@ namespace if (rthis.levels & 1) { //can clear less area - u = zero; - d = zero; - l = zero; - r = zero; + if (stream) + { + stream.enqueueMemSet(u, zero); + stream.enqueueMemSet(d, zero); + stream.enqueueMemSet(l, zero); + stream.enqueueMemSet(r, zero); + } + else + { + u.setTo(zero); + d.setTo(zero); + 
l.setTo(zero); + r.setTo(zero); + } } if (rthis.levels > 1) @@ -211,10 +219,20 @@ namespace if ((rthis.levels & 1) == 0) { - u2 = zero; - d2 = zero; - l2 = zero; - r2 = zero; + if (stream) + { + stream.enqueueMemSet(u2, zero); + stream.enqueueMemSet(d2, zero); + stream.enqueueMemSet(l2, zero); + stream.enqueueMemSet(r2, zero); + } + else + { + u2.setTo(zero); + d2.setTo(zero); + l2.setTo(zero); + r2.setTo(zero); + } } } @@ -229,7 +247,7 @@ namespace rows_all[0] = rows; } - void calcBP(GpuMat& disp, cudaStream_t stream) + void calcBP(GpuMat& disp, Stream& stream) { using namespace cv::gpu::bp; @@ -259,6 +277,8 @@ namespace const int funcIdx = rthis.msg_type == CV_32F; + cudaStream_t cudaStream = StreamAccessor::getStream(stream); + for (int i = 1; i < rthis.levels; ++i) { cols_all[i] = (cols_all[i-1] + 1) / 2; @@ -266,7 +286,7 @@ namespace datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type); - data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], stream); + data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream); } DevMem2D mus[] = {u, u2}; @@ -280,9 +300,9 @@ namespace { // for lower level we have already computed messages by setting to zero if (i != rthis.levels - 1) - level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, stream); + level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, cudaStream); - calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], stream); + calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], cudaStream); mem_idx = (mem_idx + 1) & 1; } @@ -291,12 +311,21 @@ namespace disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? 
disp : (out.create(rows, cols, CV_16S), out)); - out = zero; - output_callers[funcIdx](u, d, l, r, datas.front(), out, stream); + if (stream) + stream.enqueueMemSet(out, zero); + else + out.setTo(zero); + + output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream); if (disp.type() != CV_16S) - out.convertTo(disp, disp.type()); + { + if (stream) + stream.enqueueConvert(out, disp, disp.type()); + else + out.convertTo(disp, disp.type()); + } } StereoBeliefPropagation& rthis; @@ -323,28 +352,16 @@ namespace }; } -void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp) -{ - ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); - impl(left, right, disp, 0); -} - void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); - impl(left, right, disp, StreamAccessor::getStream(stream)); -} - -void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp) -{ - ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); - impl(data, disp, 0); + impl(left, right, disp, stream); } void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream) { ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); - impl(data, disp, StreamAccessor::getStream(stream)); + impl(data, disp, stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/stereocsbp.cpp b/modules/gpu/src/stereocsbp.cpp index d29c61c..25beb8b 100644 --- a/modules/gpu/src/stereocsbp.cpp +++ b/modules/gpu/src/stereocsbp.cpp @@ -53,7 +53,6 @@ void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, i cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) { throw_nogpu(); } cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, 
int, int, int, float, float, float, float, int, int) { throw_nogpu(); } -void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -136,7 +135,7 @@ cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, in template static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2], GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected, - GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream) + GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); @@ -145,6 +144,8 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2] const Scalar zero = Scalar::all(0); + cudaStream_t cudaStream = StreamAccessor::getStream(stream); + //////////////////////////////////////////////////////////////////////////////////////////// // Init @@ -210,18 +211,36 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2] csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); - l[0] = zero; - d[0] = zero; - r[0] = zero; - u[0] = zero; - - l[1] = zero; - d[1] = zero; - r[1] = zero; - u[1] = zero; - - data_cost = zero; - data_cost_selected = zero; + if (stream) + { + stream.enqueueMemSet(l[0], zero); + stream.enqueueMemSet(d[0], zero); + stream.enqueueMemSet(r[0], zero); + stream.enqueueMemSet(u[0], zero); + + stream.enqueueMemSet(l[1], zero); + stream.enqueueMemSet(d[1], zero); + 
stream.enqueueMemSet(r[1], zero); + stream.enqueueMemSet(u[1], zero); + + stream.enqueueMemSet(data_cost, zero); + stream.enqueueMemSet(data_cost_selected, zero); + } + else + { + l[0].setTo(zero); + d[0].setTo(zero); + r[0].setTo(zero); + u[0].setTo(zero); + + l[1].setTo(zero); + d[1].setTo(zero); + r[1].setTo(zero); + u[1].setTo(zero); + + data_cost.setTo(zero); + data_cost_selected.setTo(zero); + } int cur_idx = 0; @@ -230,12 +249,12 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2] if (i == levels - 1) { csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr(), data_cost_selected.ptr(), - step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, stream); + step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream); } else { csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr(), data_cost.ptr(), step_pyr[i], step_pyr[i+1], - left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream); + left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream); int new_idx = (cur_idx + 1) & 1; @@ -243,46 +262,49 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2] u[cur_idx].ptr(), d[cur_idx].ptr(), l[cur_idx].ptr(), r[cur_idx].ptr(), disp_selected_pyr[new_idx].ptr(), disp_selected_pyr[cur_idx].ptr(), data_cost_selected.ptr(), data_cost.ptr(), step_pyr[i], step_pyr[i+1], rows_pyr[i], - cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream); + cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream); cur_idx = new_idx; } csbp::calc_all_iterations(u[cur_idx].ptr(), d[cur_idx].ptr(), l[cur_idx].ptr(), r[cur_idx].ptr(), data_cost_selected.ptr(), disp_selected_pyr[cur_idx].ptr(), step_pyr[i], - 
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, stream); + rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream); } if (disp.empty()) disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out)); - out = zero; + + if (stream) + stream.enqueueMemSet(out, zero); + else + out.setTo(zero); csbp::compute_disp(u[cur_idx].ptr(), d[cur_idx].ptr(), l[cur_idx].ptr(), r[cur_idx].ptr(), - data_cost_selected.ptr(), disp_selected_pyr[cur_idx].ptr(), step_pyr[0], out, nr_plane_pyr[0], stream); + data_cost_selected.ptr(), disp_selected_pyr[cur_idx].ptr(), step_pyr[0], out, nr_plane_pyr[0], cudaStream); if (disp.type() != CV_16S) - out.convertTo(disp, disp.type()); + { + if (stream) + stream.enqueueConvert(out, disp, disp.type()); + else + out.convertTo(disp, disp.type()); + } } typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2], GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected, - GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream); + GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream); const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator, 0, csbp_operator, 0, 0}; -void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp) -{ - CV_Assert(msg_type == CV_32F || msg_type == CV_16S); - operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, 0); -} - void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_Assert(msg_type == CV_32F || msg_type == CV_16S); - operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, StreamAccessor::getStream(stream)); + operators[msg_type](*this, u, d, l, r, 
disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, stream); } #endif /* !defined (HAVE_CUDA) */ -- 2.7.4