From d4087f19a2aa38c00b101b01d06c60dc70edf5d0 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 11 Dec 2013 16:38:30 +0400 Subject: [PATCH] All CUDA related stuff were moved to separate dynamic library. --- modules/core/CMakeLists.txt | 23 +- modules/core/cuda/CMakeLists.txt | 11 + modules/core/cuda/main.cpp | 23 + modules/core/include/opencv2/core/gpumat.hpp | 2 + modules/core/src/gpumat.cpp | 1143 ++------------------------ modules/core/src/gpumat_cuda.hpp | 1069 ++++++++++++++++++++++++ 6 files changed, 1200 insertions(+), 1071 deletions(-) create mode 100644 modules/core/cuda/CMakeLists.txt create mode 100644 modules/core/cuda/main.cpp create mode 100644 modules/core/src/gpumat_cuda.hpp diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 66b8ae0..5951982 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,22 +1,27 @@ set(the_description "The Core Functionality") -ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) -ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() -if(HAVE_CUDA) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -endif() - file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) +if(DYNAMIC_CUDA_SUPPORT) + add_definitions(-DDYNAMIC_CUDA_SUPPORT) +endif() + +ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) + +if(HAVE_CUDA) + ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +endif() + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) @@ -25,3 +30,7 @@ ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() + +if(DYNAMIC_CUDA_SUPPORT) + add_subdirectory(cuda) +endif() diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt new file mode 100644 index 0000000..0b1c942 --- /dev/null +++ b/modules/core/cuda/CMakeLists.txt @@ -0,0 +1,11 @@ +project(opencv_core_cuda) +set(HAVE_CUDA FALSE) +add_definitions("-DHAVE_CUDA") +include_directories(${CUDA_INCLUDE_DIRS} + "../src/" + "../include/opencv2/core/" + "${OpenCV_SOURCE_DIR}/modules/gpu/include" + ) +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu) +target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES}) \ No newline at end of file diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp new file mode 100644 index 0000000..c4b8cbe --- /dev/null +++ b/modules/core/cuda/main.cpp @@ -0,0 +1,23 @@ +#include "opencv2/core/core.hpp" +#include "opencv2/core/gpumat.hpp" + +#ifdef HAVE_CUDA +#include +#include + +#define CUDART_MINIMUM_REQUIRED_VERSION 4020 +#define NPP_MINIMUM_REQUIRED_VERSION 4200 + +#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) +#error "Insufficient Cuda Runtime library version, please update it." 
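// Together with the NPP check just below, this guard pins the minimum
// dependency versions for the standalone CUDA backend: CUDART_VERSION must
// report at least 4020 (CUDA Runtime 4.2), and the NPP version, computed as
// NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD,
// must be at least 4200 (NPP 4.2.0).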
+#endif + +#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) +#error "Insufficient NPP version, please update it." +#endif +#endif + +using namespace cv; +using namespace cv::gpu; + +#include "gpumat_cuda.hpp" \ No newline at end of file diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 193c9aa..b502102 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -48,6 +48,8 @@ #include "opencv2/core/core.hpp" #include "opencv2/core/cuda_devptrs.hpp" +#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") + namespace cv { namespace gpu { //////////////////////////////// Initialization & Info //////////////////////// diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 4c4af61..9a2e36c 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -44,7 +44,7 @@ #include "opencv2/core/gpumat.hpp" #include -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) #include #include @@ -64,489 +64,62 @@ using namespace std; using namespace cv; using namespace cv::gpu; -#ifndef HAVE_CUDA - -#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") - -#else // HAVE_CUDA +#include "gpumat_cuda.hpp" namespace { -#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) -#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + const GpuFuncTable* gpuFuncTable() { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } + static EmptyFuncTable funcTable; + return &funcTable; } } -#endif // HAVE_CUDA - //////////////////////////////// Initialization & Info //////////////////////// -#ifndef HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() { return 0; } - -void cv::gpu::setDevice(int) { throw_nogpu; } -int cv::gpu::getDevice() { throw_nogpu; return 0; } - -void cv::gpu::resetDevice() { throw_nogpu; } - -bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; } - -bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; } - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; } -void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; } -size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; } -size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; } -bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return 
false; } -bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; } -void cv::gpu::DeviceInfo::query() { throw_nogpu; } - -void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; } -void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; } - -#else // HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() -{ - int count; - cudaError_t error = cudaGetDeviceCount( &count ); - - if (error == cudaErrorInsufficientDriver) - return -1; - - if (error == cudaErrorNoDevice) - return 0; - - cudaSafeCall( error ); - return count; -} - -void cv::gpu::setDevice(int device) -{ - cudaSafeCall( cudaSetDevice( device ) ); -} - -int cv::gpu::getDevice() -{ - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; -} - -void cv::gpu::resetDevice() -{ - cudaSafeCall( cudaDeviceReset() ); -} - -namespace -{ - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() - { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } - - bool CudaArch::builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } - - bool CudaArch::hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool CudaArch::hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - void CudaArch::fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } -} - -bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) -{ - return cudaArch.builtWith(feature_set); -} - -bool cv::gpu::TargetArchs::has(int major, int minor) -{ - return hasPtx(major, minor) || hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) -{ - return cudaArch.hasPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasBin(int major, int minor) -{ - return cudaArch.hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) -{ - return cudaArch.hasEqualOrLessPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) -{ - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterPtx(major, minor); -} +int cv::gpu::getCudaEnabledDeviceCount() { return 
gpuFuncTable()->getCudaEnabledDeviceCount(); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterBin(major, minor); -} +void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); } -bool cv::gpu::deviceSupports(FeatureSet feature_set) -{ - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); +void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); } - const int devId = getDevice(); +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); } - int version; +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } - return TargetArchs::builtWith(feature_set) && (version >= feature_set); -} +void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } -namespace -{ - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - - DeviceProps::DeviceProps() - { - props_.resize(10, 0); - } - - DeviceProps::~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* DeviceProps::get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); 
- } - - return props_[devID]; - } - - DeviceProps deviceProps; -} - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const -{ - return deviceProps.get(device_id_)->sharedMemPerBlock; -} - -void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const -{ - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); -} - -size_t cv::gpu::DeviceInfo::freeMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; -} - -size_t cv::gpu::DeviceInfo::totalMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; -} - -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const -{ - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; -} - -bool cv::gpu::DeviceInfo::isCompatible() const -{ - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; -} - -void cv::gpu::DeviceInfo::query() -{ - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; -} - -namespace -{ - int convertSMVer2Cores(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } -} - -void cv::gpu::printCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); -} +#ifdef HAVE_CUDA -void cv::gpu::printShortCudaDeviceInfo(int device) +namespace cv { namespace gpu { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); -} + CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); +}} -#endif // HAVE_CUDA +#endif //////////////////////////////// GpuMat /////////////////////////////// @@ -830,601 +403,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) return mat = GpuMat(rows, cols, type); } -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { 
throw_nogpu; } - - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace device -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, 
kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), 
static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - 
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, 
cv::gpu::convertTo }, - /* 16S -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] 
== s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo }, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - void cv::gpu::GpuMat::upload(const Mat& m) { CV_DbgAssert(!m.empty()); @@ -1492,9 +470,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet dst.create(size(), rtype); if (noScale) - gpuFuncTable()->convert(*psrc, dst); + cv::gpu::convertTo(*psrc, dst); else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); + cv::gpu::convertTo(*psrc, dst, alpha, beta); } GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) @@ -1502,7 +480,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) CV_Assert(mask.empty() || mask.type() == CV_8UC1); CV_DbgAssert(!empty()); - gpuFuncTable()->setTo(*this, s, mask); + gpu::setTo(*this, s, mask); return *this; } @@ -1562,6 +540,43 @@ void cv::gpu::GpuMat::release() refcount = 0; } +#ifdef HAVE_CUDA + +namespace cv { namespace gpu +{ + void convertTo(const GpuMat& src, GpuMat& dst) + { + gpuFuncTable()->convert(src, dst); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + gpuFuncTable()->convert(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, stream); 
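// These thin cv::gpu wrappers (convertTo/setTo in this block) simply forward
// to whichever GpuFuncTable implementation gpuFuncTable() returns, so callers
// keep the same entry points whether the CUDA backend is linked in directly
// or supplied by the separate opencv_core_cuda library.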
+ } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } +}} + +#endif + //////////////////////////////////////////////////////////////////////// // Error handling @@ -1578,5 +593,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line, cerr.flush(); } else - cv::error( cv::Exception(code, error_string, func, file, line) ); + ::cv::error( ::cv::Exception(code, error_string, func, file, line) ); } diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp new file mode 100644 index 0000000..631d6ea --- /dev/null +++ b/modules/core/src/gpumat_cuda.hpp @@ -0,0 +1,1069 @@ +namespace +{ +#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) + + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); + } + + inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } + } +#endif +} + +namespace +{ + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + + // DeviceInfo routines + virtual int getCudaEnabledDeviceCount() const = 0; + + virtual void setDevice(int) const = 0; + virtual int getDevice() const = 0; + + virtual void resetDevice() const = 0; + + virtual bool deviceSupports(FeatureSet) const = 0; + + virtual bool builtWith(FeatureSet) const = 0; + virtual bool has(int, int) const = 0; + virtual bool hasPtx(int, int) const = 0; + virtual bool hasBin(int, int) const = 0; + virtual bool hasEqualOrLessPtx(int, int) const = 0; + virtual bool hasEqualOrGreater(int, int) const = 0; + virtual bool hasEqualOrGreaterPtx(int, int) const = 0; + virtual bool hasEqualOrGreaterBin(int, int) const = 0; + + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() const = 0; + + virtual void printCudaDeviceInfo(int) const = 0; + virtual void printShortCudaDeviceInfo(int) const = 0; + + // GpuMat routines + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + + // gpu::device::convertTo funcs + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + + // for gpu::device::setTo funcs + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual 
void free(void* devPtr) const = 0; + }; +} + +#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) +namespace +{ + class EmptyFuncTable : public GpuFuncTable + { + public: + + // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } + + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } + + void resetDevice() const { throw_nogpu; } + + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } + + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() const { throw_nogpu; } + + void printCudaDeviceInfo(int) const { throw_nogpu; } + void printShortCudaDeviceInfo(int) const { throw_nogpu; } + + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} + }; +} + +#else + +namespace cv { namespace gpu { namespace device +{ + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} + +namespace +{ + template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + } + + template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + } +} + +namespace +{ + template struct NPPTypeTraits; + template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; + template<> struct 
NPPTypeTraits { typedef Npp32s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; + template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + + ////////////////////////////////////////////////////////////////////////// + // Convert + + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); + }; + + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // Set + + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template<> struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void 
call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // CopyMasked + + template struct NppCopyMaskedFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppCopyMasked + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template static inline bool isAligned(const T* ptr, size_t size) + { + return reinterpret_cast(ptr) % size == 0; + } +} + + namespace cv { namespace gpu { namespace devices + { + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); + } + + void convertTo(const GpuMat& src, GpuMat& dst) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, stream); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } + }} + +namespace +{ + class CudaFuncTable : public GpuFuncTable + { + protected: + + class CudaArch + { + public: + 
            CudaArch();
+
+            bool builtWith(FeatureSet feature_set) const;
+            bool hasPtx(int major, int minor) const;
+            bool hasBin(int major, int minor) const;
+            bool hasEqualOrLessPtx(int major, int minor) const;
+            bool hasEqualOrGreaterPtx(int major, int minor) const;
+            bool hasEqualOrGreaterBin(int major, int minor) const;
+
+        private:
+            static void fromStr(const string& set_as_str, vector<int>& arr);
+
+            vector<int> bin;
+            vector<int> ptx;
+            vector<int> features;
+        };
+
+        const CudaArch cudaArch;
+
+        CudaArch::CudaArch()
+        {
+            fromStr(CUDA_ARCH_BIN, bin);
+            fromStr(CUDA_ARCH_PTX, ptx);
+            fromStr(CUDA_ARCH_FEATURES, features);
+        }
+
+        bool CudaArch::builtWith(FeatureSet feature_set) const
+        {
+            return !features.empty() && (features.back() >= feature_set);
+        }
+
+        bool CudaArch::hasPtx(int major, int minor) const
+        {
+            return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+        }
+
+        bool CudaArch::hasBin(int major, int minor) const
+        {
+            return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+        }
+
+        bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+        {
+            return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+        }
+
+        bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+        {
+            return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+        }
+
+        bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+        {
+            return !bin.empty() && (bin.back() >= major * 10 + minor);
+        }
+
+        void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
+        {
+            if (set_as_str.find_first_not_of(" ") == string::npos)
+                return;
+
+            istringstream stream(set_as_str);
+            int cur_value;
+
+            while (!stream.eof())
+            {
+                stream >> cur_value;
+                arr.push_back(cur_value);
+            }
+
+            sort(arr.begin(), arr.end());
+        }
+
+        class DeviceProps
+        {
+        public:
+            DeviceProps();
+            ~DeviceProps();
+
+            cudaDeviceProp* get(int devID);
+
+        private:
+            std::vector<cudaDeviceProp*> props_;
+        };
+
+        DeviceProps::DeviceProps()
+        {
+            props_.resize(10, 0);
+        }
+
+        DeviceProps::~DeviceProps()
+        {
+            for (size_t i = 0; i < props_.size(); ++i)
+            {
+                if (props_[i])
+                    delete props_[i];
+            }
+            props_.clear();
+        }
+
+        cudaDeviceProp* DeviceProps::get(int devID)
+        {
+            if (devID >= (int) props_.size())
+                props_.resize(devID + 5, 0);
+
+            if (!props_[devID])
+            {
+                props_[devID] = new cudaDeviceProp;
+                cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
+            }
+
+            return props_[devID];
+        }
+
+        DeviceProps deviceProps;
+
+        int convertSMVer2Cores(int major, int minor)
+        {
+            // Determine the number of CUDA cores per SM from the SM version.
+            typedef struct {
+                int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
+                int Cores;
+            } SMtoCores;
+
+            SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } };
+
+            int index = 0;
+            while (gpuArchCoresPerSM[index].SM != -1)
+            {
+                if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
+                    return gpuArchCoresPerSM[index].Cores;
+                index++;
+            }
+
+            return -1;
+        }
+
+    public:
+
+        int getCudaEnabledDeviceCount() const
+        {
+            int count;
+            cudaError_t error = cudaGetDeviceCount( &count );
+
+            if (error == cudaErrorInsufficientDriver)
+                return -1;
+
+            if (error == cudaErrorNoDevice)
+                return 0;
+
+            cudaSafeCall( error );
+            return count;
+        }
+
+        void setDevice(int device) const
+        {
+            cudaSafeCall( cudaSetDevice( device ) );
+        }
+
+        int getDevice() const
+        {
+            int device;
+            cudaSafeCall( cudaGetDevice( &device ) );
+            return device;
+        }
+
+        void resetDevice() const
+ { + cudaSafeCall( cudaDeviceReset() ); + } + + bool TargetArchs::builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool TargetArchs::has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool TargetArchs::hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool TargetArchs::hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } + + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } + + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion() * 10 + minorVersion(); + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) + return true; + + // Check BIN compatibility + for (int i = minorVersion(); i >= 0; --i) + if (TargetArchs::hasBin(majorVersion(), i)) + return true; + + return false; + } + + void query() const + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, 
cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cv::gpu::device::convertTo(src, dst); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); + + func(src, dst); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if 
(!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s); + static const func_t funcs[7][4] = + { + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s); + } + else + { + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s, mask); + } + } + + void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const + { + cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); + } + + void free(void* devPtr) const + { + cudaFree(devPtr); + } + }; +} +#endif \ No newline at end of file -- 2.7.4
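The core idea of this patch is easiest to see in isolation: every CUDA-touching operation goes through a table of virtual functions (GpuFuncTable), core only ever holds a pointer to that table, and the CUDA-backed implementation (CudaFuncTable, defined in gpumat_cuda.hpp) can therefore be compiled into the separate opencv_core_cuda library. Below is a minimal sketch of that indirection with only two entries; StubFuncTable, CudaTable and funcTable() are illustrative names, the selection here is a plain compile-time #ifdef rather than whatever run-time mechanism the full patch uses, and the stub's error message is likewise illustrative.

// Sketch only: reduced function-table indirection in the style of this patch.
#include <cstddef>
#include <stdexcept>

struct GpuFuncTable
{
    virtual ~GpuFuncTable() {}
    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
    virtual void free(void* devPtr) const = 0;
};

// No-CUDA build: every entry fails with a clear message.
struct StubFuncTable : GpuFuncTable
{
    void mallocPitch(void**, size_t*, size_t, size_t) const
    { throw std::runtime_error("compiled without CUDA support"); }
    void free(void*) const {}
};

#ifdef HAVE_CUDA
#include <cuda_runtime.h>

// CUDA build: thin forwarding to the runtime, as CudaFuncTable does.
struct CudaTable : GpuFuncTable
{
    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
    { cudaMallocPitch(devPtr, step, width, height); }
    void free(void* devPtr) const
    { cudaFree(devPtr); }
};
#endif

// Callers never know which implementation they get.
const GpuFuncTable* funcTable()
{
#ifdef HAVE_CUDA
    static CudaTable table;
#else
    static StubFuncTable table;
#endif
    return &table;
}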
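The NppCvt/NppSet/NppSetMask/NppCopyMasked wrappers above all follow one pattern: a traits struct maps an OpenCV depth constant to the NPP element type, a small struct takes the concrete NPP routine as a non-type template parameter and exposes a uniform call(), and a [depth][channels-1] table of plain function pointers picks the right instantiation at run time. The self-contained sketch below reproduces that pattern without NPP; DEPTH_8U/DEPTH_32F, TypeTraits, SetFunc, Set, vendor_set_* and set_caller are stand-ins for CV_8U/CV_32F, NPPTypeTraits, NppSetFunc, NppSet, the nppiSet_* functions and the funcs[][] tables, and are not from the patch.

#include <cstdio>

// Stand-ins for the OpenCV depth constants (CV_8U == 0, CV_32F == 5).
enum { DEPTH_8U = 0, DEPTH_32F = 5, DEPTH_COUNT = 7 };

template <int DEPTH> struct TypeTraits;                        // depth constant -> element type
template <> struct TypeTraits<DEPTH_8U>  { typedef unsigned char elem_t; };
template <> struct TypeTraits<DEPTH_32F> { typedef float         elem_t; };

// Hypothetical vendor routines with depth-specific signatures, playing the
// role of the NPP primitives.
int vendor_set_8u (unsigned char v, unsigned char* p, int n) { while (n--) *p++ = v; return 0; }
int vendor_set_32f(float v, float* p, int n)                 { while (n--) *p++ = v; return 0; }

// Like NppSetFunc: the vendor signature for a given depth.
template <int DEPTH> struct SetFunc
{
    typedef typename TypeTraits<DEPTH>::elem_t elem_t;
    typedef int (*func_ptr)(elem_t, elem_t*, int);
};

// Like NppSet: bind one concrete routine at compile time, expose a uniform
// call() so the instantiation can sit in a run-time dispatch table.
template <int DEPTH, typename SetFunc<DEPTH>::func_ptr func> struct Set
{
    static void call(void* data, double value, int n)
    {
        typedef typename TypeTraits<DEPTH>::elem_t elem_t;
        func(static_cast<elem_t>(value), static_cast<elem_t*>(data), n);
    }
};

// Run-time dispatch on depth, as funcs[src.depth()][src.channels() - 1] does above.
typedef void (*set_caller)(void*, double, int);

int main()
{
    static const set_caller callers[DEPTH_COUNT] =
    {
        Set<DEPTH_8U, vendor_set_8u>::call, 0, 0, 0, 0,
        Set<DEPTH_32F, vendor_set_32f>::call, 0
    };

    float buf[4] = {0};
    callers[DEPTH_32F](buf, 1.5, 4);   // selects the 32F instantiation
    std::printf("%g %g\n", buf[0], buf[3]);
    return 0;
}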
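convert() only takes the NPP entries of its dispatch table when both buffers are 16-byte aligned; otherwise it falls back to the generic convertTo kernel path. The helper below restores the stripped template syntax of the isAligned helper visible in the hunk (the original casts the pointer to size_t; keeping that here), with its typical use shown as a comment.

#include <cstddef>

// NPP-backed table entries are only taken when src.data and dst.data are
// both 16-byte aligned; otherwise the elementwise kernel path is used.
template <typename T>
static inline bool isAligned(const T* ptr, size_t size)
{
    return reinterpret_cast<size_t>(ptr) % size == 0;
}

// Typical use, mirroring convert():
//   bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
//   if (!aligned) { /* fall back to the generic convertTo kernel */ }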
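setTo() with an empty mask avoids launching a kernel in two common cases: an all-zero scalar (any depth) and an 8-bit matrix where every channel receives the same value; both reduce to a single cudaMemset2D over cols * elemSize() bytes per row. The sketch below isolates just that predicate; MatDesc and memsetFillPossible are hypothetical stand-ins for the few GpuMat fields the fast path reads, and the patch saturates the fill value to uchar where this sketch uses a plain cast.

#include <cstddef>

// Hypothetical stand-in for the GpuMat fields the fast path inspects.
struct MatDesc
{
    int depth;      // 0 == CV_8U in OpenCV's depth ordering
    int channels;
};

// True when the fill can be done with one cudaMemset2D; on success, value
// holds the byte pattern to pass to the memset.
bool memsetFillPossible(const MatDesc& m, const double s[4], int& value)
{
    // Case 1: an all-zero scalar works for every depth, because an all-zero
    // byte pattern is 0 for integer types and +0.0 for float/double.
    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
    {
        value = 0;
        return true;
    }

    // Case 2: 8-bit data where every written channel carries the same value.
    if (m.depth != 0)
        return false;

    for (int c = 1; c < m.channels; ++c)
        if (s[c] != s[0])
            return false;

    value = static_cast<int>(s[0]);
    return true;
}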