From 64c94cb22c382aa3b9377d6d94648b91159a8744 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 11:18:04 +0400 Subject: [PATCH] CUDA related func tables refactored to remove unneeded dependencies. --- modules/core/src/gpumat.cpp | 30 +-- modules/core/src/gpumat_cuda.hpp | 384 +++++++++++++++++++-------------------- 2 files changed, 204 insertions(+), 210 deletions(-) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index c8d1d05..03dcad2 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -239,23 +239,23 @@ static DeviceInfoFuncTable* deviceInfoFuncTable() //////////////////////////////// Initialization & Info //////////////////////// -int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } +int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); } -void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); } -int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); } +void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); } -void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); } +void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); } -bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); } +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); } -bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); } -bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); } -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); } -bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return deviceInfoFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); } size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } @@ -270,8 +270,8 @@ std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->na int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } -void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } -void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } +void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } #ifdef HAVE_CUDA diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 83172d5..9281655 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -4,6 +4,7 @@ class DeviceInfoFuncTable { public: + // cv::DeviceInfo virtual size_t sharedMemPerBlock() const = 0; virtual void queryMemory(size_t&, size_t&) const = 0; virtual size_t freeMemory() const = 0; @@ -16,25 +17,13 @@ virtual int majorVersion() const = 0; virtual int minorVersion() const = 0; virtual int multiProcessorCount() const = 0; - virtual ~DeviceInfoFuncTable() {}; - }; - - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - // DeviceInfo routines virtual int getCudaEnabledDeviceCount() const = 0; - virtual void setDevice(int) const = 0; virtual int getDevice() const = 0; - virtual void resetDevice() const = 0; - virtual bool deviceSupports(FeatureSet) const = 0; - // TargetArchs + // cv::TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -46,7 +35,15 @@ virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; - + + virtual ~DeviceInfoFuncTable() {}; + }; + + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + // GpuMat routines virtual void copy(const Mat& src, GpuMat& dst) const = 0; virtual void copy(const GpuMat& src, Mat& dst) const = 0; @@ -60,7 +57,7 @@ // for gpu::device::setTo funcs virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; - + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; @@ -80,20 +77,14 @@ int majorVersion() const { throw_nogpu; return -1; } int minorVersion() const { throw_nogpu; return -1; } int multiProcessorCount() const { throw_nogpu; return -1; } - }; - - class EmptyFuncTable : public GpuFuncTable - { - public: - - // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } - + void setDevice(int) const { throw_nogpu; } int getDevice() const { throw_nogpu; return 0; } - + void resetDevice() const { throw_nogpu; } - + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } bool builtWith(FeatureSet) const { throw_nogpu; return false; } @@ -104,10 +95,15 @@ bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } - + }; + + class EmptyFuncTable : public GpuFuncTable + { + public: + void copy(const Mat&, GpuMat&) const { throw_nogpu; } void copy(const GpuMat&, Mat&) const { throw_nogpu; } void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } @@ -185,62 +181,62 @@ namespace cv { namespace gpu { namespace device { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); }; template struct NppConvertFunc { typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // Set - + template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc @@ -251,172 +247,172 @@ namespace cv { namespace gpu { namespace device { typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); }; - + template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // CopyMasked - + template struct NppCopyMaskedFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppCopyMasked { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template static inline bool isAligned(const T* ptr, size_t size) { return reinterpret_cast(ptr) % size == 0; } - + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { CV_Assert(src.size() == dst.size() && src.type() == dst.type()); CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - + void convertTo(const GpuMat& src, GpuMat& dst) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, stream); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, mask, stream); } - + void setTo(GpuMat& src, Scalar s) { setTo(src, s, 0); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { setTo(src, s, mask, 0); @@ -433,56 +429,56 @@ namespace cv { namespace gpu { namespace device fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } - + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - - + + private: void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; - + istringstream stream(set_as_str); int cur_value; - + while (!stream.eof()) { stream >> cur_value; arr.push_back(cur_value); } - + sort(arr.begin(), arr.end()); } - + vector bin; vector ptx; vector features; @@ -495,7 +491,7 @@ namespace cv { namespace gpu { namespace device { props_.resize(10, 0); } - + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) @@ -505,18 +501,18 @@ namespace cv { namespace gpu { namespace device } props_.clear(); } - + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); - + if (!props_[devID]) { props_[devID] = new cudaDeviceProp; cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - + return props_[devID]; } private: @@ -524,7 +520,7 @@ namespace cv { namespace gpu { namespace device }; DeviceProps deviceProps; - + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable { public: @@ -532,57 +528,57 @@ namespace cv { namespace gpu { namespace device { return deviceProps.get(device_id_)->sharedMemPerBlock; } - + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const { int prevDeviceID = getDevice(); if (prevDeviceID != device_id_) setDevice(device_id_); - + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - + if (prevDeviceID != device_id_) setDevice(prevDeviceID); } - + size_t freeMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _freeMemory; } - + size_t totalMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _totalMemory; } - + bool supports(FeatureSet feature_set) const { int version = majorVersion_ * 10 + minorVersion_; return version >= feature_set; } - + bool isCompatible() const { // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) return true; - + // Check BIN compatibility for (int i = minorVersion_; i >= 0; --i) - if (TargetArchs::hasBin(majorVersion_, i)) + if (hasBin(majorVersion_, i)) return true; - + return false; } - + void query() { const cudaDeviceProp* prop = deviceProps.get(device_id_); - + name_ = prop->name; multi_processor_count_ = prop->multiProcessorCount; majorVersion_ = prop->major; @@ -614,116 +610,78 @@ namespace cv { namespace gpu { namespace device return multi_processor_count_; } - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - }; - - class CudaFuncTable : public GpuFuncTable - { - protected: - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } - - public: - int getCudaEnabledDeviceCount() const { int count; cudaError_t error = cudaGetDeviceCount( &count ); - + if (error == cudaErrorInsufficientDriver) return -1; - + if (error == cudaErrorNoDevice) return 0; - + cudaSafeCall( error ); return count; } - + void setDevice(int device) const { cudaSafeCall( cudaSetDevice( device ) ); } - + int getDevice() const { int device; cudaSafeCall( cudaGetDevice( &device ) ); return device; } - + void resetDevice() const { cudaSafeCall( cudaDeviceReset() ); } - + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } - + bool deviceSupports(FeatureSet feature_set) const { static int versions[] = @@ -731,11 +689,11 @@ namespace cv { namespace gpu { namespace device -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - + const int devId = getDevice(); - + int version; - + if (devId < cache_size && versions[devId] >= 0) version = versions[devId]; else @@ -745,25 +703,25 @@ namespace cv { namespace gpu { namespace device if (devId < cache_size) versions[devId] = version; } - + return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? device+1 : count; - + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); printf("Device count: %d\n", count); - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + const char *computeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", @@ -772,30 +730,30 @@ namespace cv { namespace gpu { namespace device "Unknown", NULL }; - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + printf("\nDevice %d: \"%s\"\n", dev, prop.name); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); @@ -805,12 +763,12 @@ namespace cv { namespace gpu { namespace device printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); @@ -820,7 +778,7 @@ namespace cv { namespace gpu { namespace device printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } - + printf("\n"); printf("deviceQuery, CUDA Driver = CUDART"); printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); @@ -828,37 +786,73 @@ namespace cv { namespace gpu { namespace device printf(", NumDevs = %d\n\n", count); fflush(stdout); } - + void printShortCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? device+1 : count; - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(", %d cores", cores * prop.multiProcessorCount); - + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); } - + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } + }; + + class CudaFuncTable : public GpuFuncTable + { + public: + void copy(const Mat& src, GpuMat& dst) const { cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); -- 2.7.4