From 64c94cb22c382aa3b9377d6d94648b91159a8744 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@itseez.com>
Date: Thu, 19 Dec 2013 11:18:04 +0400
Subject: [PATCH] CUDA related func tables refactored to remove unneeded
 dependencies.

---
 modules/core/src/gpumat.cpp      |  30 +--
 modules/core/src/gpumat_cuda.hpp | 384 +++++++++++++++++++--------------------
 2 files changed, 204 insertions(+), 210 deletions(-)

diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index c8d1d05..03dcad2 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -239,23 +239,23 @@ static DeviceInfoFuncTable* deviceInfoFuncTable()
 
 //////////////////////////////// Initialization & Info ////////////////////////
 
-int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
+int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); }
 
-void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); }
-int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); }
+void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); }
+int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); }
 
-void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); }
+void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); }
 
-bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); }
+bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); }
 
-bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); }
-bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); }
-bool cv::gpu::TargetArchs::hasPtx(int major, int minor) {  return gpuFuncTable()->hasPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor);  }
-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); }
+bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); }
+bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); }
+bool cv::gpu::TargetArchs::hasPtx(int major, int minor) {  return deviceInfoFuncTable()->hasPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor);  }
+bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); }
 
 size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); }
 void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); }
@@ -270,8 +270,8 @@ std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->na
 int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); }
 void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }
 
-void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); }
-void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); }
+void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); }
+void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); }
 
 #ifdef HAVE_CUDA
 
diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp
index 83172d5..9281655 100644
--- a/modules/core/src/gpumat_cuda.hpp
+++ b/modules/core/src/gpumat_cuda.hpp
@@ -4,6 +4,7 @@
     class DeviceInfoFuncTable
     {
     public:
+        // cv::DeviceInfo
         virtual size_t sharedMemPerBlock() const = 0;
         virtual void queryMemory(size_t&, size_t&) const = 0;
         virtual size_t freeMemory() const = 0;
@@ -16,25 +17,13 @@
         virtual int majorVersion() const = 0;
         virtual int minorVersion() const = 0;
         virtual int multiProcessorCount() const = 0;
-        virtual ~DeviceInfoFuncTable() {};
-    };
-    
-    class GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
-
-        // DeviceInfo routines
         virtual int getCudaEnabledDeviceCount() const = 0;
-
         virtual void setDevice(int) const = 0;
         virtual int getDevice() const = 0;
-
         virtual void resetDevice() const  = 0;
-
         virtual bool deviceSupports(FeatureSet) const = 0;
 
-        // TargetArchs
+        // cv::TargetArchs
         virtual bool builtWith(FeatureSet) const = 0;
         virtual bool has(int, int) const = 0;
         virtual bool hasPtx(int, int) const = 0;
@@ -46,7 +35,15 @@
 
         virtual void printCudaDeviceInfo(int) const = 0;
         virtual void printShortCudaDeviceInfo(int) const = 0;
-        
+
+        virtual ~DeviceInfoFuncTable() {};
+    };
+
+    class GpuFuncTable
+    {
+    public:
+        virtual ~GpuFuncTable() {}
+
         // GpuMat routines
         virtual void copy(const Mat& src, GpuMat& dst) const = 0;
         virtual void copy(const GpuMat& src, Mat& dst) const = 0;
@@ -60,7 +57,7 @@
 
         // for gpu::device::setTo funcs
         virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
-        
+
         virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
         virtual void free(void* devPtr) const = 0;
     };
@@ -80,20 +77,14 @@
         int majorVersion() const { throw_nogpu; return -1; }
         int minorVersion() const { throw_nogpu; return -1; }
         int multiProcessorCount() const { throw_nogpu; return -1; }
-    };
-    
-    class EmptyFuncTable : public GpuFuncTable
-    {
-    public:
-        
-        // DeviceInfo routines
+
         int getCudaEnabledDeviceCount() const { return 0; }
-        
+
         void setDevice(int) const { throw_nogpu; }
         int getDevice() const { throw_nogpu; return 0; }
-        
+
         void resetDevice() const { throw_nogpu; }
-        
+
         bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
 
         bool builtWith(FeatureSet) const { throw_nogpu; return false; }
@@ -104,10 +95,15 @@
         bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
         bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
         bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
-                
+
         void printCudaDeviceInfo(int) const { throw_nogpu; }
         void printShortCudaDeviceInfo(int) const { throw_nogpu; }
-        
+    };
+
+    class EmptyFuncTable : public GpuFuncTable
+    {
+    public:
+
         void copy(const Mat&, GpuMat&) const { throw_nogpu; }
         void copy(const GpuMat&, Mat&) const { throw_nogpu; }
         void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
@@ -185,62 +181,62 @@ namespace cv { namespace gpu { namespace device
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
         typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        
+
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
     };
     template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
     {
         typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        
+
         typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
     };
-    
+
     template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
         typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        
+
         static void call(const GpuMat& src, GpuMat& dst)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
-    
+
     template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
     {
         typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        
+
         static void call(const GpuMat& src, GpuMat& dst)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
-    
+
     //////////////////////////////////////////////////////////////////////////
     // Set
-    
+
     template<int SDEPTH, int SCN> struct NppSetFunc
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
     };
     template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
     };
     template<int SCN> struct NppSetFunc<CV_8S, SCN>
@@ -251,172 +247,172 @@ namespace cv { namespace gpu { namespace device
     {
         typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
     };
-    
+
     template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         static void call(GpuMat& src, Scalar s)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             Scalar_<src_t> nppS = s;
-            
+
             nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
     template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         static void call(GpuMat& src, Scalar s)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             Scalar_<src_t> nppS = s;
-            
+
             nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
-    
+
     template<int SDEPTH, int SCN> struct NppSetMaskFunc
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
     };
     template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
     };
-    
+
     template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         static void call(GpuMat& src, Scalar s, const GpuMat& mask)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             Scalar_<src_t> nppS = s;
-            
+
             nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
     template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         static void call(GpuMat& src, Scalar s, const GpuMat& mask)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             Scalar_<src_t> nppS = s;
-            
+
             nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
-    
+
     //////////////////////////////////////////////////////////////////////////
     // CopyMasked
-    
+
     template<int SDEPTH> struct NppCopyMaskedFunc
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
     };
-    
+
     template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
     {
         typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        
+
         static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
-            
+
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            
+
             cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
-    
+
     template <typename T> static inline bool isAligned(const T* ptr, size_t size)
     {
         return reinterpret_cast<size_t>(ptr) % size == 0;
     }
-     
+
     namespace cv { namespace gpu { namespace device
     {
         void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
         {
             CV_Assert(src.size() == dst.size() && src.type() == dst.type());
             CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
-            
+
             cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
         }
-        
+
         void convertTo(const GpuMat& src, GpuMat& dst)
         {
             cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
         }
-        
+
         void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
         {
             cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
         }
-        
+
         void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
         {
             typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-            
+
             static const caller_t callers[] =
             {
                 kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
                 kernelSetCaller<float>, kernelSetCaller<double>
             };
-            
+
             callers[src.depth()](src, s, stream);
         }
-        
+
         void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
         {
             typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-            
+
             static const caller_t callers[] =
             {
                 kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
                 kernelSetCaller<float>, kernelSetCaller<double>
             };
-            
+
             callers[src.depth()](src, s, mask, stream);
         }
-        
+
         void setTo(GpuMat& src, Scalar s)
         {
             setTo(src, s, 0);
         }
-        
+
         void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
         {
             setTo(src, s, mask, 0);
@@ -433,56 +429,56 @@ namespace cv { namespace gpu { namespace device
             fromStr(CUDA_ARCH_PTX, ptx);
             fromStr(CUDA_ARCH_FEATURES, features);
         }
-        
+
         bool builtWith(FeatureSet feature_set) const
         {
             return !features.empty() && (features.back() >= feature_set);
         }
-        
+
         bool hasPtx(int major, int minor) const
         {
             return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
         }
-        
+
         bool hasBin(int major, int minor) const
         {
             return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
         }
-        
+
         bool hasEqualOrLessPtx(int major, int minor) const
         {
             return !ptx.empty() && (ptx.front() <= major * 10 + minor);
         }
-        
+
         bool hasEqualOrGreaterPtx(int major, int minor) const
         {
             return !ptx.empty() && (ptx.back() >= major * 10 + minor);
         }
-        
+
         bool hasEqualOrGreaterBin(int major, int minor) const
         {
             return !bin.empty() && (bin.back() >= major * 10 + minor);
         }
-        
-        
+
+
     private:
         void fromStr(const string& set_as_str, vector<int>& arr)
         {
             if (set_as_str.find_first_not_of(" ") == string::npos)
                 return;
-            
+
             istringstream stream(set_as_str);
             int cur_value;
-            
+
             while (!stream.eof())
             {
                 stream >> cur_value;
                 arr.push_back(cur_value);
             }
-            
+
             sort(arr.begin(), arr.end());
         }
-        
+
         vector<int> bin;
         vector<int> ptx;
         vector<int> features;
@@ -495,7 +491,7 @@ namespace cv { namespace gpu { namespace device
         {
             props_.resize(10, 0);
         }
-        
+
         ~DeviceProps()
         {
             for (size_t i = 0; i < props_.size(); ++i)
@@ -505,18 +501,18 @@ namespace cv { namespace gpu { namespace device
             }
             props_.clear();
         }
-        
+
         cudaDeviceProp* get(int devID)
         {
             if (devID >= (int) props_.size())
                 props_.resize(devID + 5, 0);
-            
+
             if (!props_[devID])
             {
                 props_[devID] = new cudaDeviceProp;
                 cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
             }
-            
+
             return props_[devID];
         }
     private:
@@ -524,7 +520,7 @@ namespace cv { namespace gpu { namespace device
     };
 
     DeviceProps deviceProps;
-    
+
     class CudaDeviceInfoFuncTable: DeviceInfoFuncTable
     {
     public:
@@ -532,57 +528,57 @@ namespace cv { namespace gpu { namespace device
         {
             return deviceProps.get(device_id_)->sharedMemPerBlock;
         }
-        
+
         void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
         {
             int prevDeviceID = getDevice();
             if (prevDeviceID != device_id_)
                 setDevice(device_id_);
-            
+
             cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
-            
+
             if (prevDeviceID != device_id_)
                 setDevice(prevDeviceID);
         }
-        
+
         size_t freeMemory() const
         {
             size_t _totalMemory, _freeMemory;
             queryMemory(_totalMemory, _freeMemory);
             return _freeMemory;
         }
-        
+
         size_t totalMemory() const
         {
             size_t _totalMemory, _freeMemory;
             queryMemory(_totalMemory, _freeMemory);
             return _totalMemory;
         }
-        
+
         bool supports(FeatureSet feature_set) const
         {
             int version = majorVersion_ * 10 + minorVersion_;
             return version >= feature_set;
         }
-        
+
         bool isCompatible() const
         {
             // Check PTX compatibility
-            if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_))
+            if (hasEqualOrLessPtx(majorVersion_, minorVersion_))
                 return true;
-            
+
             // Check BIN compatibility
                 for (int i = minorVersion_; i >= 0; --i)
-                    if (TargetArchs::hasBin(majorVersion_, i))
+                    if (hasBin(majorVersion_, i))
                         return true;
-                    
+
                     return false;
         }
-        
+
         void query()
         {
             const cudaDeviceProp* prop = deviceProps.get(device_id_);
-            
+
             name_ = prop->name;
             multi_processor_count_ = prop->multiProcessorCount;
             majorVersion_ = prop->major;
@@ -614,116 +610,78 @@ namespace cv { namespace gpu { namespace device
             return multi_processor_count_;
         }
 
-    private:
-        int device_id_;
-        
-        std::string name_;
-        int multi_processor_count_;
-        int majorVersion_;
-        int minorVersion_;
-    };
-    
-    class CudaFuncTable : public GpuFuncTable
-    {
-    protected:
-              
-        const CudaArch cudaArch;
-
-        int convertSMVer2Cores(int major, int minor) const
-        {
-            // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-            typedef struct {
-                int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-                int Cores;
-            } SMtoCores;
-            
-            SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
-            
-            int index = 0;
-            while (gpuArchCoresPerSM[index].SM != -1)
-            {
-                if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
-                    return gpuArchCoresPerSM[index].Cores;
-                index++;
-            }
-            
-            return -1;
-        }
-        
-    public:
-
         int getCudaEnabledDeviceCount() const
         {
             int count;
             cudaError_t error = cudaGetDeviceCount( &count );
-            
+
             if (error == cudaErrorInsufficientDriver)
                 return -1;
-            
+
             if (error == cudaErrorNoDevice)
                 return 0;
-            
+
             cudaSafeCall( error );
             return count;
         }
-        
+
         void setDevice(int device) const
         {
             cudaSafeCall( cudaSetDevice( device ) );
         }
-        
+
         int getDevice() const
         {
             int device;
             cudaSafeCall( cudaGetDevice( &device ) );
             return device;
         }
-        
+
         void resetDevice() const
         {
             cudaSafeCall( cudaDeviceReset() );
         }
-        
+
         bool builtWith(FeatureSet feature_set) const
         {
             return cudaArch.builtWith(feature_set);
         }
-        
+
         bool has(int major, int minor) const
         {
             return hasPtx(major, minor) || hasBin(major, minor);
         }
-        
+
         bool hasPtx(int major, int minor) const
         {
             return cudaArch.hasPtx(major, minor);
         }
-        
+
         bool hasBin(int major, int minor) const
         {
             return cudaArch.hasBin(major, minor);
         }
-        
+
         bool hasEqualOrLessPtx(int major, int minor) const
         {
             return cudaArch.hasEqualOrLessPtx(major, minor);
         }
-        
+
         bool hasEqualOrGreater(int major, int minor) const
         {
             return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
         }
-        
+
         bool hasEqualOrGreaterPtx(int major, int minor) const
         {
             return cudaArch.hasEqualOrGreaterPtx(major, minor);
         }
-        
+
         bool hasEqualOrGreaterBin(int major, int minor) const
         {
             return cudaArch.hasEqualOrGreaterBin(major, minor);
         }
-        
+
         bool deviceSupports(FeatureSet feature_set) const
         {
             static int versions[] =
@@ -731,11 +689,11 @@ namespace cv { namespace gpu { namespace device
                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
             };
             static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
-            
+
             const int devId = getDevice();
-            
+
             int version;
-            
+
             if (devId < cache_size && versions[devId] >= 0)
                 version = versions[devId];
             else
@@ -745,25 +703,25 @@ namespace cv { namespace gpu { namespace device
                 if (devId < cache_size)
                     versions[devId] = version;
             }
-            
+
             return TargetArchs::builtWith(feature_set) && (version >= feature_set);
         }
-                        
+
         void printCudaDeviceInfo(int device) const
         {
             int count = getCudaEnabledDeviceCount();
             bool valid = (device >= 0) && (device < count);
-            
+
             int beg = valid ? device   : 0;
             int end = valid ? device+1 : count;
-            
+
             printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
             printf("Device count: %d\n", count);
-            
+
             int driverVersion = 0, runtimeVersion = 0;
             cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
             cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-            
+
             const char *computeMode[] = {
                 "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
                 "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
@@ -772,30 +730,30 @@ namespace cv { namespace gpu { namespace device
                 "Unknown",
                 NULL
             };
-            
+
             for(int dev = beg; dev < end; ++dev)
             {
                 cudaDeviceProp prop;
                 cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-                
+
                 printf("\nDevice %d: \"%s\"\n", dev, prop.name);
                 printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
                 printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
                 printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
-                
+
                 int cores = convertSMVer2Cores(prop.major, prop.minor);
                 if (cores > 0)
                     printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
-                
+
                 printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
-                
+
                 printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
-                prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
-                prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
+                       prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
+                       prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
                 printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
-                prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
-                prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
-                
+                       prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
+                       prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
+
                 printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
                 printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
                 printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
@@ -805,12 +763,12 @@ namespace cv { namespace gpu { namespace device
                 printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
                 printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
                 printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);
-                
+
                 printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
                 printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
                 printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
                 printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
-                
+
                 printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
                 printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
                 printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
@@ -820,7 +778,7 @@ namespace cv { namespace gpu { namespace device
                 printf("  Compute Mode:\n");
                 printf("      %s \n", computeMode[prop.computeMode]);
             }
-            
+
             printf("\n");
             printf("deviceQuery, CUDA Driver = CUDART");
             printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
@@ -828,37 +786,73 @@ namespace cv { namespace gpu { namespace device
             printf(", NumDevs = %d\n\n", count);
             fflush(stdout);
         }
-        
+
         void printShortCudaDeviceInfo(int device) const
         {
             int count = getCudaEnabledDeviceCount();
             bool valid = (device >= 0) && (device < count);
-            
+
             int beg = valid ? device   : 0;
             int end = valid ? device+1 : count;
-            
+
             int driverVersion = 0, runtimeVersion = 0;
             cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
             cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-            
+
             for(int dev = beg; dev < end; ++dev)
             {
                 cudaDeviceProp prop;
                 cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-                
+
                 const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
                 printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
                 printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
-                
+
                 int cores = convertSMVer2Cores(prop.major, prop.minor);
                 if (cores > 0)
                     printf(", %d cores", cores * prop.multiProcessorCount);
-                
+
                 printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
             }
             fflush(stdout);
         }
-        
+
+    private:
+        int device_id_;
+
+        std::string name_;
+        int multi_processor_count_;
+        int majorVersion_;
+        int minorVersion_;
+
+        const CudaArch cudaArch;
+
+        int convertSMVer2Cores(int major, int minor) const
+        {
+            // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+            typedef struct {
+                int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+                int Cores;
+            } SMtoCores;
+
+            SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
+
+            int index = 0;
+            while (gpuArchCoresPerSM[index].SM != -1)
+            {
+                if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
+                    return gpuArchCoresPerSM[index].Cores;
+                index++;
+            }
+
+            return -1;
+        }
+    };
+
+    class CudaFuncTable : public GpuFuncTable
+    {
+    public:
+
         void copy(const Mat& src, GpuMat& dst) const
         {
             cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
-- 
2.7.4