modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp

   1 #ifndef __GPUMAT_CUDA_HPP__
   2 #define __GPUMAT_CUDA_HPP__
   3
   4 #ifndef HAVE_CUDA
   5 typedef void* cudaStream_t;
   6 #endif
   7
   8 class DeviceInfoFuncTable
   9 {
  10 public:
  11     // cv::DeviceInfo
  12     virtual size_t sharedMemPerBlock(int id) const = 0;
  13     virtual void queryMemory(int id, size_t&, size_t&) const = 0;
  14     virtual size_t freeMemory(int id) const = 0;
  15     virtual size_t totalMemory(int id) const = 0;
  16     virtual bool supports(int id, FeatureSet) const = 0;
  17     virtual bool isCompatible(int id) const = 0;
  18     virtual std::string name(int id) const = 0;
  19     virtual int majorVersion(int id) const = 0;
  20     virtual int minorVersion(int id) const = 0;
  21     virtual int multiProcessorCount(int id) const = 0;
  22
  23     virtual int getCudaEnabledDeviceCount() const = 0;
  24     virtual void setDevice(int) const = 0;
  25     virtual int getDevice() const = 0;
  26     virtual void resetDevice() const  = 0;
  27     virtual bool deviceSupports(FeatureSet) const = 0;
  28
  29     // cv::TargetArchs
  30     virtual bool builtWith(FeatureSet) const = 0;
  31     virtual bool has(int, int) const = 0;
  32     virtual bool hasPtx(int, int) const = 0;
  33     virtual bool hasBin(int, int) const = 0;
  34     virtual bool hasEqualOrLessPtx(int, int) const = 0;
  35     virtual bool hasEqualOrGreater(int, int) const = 0;
  36     virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
  37     virtual bool hasEqualOrGreaterBin(int, int) const = 0;
  38
  39     virtual void printCudaDeviceInfo(int) const = 0;
  40     virtual void printShortCudaDeviceInfo(int) const = 0;
  41
  42     virtual ~DeviceInfoFuncTable() {};
  43 };
  44
  45 class GpuFuncTable
  46 {
  47 public:
  48     // GpuMat routines
  49     virtual void copy(const Mat& src, GpuMat& dst) const = 0;
  50     virtual void copy(const GpuMat& src, Mat& dst) const = 0;
  51     virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
  52
  53     virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
  54
  55     // gpu::device::convertTo funcs
  56     virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
  57     virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
  58
  59     // for gpu::device::setTo funcs
  60     virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const = 0;
  61
  62     virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
  63     virtual void free(void* devPtr) const = 0;
  64
  65     virtual ~GpuFuncTable() {}
  66 };
  67
  68 class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
  69 {
  70 public:
  71     size_t sharedMemPerBlock(int) const { throw_nogpu; return 0; }
  72     void queryMemory(int, size_t&, size_t&) const { throw_nogpu; }
  73     size_t freeMemory(int) const { throw_nogpu; return 0; }
  74     size_t totalMemory(int) const { throw_nogpu; return 0; }
  75     bool supports(int, FeatureSet) const { throw_nogpu; return false; }
  76     bool isCompatible(int) const { throw_nogpu; return false; }
  77     std::string name(int) const { throw_nogpu; return std::string(); }
  78     int majorVersion(int) const { throw_nogpu; return -1; }
  79     int minorVersion(int) const { throw_nogpu; return -1; }
  80     int multiProcessorCount(int) const { throw_nogpu; return -1; }
  81
  82     int getCudaEnabledDeviceCount() const { return 0; }
  83
  84     void setDevice(int) const { throw_nogpu; }
  85     int getDevice() const { throw_nogpu; return 0; }
  86
  87     void resetDevice() const { throw_nogpu; }
  88
  89     bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
  90
  91     bool builtWith(FeatureSet) const { throw_nogpu; return false; }
  92     bool has(int, int) const { throw_nogpu; return false; }
  93     bool hasPtx(int, int) const { throw_nogpu; return false; }
  94     bool hasBin(int, int) const { throw_nogpu; return false; }
  95     bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
  96     bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
  97     bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
  98     bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
  99
 100     void printCudaDeviceInfo(int) const
 101     {
 102         printf("The library is compiled without CUDA support\n");
 103     }
 104
 105     void printShortCudaDeviceInfo(int) const
 106     {
 107         printf("The library is compiled without CUDA support\n");
 108     }
 109 };
 110
 111 class EmptyFuncTable : public GpuFuncTable
 112 {
 113 public:
 114
 115     void copy(const Mat&, GpuMat&) const { throw_nogpu; }
 116     void copy(const GpuMat&, Mat&) const { throw_nogpu; }
 117     void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
 118
 119     void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
 120
 121     void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
 122     void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
 123
 124     virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; }
 125
 126     void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
 127     void free(void*) const {}
 128 };
 129
 130 #if defined(USE_CUDA)
 131
 132 // Disable NPP for this file
 133 //#define USE_NPP
 134 #undef USE_NPP
 135
 136 #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
 137 inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
 138 {
 139     if (cudaSuccess != err)
 140         cv::gpu::error(cudaGetErrorString(err), file, line, func);
 141 }
 142
 143 #ifdef USE_NPP
 144
 145 #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
 146 inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
 147 {
 148     if (err < 0)
 149     {
 150         std::ostringstream msg;
 151         msg << "NPP API Call Error: " << err;
 152         cv::gpu::error(msg.str().c_str(), file, line, func);
 153     }
 154 }
 155
 156 #endif
 157
 158 namespace cv { namespace gpu { namespace device
 159 {
 160     void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
 161
 162     template <typename T>
 163     void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
 164
 165     template <typename T>
 166     void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
 167
 168     void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
 169 }}}
 170
 171 template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
 172 {
 173     Scalar_<T> sf = s;
 174     cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
 175 }
 176
 177 template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
 178 {
 179     Scalar_<T> sf = s;
 180     cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
 181 }
 182
 183 #ifdef USE_NPP
 184
 185 template<int n> struct NPPTypeTraits;
 186 template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
 187 template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
 188 template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
 189 template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
 190 template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
 191 template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
 192 template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
 193
 194 #endif
 195
 196 //////////////////////////////////////////////////////////////////////////
 197 // Convert
 198
 199 #ifdef USE_NPP
 200
 201 template<int SDEPTH, int DDEPTH> struct NppConvertFunc
 202 {
 203     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 204     typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
 205
 206     typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
 207 };
 208 template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
 209 {
 210     typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
 211
 212     typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
 213 };
 214
 215 template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
 216 {
 217     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 218     typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
 219
 220     static void call(const GpuMat& src, GpuMat& dst)
 221     {
 222         NppiSize sz;
 223         sz.width = src.cols;
 224         sz.height = src.rows;
 225
 226         nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
 227
 228         cudaSafeCall( cudaDeviceSynchronize() );
 229     }
 230 };
 231
 232 template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
 233 {
 234     typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
 235
 236     static void call(const GpuMat& src, GpuMat& dst)
 237     {
 238         NppiSize sz;
 239         sz.width = src.cols;
 240         sz.height = src.rows;
 241
 242         nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
 243
 244         cudaSafeCall( cudaDeviceSynchronize() );
 245     }
 246 };
 247
 248 #endif
 249
 250 //////////////////////////////////////////////////////////////////////////
 251 // Set
 252
 253 #ifdef USE_NPP
 254
 255 template<int SDEPTH, int SCN> struct NppSetFunc
 256 {
 257     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 258
 259     typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
 260 };
 261 template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
 262 {
 263     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 264
 265     typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
 266 };
 267 template<int SCN> struct NppSetFunc<CV_8S, SCN>
 268 {
 269     typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
 270 };
 271 template<> struct NppSetFunc<CV_8S, 1>
 272 {
 273     typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
 274 };
 275
 276 template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
 277 {
 278     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 279
 280     static void call(GpuMat& src, Scalar s)
 281     {
 282         NppiSize sz;
 283         sz.width = src.cols;
 284         sz.height = src.rows;
 285
 286         Scalar_<src_t> nppS = s;
 287
 288         nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
 289
 290         cudaSafeCall( cudaDeviceSynchronize() );
 291     }
 292 };
 293 template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
 294 {
 295     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 296
 297     static void call(GpuMat& src, Scalar s)
 298     {
 299         NppiSize sz;
 300         sz.width = src.cols;
 301         sz.height = src.rows;
 302
 303         Scalar_<src_t> nppS = s;
 304
 305         nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
 306
 307         cudaSafeCall( cudaDeviceSynchronize() );
 308     }
 309 };
 310
 311 template<int SDEPTH, int SCN> struct NppSetMaskFunc
 312 {
 313     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 314
 315     typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
 316 };
 317 template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
 318 {
 319     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 320
 321     typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
 322 };
 323
 324 template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
 325 {
 326     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 327
 328     static void call(GpuMat& src, Scalar s, const GpuMat& mask)
 329     {
 330         NppiSize sz;
 331         sz.width = src.cols;
 332         sz.height = src.rows;
 333
 334         Scalar_<src_t> nppS = s;
 335
 336         nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
 337
 338         cudaSafeCall( cudaDeviceSynchronize() );
 339     }
 340 };
 341 template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
 342 {
 343     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 344
 345     static void call(GpuMat& src, Scalar s, const GpuMat& mask)
 346     {
 347         NppiSize sz;
 348         sz.width = src.cols;
 349         sz.height = src.rows;
 350
 351         Scalar_<src_t> nppS = s;
 352
 353         nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
 354
 355         cudaSafeCall( cudaDeviceSynchronize() );
 356     }
 357 };
 358
 359 #endif
 360
 361 //////////////////////////////////////////////////////////////////////////
 362 // CopyMasked
 363
 364 #ifdef USE_NPP
 365
 366 template<int SDEPTH> struct NppCopyMaskedFunc
 367 {
 368     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 369
 370     typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
 371 };
 372
 373 template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
 374 {
 375     typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
 376
 377     static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
 378     {
 379         NppiSize sz;
 380         sz.width = src.cols;
 381         sz.height = src.rows;
 382
 383         nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
 384
 385         cudaSafeCall( cudaDeviceSynchronize() );
 386     }
 387 };
 388
 389 #endif
 390
 391 template <typename T> static inline bool isAligned(const T* ptr, size_t size)
 392 {
 393     return reinterpret_cast<size_t>(ptr) % size == 0;
 394 }
 395
 396 namespace cv { namespace gpu { namespace device
 397 {
 398     void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
 399     void convertTo(const GpuMat& src, GpuMat& dst);
 400     void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
 401     void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
 402     void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
 403     void setTo(GpuMat& src, Scalar s);
 404     void setTo(GpuMat& src, Scalar s, const GpuMat& mask);
 405
 406     void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
 407     {
 408         CV_Assert(src.size() == dst.size() && src.type() == dst.type());
 409         CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
 410
 411         cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
 412     }
 413
 414     void convertTo(const GpuMat& src, GpuMat& dst)
 415     {
 416         cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
 417     }
 418
 419     void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
 420     {
 421         cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
 422     }
 423
 424     void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
 425     {
 426         typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
 427
 428         static const caller_t callers[] =
 429         {
 430             kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
 431             kernelSetCaller<float>, kernelSetCaller<double>
 432         };
 433
 434         callers[src.depth()](src, s, stream);
 435     }
 436
 437     void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
 438     {
 439         typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
 440
 441         static const caller_t callers[] =
 442         {
 443             kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
 444             kernelSetCaller<float>, kernelSetCaller<double>
 445         };
 446
 447         callers[src.depth()](src, s, mask, stream);
 448     }
 449
 450     void setTo(GpuMat& src, Scalar s)
 451     {
 452         setTo(src, s, 0);
 453     }
 454
 455     void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
 456     {
 457         setTo(src, s, mask, 0);
 458     }
 459 }}}
 460
 461 class CudaArch
 462 {
 463 public:
 464     CudaArch()
 465     {
 466         fromStr(CUDA_ARCH_BIN, bin);
 467         fromStr(CUDA_ARCH_PTX, ptx);
 468         fromStr(CUDA_ARCH_FEATURES, features);
 469     }
 470
 471     bool builtWith(FeatureSet feature_set) const
 472     {
 473         return !features.empty() && (features.back() >= feature_set);
 474     }
 475
 476     bool hasPtx(int major, int minor) const
 477     {
 478         return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
 479     }
 480
 481     bool hasBin(int major, int minor) const
 482     {
 483         return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
 484     }
 485
 486     bool hasEqualOrLessPtx(int major, int minor) const
 487     {
 488         return !ptx.empty() && (ptx.front() <= major * 10 + minor);
 489     }
 490
 491     bool hasEqualOrGreaterPtx(int major, int minor) const
 492     {
 493         return !ptx.empty() && (ptx.back() >= major * 10 + minor);
 494     }
 495
 496     bool hasEqualOrGreaterBin(int major, int minor) const
 497     {
 498         return !bin.empty() && (bin.back() >= major * 10 + minor);
 499     }
 500
 501
 502 private:
 503     void fromStr(const string& set_as_str, vector<int>& arr)
 504     {
 505         if (set_as_str.find_first_not_of(" ") == string::npos)
 506             return;
 507
 508         istringstream stream(set_as_str);
 509         int cur_value;
 510
 511         while (!stream.eof())
 512         {
 513             stream >> cur_value;
 514             arr.push_back(cur_value);
 515         }
 516
 517         sort(arr.begin(), arr.end());
 518     }
 519
 520     vector<int> bin;
 521     vector<int> ptx;
 522     vector<int> features;
 523 };
 524
 525 class DeviceProps
 526 {
 527 public:
 528     DeviceProps()
 529     {
 530         props_.resize(10, 0);
 531     }
 532
 533     ~DeviceProps()
 534     {
 535         for (size_t i = 0; i < props_.size(); ++i)
 536         {
 537             if (props_[i])
 538                 delete props_[i];
 539         }
 540         props_.clear();
 541     }
 542
 543     cudaDeviceProp* get(int devID)
 544     {
 545         if (devID >= (int) props_.size())
 546             props_.resize(devID + 5, 0);
 547
 548         if (!props_[devID])
 549         {
 550             props_[devID] = new cudaDeviceProp;
 551             cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
 552         }
 553
 554         return props_[devID];
 555     }
 556 private:
 557     std::vector<cudaDeviceProp*> props_;
 558 };
 559
 560 DeviceProps deviceProps;
 561 const CudaArch cudaArch;
 562
 563 class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable
 564 {
 565 public:
 566     size_t sharedMemPerBlock(int id) const
 567     {
 568         return deviceProps.get(id)->sharedMemPerBlock;
 569     }
 570
 571     void queryMemory(int id, size_t& _totalMemory, size_t& _freeMemory) const
 572     {
 573         int prevDeviceID = getDevice();
 574         if (prevDeviceID != id)
 575             setDevice(id);
 576
 577         cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
 578
 579         if (prevDeviceID != id)
 580             setDevice(prevDeviceID);
 581     }
 582
 583     size_t freeMemory(int id) const
 584     {
 585         size_t _totalMemory, _freeMemory;
 586         queryMemory(id, _totalMemory, _freeMemory);
 587         return _freeMemory;
 588     }
 589
 590     size_t totalMemory(int id) const
 591     {
 592         size_t _totalMemory, _freeMemory;
 593         queryMemory(id, _totalMemory, _freeMemory);
 594         return _totalMemory;
 595     }
 596
 597     bool supports(int id, FeatureSet feature_set) const
 598     {
 599         int version = majorVersion(id) * 10 + minorVersion(id);
 600         return version >= feature_set;
 601     }
 602
 603     bool isCompatible(int id) const
 604     {
 605         // Check PTX compatibility
 606         if (hasEqualOrLessPtx(majorVersion(id), minorVersion(id)))
 607             return true;
 608
 609         // Check BIN compatibility
 610             for (int i = minorVersion(id); i >= 0; --i)
 611                 if (hasBin(majorVersion(id), i))
 612                     return true;
 613
 614                 return false;
 615     }
 616
 617     std::string name(int id) const
 618     {
 619         const cudaDeviceProp* prop = deviceProps.get(id);
 620         return prop->name;
 621     }
 622
 623     int majorVersion(int id) const
 624     {
 625         const cudaDeviceProp* prop = deviceProps.get(id);
 626         return prop->major;
 627     }
 628
 629     int minorVersion(int id) const
 630     {
 631         const cudaDeviceProp* prop = deviceProps.get(id);
 632         return prop->minor;
 633     }
 634
 635     int multiProcessorCount(int id) const
 636     {
 637         const cudaDeviceProp* prop = deviceProps.get(id);
 638         return prop->multiProcessorCount;
 639     }
 640
 641     int getCudaEnabledDeviceCount() const
 642     {
 643         int count;
 644         cudaError_t error = cudaGetDeviceCount( &count );
 645
 646         if (error == cudaErrorInsufficientDriver)
 647             return -1;
 648
 649         if (error == cudaErrorNoDevice)
 650             return 0;
 651
 652         cudaSafeCall( error );
 653         return count;
 654     }
 655
 656     void setDevice(int device) const
 657     {
 658         cudaSafeCall( cudaSetDevice( device ) );
 659     }
 660
 661     int getDevice() const
 662     {
 663         int device;
 664         cudaSafeCall( cudaGetDevice( &device ) );
 665         return device;
 666     }
 667
 668     void resetDevice() const
 669     {
 670         cudaSafeCall( cudaDeviceReset() );
 671     }
 672
 673     bool builtWith(FeatureSet feature_set) const
 674     {
 675         return cudaArch.builtWith(feature_set);
 676     }
 677
 678     bool has(int major, int minor) const
 679     {
 680         return hasPtx(major, minor) || hasBin(major, minor);
 681     }
 682
 683     bool hasPtx(int major, int minor) const
 684     {
 685         return cudaArch.hasPtx(major, minor);
 686     }
 687
 688     bool hasBin(int major, int minor) const
 689     {
 690         return cudaArch.hasBin(major, minor);
 691     }
 692
 693     bool hasEqualOrLessPtx(int major, int minor) const
 694     {
 695         return cudaArch.hasEqualOrLessPtx(major, minor);
 696     }
 697
 698     bool hasEqualOrGreater(int major, int minor) const
 699     {
 700         return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
 701     }
 702
 703     bool hasEqualOrGreaterPtx(int major, int minor) const
 704     {
 705         return cudaArch.hasEqualOrGreaterPtx(major, minor);
 706     }
 707
 708     bool hasEqualOrGreaterBin(int major, int minor) const
 709     {
 710         return cudaArch.hasEqualOrGreaterBin(major, minor);
 711     }
 712
 713     bool deviceSupports(FeatureSet feature_set) const
 714     {
 715         static int versions[] =
 716         {
 717             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 718         };
 719         static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
 720
 721         const int devId = getDevice();
 722
 723         int version;
 724
 725         if (devId < cache_size && versions[devId] >= 0)
 726             version = versions[devId];
 727         else
 728         {
 729             DeviceInfo dev(devId);
 730             version = dev.majorVersion() * 10 + dev.minorVersion();
 731             if (devId < cache_size)
 732                 versions[devId] = version;
 733         }
 734
 735         return TargetArchs::builtWith(feature_set) && (version >= feature_set);
 736     }
 737
 738     void printCudaDeviceInfo(int device) const
 739     {
 740         int count = getCudaEnabledDeviceCount();
 741         bool valid = (device >= 0) && (device < count);
 742
 743         int beg = valid ? device   : 0;
 744         int end = valid ? device+1 : count;
 745
 746         printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
 747         printf("Device count: %d\n", count);
 748
 749         int driverVersion = 0, runtimeVersion = 0;
 750         cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
 751         cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
 752
 753         const char *computeMode[] = {
 754             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
 755                "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
 756                "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
 757                "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
 758                "Unknown",
 759                NULL
 760         };
 761
 762         for(int dev = beg; dev < end; ++dev)
 763         {
 764             cudaDeviceProp prop;
 765             cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
 766
 767             printf("\nDevice %d: \"%s\"\n", dev, prop.name);
 768             printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
 769             printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
 770             printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
 771
 772         int cores = convertSMVer2Cores(prop.major, prop.minor);
 773         if (cores > 0)
 774             printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
 775
 776         printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
 777
 778         printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
 779                prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
 780                prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
 781         printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
 782                prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
 783                prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
 784
 785         printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
 786         printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
 787         printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
 788         printf("  Warp size:                                     %d\n", prop.warpSize);
 789         printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
 790         printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
 791         printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
 792         printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
 793         printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);
 794
 795         printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
 796         printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
 797         printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
 798         printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
 799
 800         printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
 801         printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
 802         printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
 803         printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
 804         printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
 805         printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
 806         printf("  Compute Mode:\n");
 807         printf("      %s \n", computeMode[prop.computeMode]);
 808         }
 809
 810         printf("\n");
 811         printf("deviceQuery, CUDA Driver = CUDART");
 812         printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
 813         printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
 814         printf(", NumDevs = %d\n\n", count);
 815         fflush(stdout);
 816     }
 817
 818     void printShortCudaDeviceInfo(int device) const
 819     {
 820         int count = getCudaEnabledDeviceCount();
 821         bool valid = (device >= 0) && (device < count);
 822
 823         int beg = valid ? device   : 0;
 824         int end = valid ? device+1 : count;
 825
 826         int driverVersion = 0, runtimeVersion = 0;
 827         cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
 828         cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
 829
 830         for(int dev = beg; dev < end; ++dev)
 831         {
 832             cudaDeviceProp prop;
 833             cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
 834
 835             const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
 836             printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
 837             printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
 838
 839             int cores = convertSMVer2Cores(prop.major, prop.minor);
 840             if (cores > 0)
 841                 printf(", %d cores", cores * prop.multiProcessorCount);
 842
 843             printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
 844         }
 845         fflush(stdout);
 846     }
 847
 848 private:
 849     int convertSMVer2Cores(int major, int minor) const
 850     {
 851         // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
 852         typedef struct {
 853             int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
 854             int Cores;
 855         } SMtoCores;
 856
 857         SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
 858
 859         int index = 0;
 860         while (gpuArchCoresPerSM[index].SM != -1)
 861         {
 862             if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
 863                 return gpuArchCoresPerSM[index].Cores;
 864             index++;
 865         }
 866
 867         return -1;
 868     }
 869 };
 870
 871 class CudaFuncTable : public GpuFuncTable
 872 {
 873 public:
 874
 875     void copy(const Mat& src, GpuMat& dst) const
 876     {
 877         cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
 878     }
 879
 880     void copy(const GpuMat& src, Mat& dst) const
 881     {
 882         cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
 883     }
 884
 885     void copy(const GpuMat& src, GpuMat& dst) const
 886     {
 887         cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
 888     }
 889
 890     void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
 891     {
 892         CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
 893         CV_Assert(src.size() == dst.size() && src.type() == dst.type());
 894         CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
 895
 896         if (src.depth() == CV_64F)
 897         {
 898             if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
 899                 CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
 900         }
 901
 902         typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
 903
 904 #ifdef USE_NPP
 905         static const func_t funcs[7][4] =
 906         {
 907             /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
 908             /*  8S */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         },
 909             /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
 910             /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
 911             /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
 912             /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
 913             /* 64F */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         }
 914          };
 915
 916          const func_t func =  mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
 917 #else
 918         const func_t func = cv::gpu::device::copyWithMask;
 919 #endif
 920
 921          func(src, dst, mask, 0);
 922     }
 923
 924     void convert(const GpuMat& src, GpuMat& dst) const
 925     {
 926         typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
 927
 928 #ifdef USE_NPP
 929         static const func_t funcs[7][7][4] =
 930         {
 931             {
 932                 /*  8U ->  8U */ {0, 0, 0, 0},
 933                 /*  8U ->  8S */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 934                 /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
 935                 /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
 936                 /*  8U -> 32S */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 937                 /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 938                 /*  8U -> 64F */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
 939             },
 940             {
 941                 /*  8S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 942                 /*  8S ->  8S */ {0,0,0,0},
 943                 /*  8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 944                 /*  8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 945                 /*  8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 946                 /*  8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 947                 /*  8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
 948             },
 949             {
 950                 /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
 951                 /* 16U ->  8S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 952                 /* 16U -> 16U */ {0,0,0,0},
 953                 /* 16U -> 16S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 954                 /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 955                 /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 956                 /* 16U -> 64F */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
 957             },
 958             {
 959                 /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
 960                 /* 16S ->  8S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 961                 /* 16S -> 16U */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 962                 /* 16S -> 16S */ {0,0,0,0},
 963                 /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 964                 /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
 965                 /* 16S -> 64F */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
 966             },
 967             {
 968                 /* 32S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 969                 /* 32S ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 970                 /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 971                 /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 972                 /* 32S -> 32S */ {0,0,0,0},
 973                 /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 974                 /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
 975             },
 976             {
 977                 /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 978                 /* 32F ->  8S */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 979                 /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 980                 /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 981                 /* 32F -> 32S */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 982                 /* 32F -> 32F */ {0,0,0,0},
 983                 /* 32F -> 64F */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
 984             },
 985             {
 986                 /* 64F ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 987                 /* 64F ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 988                 /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 989                 /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 990                 /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 991                 /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
 992                 /* 64F -> 64F */ {0,0,0,0}
 993             }
 994         };
 995 #endif
 996
 997         CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
 998         CV_Assert(dst.depth() <= CV_64F);
 999         CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
1000
1001         if (src.depth() == CV_64F || dst.depth() == CV_64F)
1002         {
1003             if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
1004                 CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
1005         }
1006
1007         bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
1008         if (!aligned)
1009         {
1010             cv::gpu::device::convertTo(src, dst);
1011             return;
1012         }
1013
1014 #ifdef USE_NPP
1015         const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
1016         CV_DbgAssert(func != 0);
1017 #else
1018         const func_t func = cv::gpu::device::convertTo;
1019 #endif
1020
1021         func(src, dst);
1022     }
1023
1024     void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
1025     {
1026         CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
1027         CV_Assert(dst.depth() <= CV_64F);
1028
1029         if (src.depth() == CV_64F || dst.depth() == CV_64F)
1030         {
1031             if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
1032                 CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
1033         }
1034
1035         cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
1036     }
1037
1038     void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
1039     {
1040         if (mask.empty())
1041         {
1042             if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
1043             {
1044                 cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
1045                 return;
1046             }
1047
1048             if (m.depth() == CV_8U)
1049             {
1050                 int cn = m.channels();
1051
1052                 if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
1053                 {
1054                     int val = saturate_cast<uchar>(s[0]);
1055                     cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
1056                     return;
1057                 }
1058             }
1059
1060             typedef void (*func_t)(GpuMat& src, Scalar s);
1061
1062 #ifdef USE_NPP
1063             static const func_t funcs[7][4] =
1064             {
1065                 {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
1066                 {cv::gpu::device::setTo                  , cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          },
1067                 {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo                        , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
1068                 {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo                        , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
1069                 {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
1070                 {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
1071                 {cv::gpu::device::setTo                  , cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          }
1072             };
1073 #endif
1074
1075             CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
1076
1077             if (m.depth() == CV_64F)
1078             {
1079                 if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
1080                     CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
1081             }
1082
1083 #ifdef USE_NPP
1084         const func_t func = funcs[m.depth()][m.channels() - 1];
1085 #else
1086         const func_t func = cv::gpu::device::setTo;
1087 #endif
1088
1089             if (stream)
1090                 cv::gpu::device::setTo(m, s, stream);
1091             else
1092                 func(m, s);
1093         }
1094         else
1095         {
1096             typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
1097
1098 #ifdef USE_NPP
1099             static const func_t funcs[7][4] =
1100             {
1101                 {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
1102                 {cv::gpu::device::setTo                       , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo                               },
1103                 {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
1104                 {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
1105                 {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
1106                 {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
1107                 {cv::gpu::device::setTo                       , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo                               }
1108             };
1109 #endif
1110
1111             CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
1112
1113             if (m.depth() == CV_64F)
1114             {
1115                 if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
1116                     CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
1117             }
1118
1119 #ifdef USE_NPP
1120         const func_t func = funcs[m.depth()][m.channels() - 1];
1121 #else
1122         const func_t func = cv::gpu::device::setTo;
1123 #endif
1124
1125             if (stream)
1126                 cv::gpu::device::setTo(m, s, mask, stream);
1127             else
1128                 func(m, s, mask);
1129         }
1130     }
1131
1132     void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
1133     {
1134         cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
1135     }
1136
1137     void free(void* devPtr) const
1138     {
1139         cudaFree(devPtr);
1140     }
1141 };
1142 #endif
1143 #endif