modules/dnn/src/cuda/execution.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
   6 #define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
   7
   8 #include "../cuda4dnn/csl/error.hpp"
   9 #include "../cuda4dnn/csl/stream.hpp"
  10
  11 #include <opencv2/core.hpp>
  12
  13 #include <cuda_runtime_api.h>
  14
  15 #include <cstddef>
  16
  17 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
  18
  19     struct execution_policy {
  20         execution_policy(dim3 grid_size, dim3 block_size)
  21             : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
  22
  23         execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
  24             : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
  25
  26         execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
  27             : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
  28
  29         execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
  30             : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
  31
  32         dim3 grid;
  33         dim3 block;
  34         std::size_t sharedMem;
  35         cudaStream_t stream;
  36     };
  37
  38     /* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
  39     /*
  40     template <class Kernel> inline
  41     execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
  42         int grid_size, block_size;
  43         CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
  44         return execution_policy(grid_size, block_size, sharedMem, stream);
  45     }*/
  46
  47     template <class Kernel> inline
  48     execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
  49         CV_Assert(max_threads > 0);
  50
  51         int grid_size = 0, block_size = 0;
  52         CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
  53         if (grid_size * block_size > max_threads) {
  54             grid_size = (max_threads + block_size - 1) / block_size;
  55             if (block_size > max_threads)
  56                 block_size = max_threads;
  57         }
  58
  59         CV_Assert(grid_size >= 1 && block_size >= 1);
  60         return execution_policy(grid_size, block_size, sharedMem, stream);
  61     }
  62
  63     template <class Kernel, typename ...Args> inline
  64     void launch_kernel(Kernel kernel, Args ...args) {
  65         auto policy = make_policy(kernel);
  66         kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
  67     }
  68
  69     template <class Kernel, typename ...Args> inline
  70     void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
  71         kernel <<<grid, block>>> (std::forward<Args>(args)...);
  72     }
  73
  74     template <class Kernel, typename ...Args> inline
  75     void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
  76         kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
  77     }
  78
  79 }}}} /* namespace cv::dnn::cuda4dnn::csl */
  80
  81 #endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */