// Merged via pull request #14827 (YashasSamaga:cuda4dnn-csl-low)
// modules/dnn/src/cuda/execution.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 #ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
6 #define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
7
#include "../cuda4dnn/csl/error.hpp"
#include "../cuda4dnn/csl/stream.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <utility>

17 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
18
19     struct execution_policy {
20         execution_policy(dim3 grid_size, dim3 block_size)
21             : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
22
23         execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
24             : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
25
26         execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
27             : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
28
29         execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
30             : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
31
32         dim3 grid;
33         dim3 block;
34         std::size_t sharedMem;
35         cudaStream_t stream;
36     };
37
38     /* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
39     /*
40     template <class Kernel> inline
41     execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
42         int grid_size, block_size;
43         CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
44         return execution_policy(grid_size, block_size, sharedMem, stream);
45     }*/
46
47     template <class Kernel> inline
48     execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
49         CV_Assert(max_threads > 0);
50
51         int grid_size = 0, block_size = 0;
52         CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
53         if (grid_size * block_size > max_threads) {
54             grid_size = (max_threads + block_size - 1) / block_size;
55             if (block_size > max_threads)
56                 block_size = max_threads;
57         }
58
59         CV_Assert(grid_size >= 1 && block_size >= 1);
60         return execution_policy(grid_size, block_size, sharedMem, stream);
61     }
62
63     template <class Kernel, typename ...Args> inline
64     void launch_kernel(Kernel kernel, Args ...args) {
65         auto policy = make_policy(kernel);
66         kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
67     }
68
69     template <class Kernel, typename ...Args> inline
70     void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
71         kernel <<<grid, block>>> (std::forward<Args>(args)...);
72     }
73
74     template <class Kernel, typename ...Args> inline
75     void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
76         kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
77     }
78
79 }}}} /* namespace cv::dnn::cuda4dnn::csl */
80
81 #endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */