Add cuda::streams to by_rows and 8UC1 functions

author Claudio <claudio.fantacci@gmail.com>

Sat, 25 Mar 2017 14:43:56 +0000 (15:43 +0100)

committer Claudio <claudio.fantacci@gmail.com>

Mon, 27 Mar 2017 08:54:07 +0000 (10:54 +0200)
author Claudio <claudio.fantacci@gmail.com>
Sat, 25 Mar 2017 14:43:56 +0000 (15:43 +0100)
committer Claudio <claudio.fantacci@gmail.com>
Mon, 27 Mar 2017 08:54:07 +0000 (10:54 +0200)
diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu

index 40d2dec..45a3ecb 100644 (file)
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -518,8 +518,10 @@ namespace cv { namespace cuda { namespace device
  
  
          template <int nthreads>
-        __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-                                                      const float* block_hists, PtrStepf descriptors)
+        __global__ void extract_descrs_by_rows_kernel(const int img_block_width,
+                                                      const int win_block_stride_x, const int win_block_stride_y,
+                                                      const float* block_hists,
+                                                      PtrStepf descriptors)
          {
              // Get left top corner of the window in src
              const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@@ -538,8 +540,14 @@ namespace cv { namespace cuda { namespace device
          }
  
  
-        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-                                    int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
+        void extract_descrs_by_rows(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists, int cell_size_x,
+                                    int ncells_block_x,
+                                    PtrStepSzf descriptors,
+                                    const cudaStream_t& stream)
          {
              const int nthreads = 256;
  
@@ -551,17 +559,16 @@ namespace cv { namespace cuda { namespace device
              dim3 grid(img_win_width, img_win_height);
  
              int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-            extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
-                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-            cudaSafeCall( cudaGetLastError() );
+            extract_descrs_by_rows_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
  
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
          }
  
  
          template <int nthreads>
-        __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
-                                                      const int win_block_stride_y, const float* block_hists,
+        __global__ void extract_descrs_by_cols_kernel(const int img_block_width,
+                                                      const int win_block_stride_x, const int win_block_stride_y,
+                                                      const float* block_hists,
                                                        PtrStepf descriptors)
          {
              // Get left top corner of the window in src
@@ -792,8 +799,12 @@ namespace cv { namespace cuda { namespace device
          }
  
  
-        void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
-                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+        void compute_gradients_8UC1(int nbins,
+                                    int height, int width, const PtrStepSzb& img,
+                                    float angle_scale,
+                                    PtrStepSzf grad, PtrStepSzb qangle,
+                                    bool correct_gamma,
+                                    const cudaStream_t& stream)
          {
              (void)nbins;
              const int nthreads = 256;
@@ -802,13 +813,11 @@ namespace cv { namespace cuda { namespace device
              dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
  
              if (correct_gamma)
-                compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
              else
-                compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
  
              cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
          }
  
  
diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp

index f59bd0c..f06ce36 100644 (file)
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -64,19 +64,29 @@ namespace cv { namespace cuda { namespace device
  {
      namespace hog
      {
-        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
+        void set_up_constants(int nbins,
+                              int block_stride_x, int block_stride_y,
                                int nblocks_win_x, int nblocks_win_y,
                                int ncells_block_x, int ncells_block_y,
                                const cudaStream_t& stream);
  
-        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                           int height, int width, const PtrStepSzf& grad,
-                           const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void compute_hists(int nbins,
+                           int block_stride_x, int block_stride_y,
+                           int height, int width,
+                           const PtrStepSzf& grad, const PtrStepSzb& qangle,
+                           float sigma,
+                           float* block_hists,
+                           int cell_size_x, int cell_size_y,
+                           int ncells_block_x, int ncells_block_y,
                             const cudaStream_t& stream);
  
-        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void normalize_hists(int nbins,
+                             int block_stride_x, int block_stride_y,
+                             int height, int width,
+                             float* block_hists,
+                             float threshold,
+                             int cell_size_x, int cell_size_y,
+                             int ncells_block_x, int ncells_block_y,
                               const cudaStream_t& stream);
  
          void classify_hists(int win_height, int win_width, int block_stride_y,
@@ -85,21 +95,37 @@ namespace cv { namespace cuda { namespace device
                              float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels);
  
          void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                           float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);
-
-        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                    cv::cuda::PtrStepSzf descriptors);
-        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
+                                      int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                      float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);
+
+        void extract_descrs_by_rows(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists,
+                                    int cell_size_x, int ncells_block_x,
+                                    cv::cuda::PtrStepSzf descriptors,
+                                    const cudaStream_t& stream);
+        void extract_descrs_by_cols(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists,
+                                    int cell_size_x, int ncells_block_x,
                                      cv::cuda::PtrStepSzf descriptors,
                                      const cudaStream_t& stream);
  
-        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
-        void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma,
+        void compute_gradients_8UC1(int nbins,
+                                    int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale,
+                                    cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle,
+                                    bool correct_gamma,
+                                    const cudaStream_t& stream);
+        void compute_gradients_8UC4(int nbins,
+                                    int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale,
+                                    cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle,
+                                    bool correct_gamma,
                                      const cudaStream_t& stream);
  
          void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
@@ -483,7 +509,8 @@ namespace
                                          img.rows, img.cols,
                                          block_hists.ptr<float>(),
                                          cell_size_.width, cells_per_block_.width,
-                                        descriptors);
+                                        descriptors,
+                                        StreamAccessor::getStream(stream));
              break;
          case DESCR_FORMAT_COL_BY_COL:
              hog::extract_descrs_by_cols(win_size_.height, win_size_.width,
@@ -524,8 +551,12 @@ namespace
          switch (img.type())
          {
              case CV_8UC1:
-                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img,
-                                            angleScale, grad, qangle, gamma_correction_);
+                hog::compute_gradients_8UC1(nbins_,
+                                            img.rows, img.cols, img,
+                                            angleScale,
+                                            grad, qangle,
+                                            gamma_correction_,
+                                            StreamAccessor::getStream(stream));
                  break;
              case CV_8UC4:
                  hog::compute_gradients_8UC4(nbins_,
author	Claudio <claudio.fantacci@gmail.com>
	Sat, 25 Mar 2017 14:43:56 +0000 (15:43 +0100)
committer	Claudio <claudio.fantacci@gmail.com>
	Mon, 27 Mar 2017 08:54:07 +0000 (10:54 +0200)
modules/cudaobjdetect/src/cuda/hog.cu		patch \| blob \| history
modules/cudaobjdetect/src/hog.cpp		patch \| blob \| history