add support for CUDA streams

author marina.kolpakova <marina.kolpakova@itseez.com>

Fri, 9 Nov 2012 23:59:09 +0000 (03:59 +0400)

committer marina.kolpakova <marina.kolpakova@itseez.com>

Sat, 10 Nov 2012 01:13:26 +0000 (05:13 +0400)
author marina.kolpakova <marina.kolpakova@itseez.com>
Fri, 9 Nov 2012 23:59:09 +0000 (03:59 +0400)
committer marina.kolpakova <marina.kolpakova@itseez.com>
Sat, 10 Nov 2012 01:13:26 +0000 (05:13 +0400)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp

index 4fc6179..8f327f2 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1577,7 +1577,7 @@ public:
      virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
      virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
  
-    void genRoi(InputArray roi, OutputArray mask) const;
+    void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
  
  private:
  
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp

index 1e62af8..3e82cc5 100644 (file)
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -284,4 +284,44 @@ RUN_GPU(SCascadeTest, detectOnIntegral)
      SANITY_CHECK(sortDetections(objectBoxes));
  }
  
-NO_CPU(SCascadeTest, detectOnIntegral)
-\ No newline at end of file
+NO_CPU(SCascadeTest, detectOnIntegral)
+
+GPU_PERF_TEST_P(SCascadeTest, detectStream,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
+{ }
+
+RUN_GPU(SCascadeTest, detectStream)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
+    rois.setTo(1);
+
+    cv::gpu::Stream s;
+
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes, s);
+    }
+
+    cudaDeviceSynchronize();
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTest, detectStream)
+\ No newline at end of file
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu

index 5bd35bd..200960b 100644 (file)
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -444,7 +444,6 @@ namespace cv { namespace gpu { namespace device
          }
  
          // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
-        // ToDo: partial dy
          void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
              int blockStep, cudaStream_t stream)
          {
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu

index ee9a9f6..0de2d8e 100644 (file)
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -71,7 +71,7 @@ namespace icf {
      }
  
      void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
-                  const int fw,  const int fh, const int bins)
+                  const int fw,  const int fh, const int bins, cudaStream_t stream )
      {
          const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
          uchar* hog = (uchar*)hogluv.ptr();
@@ -80,9 +80,12 @@ namespace icf {
          dim3 block(32, 8);
          dim3 grid(fw / 32, fh / 8);
  
-        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
-        cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaDeviceSynchronize() );
+        magToHist<<<grid, block, 0, stream>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
+        if (!stream)
+        {
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
      }
  
      texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
@@ -305,7 +308,7 @@ namespace icf {
  
      template<>
      void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
      {
          int fw = 160;
          int fh = 120;
@@ -325,22 +328,25 @@ namespace icf {
  
          if (scale == -1)
          {
-            test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
+            test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
              cudaSafeCall( cudaGetLastError());
  
              grid = dim3(fw, fh / 8, 47 - downscales);
-            test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
+            test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
          }
          else
          {
              if (scale >= downscales)
-                test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+                test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
              else
-                test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+                test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
          }
  
-        cudaSafeCall( cudaGetLastError());
-        cudaSafeCall( cudaDeviceSynchronize());
+        if (!stream)
+        {
+            cudaSafeCall( cudaGetLastError());
+            cudaSafeCall( cudaDeviceSynchronize());
+        }
      }
  }
  }}}
 \ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp

index d829012..60df558 100644 (file)
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -139,7 +139,7 @@ struct CascadeInvoker
      const float*  leaves;
  
      void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales, const int csale = -1) const;
+        PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
  };
  
  }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp

index 02481ed..fdde261 100644 (file)
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -54,7 +54,7 @@ bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
  void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
  void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); }
  
-void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); }
+void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); }
  
  void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
  
@@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device {
  
  namespace icf {
      void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
-        const int fw, const int fh, const int bins);
+        const int fw, const int fh, const int bins, cudaStream_t stream);
  }
  
  namespace imgproc {
@@ -341,27 +341,30 @@ struct cv::gpu::SCascade::Fields
  
      }
  
-    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
      {
          cudaMemset(count.data, 0, sizeof(Detection));
          cudaSafeCall( cudaGetLastError());
-        invoker(roi, hogluv, objects, count, downscales, scale);
+        invoker(roi, hogluv, objects, count, downscales, scale, stream);
      }
  
-    void preprocess(const cv::gpu::GpuMat& colored)
+    void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
      {
-        cudaMemset(plane.data, 0, plane.step * plane.rows);
+        if (s)
+            s.enqueueMemSet(plane, 0);
+        else
+            cudaMemset(plane.data, 0, plane.step * plane.rows);
  
          static const int fw = Fields::FRAME_WIDTH;
          static const int fh = Fields::FRAME_HEIGHT;
  
          GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh));
-        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
-        createHogBins(gray);
+        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s);
+        createHogBins(gray ,s);
  
-        createLuvBins(colored);
+        createLuvBins(colored, s);
  
-        integrate();
+        integrate(s);
      }
  
  private:
@@ -386,7 +389,7 @@ private:
          return res;
      }
  
-    void createHogBins(const cv::gpu::GpuMat& gray)
+    void createHogBins(const cv::gpu::GpuMat& gray, Stream& s)
      {
          static const int fw = Fields::FRAME_WIDTH;
          static const int fh = Fields::FRAME_HEIGHT;
@@ -394,35 +397,38 @@ private:
          GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
          GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
  
-        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0);
-        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1);
+        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
+        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
  
          GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
          GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
  
-        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
+        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
  
          // normolize magnitude to uchar interval and angles to 6 bins
-
          GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
          GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
  
-        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag);
-        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
+        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag, 1, -1, s);
+        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
  
          //create uchar magnitude
          GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh));
-        nmag.convertTo(cmag, CV_8UC1);
+        if (s)
+            s.enqueueConvert(nmag, cmag, CV_8UC1);
+        else
+            nmag.convertTo(cmag, CV_8UC1);
  
-        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS);
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS, stream);
      }
  
-    void createLuvBins(const cv::gpu::GpuMat& colored)
+    void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
      {
          static const int fw = Fields::FRAME_WIDTH;
          static const int fh = Fields::FRAME_HEIGHT;
  
-        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
+        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
  
          std::vector<GpuMat> splited;
          for(int i = 0; i < Fields::LUV_BINS; ++i)
@@ -430,17 +436,18 @@ private:
              splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
          }
  
-        cv::gpu::split(luv, splited);
+        cv::gpu::split(luv, splited, s);
      }
  
-    void integrate()
+    void integrate( Stream& s)
      {
          int fw = Fields::FRAME_WIDTH;
          int fh = Fields::FRAME_HEIGHT;
  
          GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
-        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
+        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
      }
  
  public:
@@ -482,6 +489,8 @@ public:
      GpuMat leaves;
      GpuMat levels;
  
+    GpuMat sobelBuf;
+
      device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
  
      enum { BOOST = 0 };
@@ -516,6 +525,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
      // only color images are supperted
      CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
  
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+
      // we guess user knows about shrincage
      // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
  
@@ -525,14 +536,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
      {
          // only this window size allowed
          CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
      }
      else
      {
          colored.copyTo(flds.hogluv);
      }
  
-    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
  
      GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
      objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
@@ -556,7 +566,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
      {
          // only this window size allowed
          CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
      }
      else
      {
@@ -572,15 +582,15 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
      flds.detect(level, rois, tmp, objects, stream);
  }
  
-void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const
+void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
  {
      const GpuMat roi = _roi.getGpuMat();
      _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
      GpuMat mask = _mask.getGpuMat();
      cv::gpu::GpuMat tmp;
  
-    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-    cv::gpu::transpose(tmp, mask);
+    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream);
+    cv::gpu::transpose(tmp, mask, stream);
  }
  
  void cv::gpu::SCascade::read(const FileNode& fn)
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp

index f26c44f..cfae940 100644 (file)
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -330,4 +330,43 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
  
      ASSERT_EQ( a ,1024);
  }
+
+GPU_TEST_P(SCascadeTestAll, detectStream,
+        ALL_DEVICES
+        )
+{
+    cv::gpu::setDevice(GetParam().deviceID());
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
+        + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
+    ASSERT_FALSE(coloredCpu.empty());
+
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
+    rois.setTo(0);
+    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));
+
+    cv::gpu::Stream s;
+
+    cv::gpu::GpuMat trois;
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    cudaDeviceSynchronize();
+
+    typedef cv::gpu::SCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+    int a = *(detections.ptr<int>(0));
+    ASSERT_EQ(a ,2460);
+}
+
+
  #endif
 \ No newline at end of file
author	marina.kolpakova <marina.kolpakova@itseez.com>
	Fri, 9 Nov 2012 23:59:09 +0000 (03:59 +0400)
committer	marina.kolpakova <marina.kolpakova@itseez.com>
	Sat, 10 Nov 2012 01:13:26 +0000 (05:13 +0400)
modules/gpu/include/opencv2/gpu/gpu.hpp		patch \| blob \| history
modules/gpu/perf/perf_softcascade.cpp		patch \| blob \| history
modules/gpu/src/cuda/integral_image.cu		patch \| blob \| history
modules/gpu/src/cuda/isf-sc.cu		patch \| blob \| history
modules/gpu/src/icf.hpp		patch \| blob \| history
modules/gpu/src/softcascade.cpp		patch \| blob \| history
modules/gpu/test/test_softcascade.cpp		patch \| blob \| history