fix CUDA support for streams for NMS; refactor tests
authormarina.kolpakova <marina.kolpakova@itseez.com>
Mon, 26 Nov 2012 13:53:25 +0000 (17:53 +0400)
committermarina.kolpakova <marina.kolpakova@itseez.com>
Mon, 26 Nov 2012 13:53:25 +0000 (17:53 +0400)
modules/gpu/src/cuda/icf-sc.cu
modules/gpu/src/icf.hpp
modules/gpu/src/softcascade.cpp
modules/gpu/test/test_softcascade.cpp

index 12b511f..e323799 100644 (file)
@@ -134,7 +134,8 @@ namespace icf {
         }
     }
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed)
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
+        PtrStepSzb suppressed, cudaStream_t stream)
     {
         int block = 192;
         int grid = 1;
@@ -146,7 +147,7 @@ namespace icf {
         overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
         collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
 
-        // if (!stream)
+        if (!stream)
         {
             cudaSafeCall( cudaGetLastError());
             cudaSafeCall( cudaDeviceSynchronize());
@@ -330,15 +331,15 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
 
 template<typename Policy>
 void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const
+    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
 {
     int fw = roi.rows;
     int fh = roi.cols;
 
     dim3 grid(fw, fh / Policy::STA_Y, downscales);
 
-    uint* ctr = (uint*)(counter.ptr(0));
-    Detection* det = (Detection*)objects.ptr();
+    uint* ctr = (uint*)(objects.ptr(0));
+    Detection* det = ((Detection*)objects.ptr(0)) + 1;
     uint max_det = objects.cols / sizeof(Detection);
 
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
@@ -363,7 +364,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
 }
 
 template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const;
+    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
 
 }
 }}}
\ No newline at end of file
index 454ac30..06f99f2 100644 (file)
@@ -147,7 +147,7 @@ struct CascadeInvoker
     int scales;
 
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const;
+        const int downscales, const cudaStream_t& stream = 0) const;
 
     template<bool isUp>
     __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
index 5324e2e..37e7e3f 100644 (file)
@@ -86,7 +86,8 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed);
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
+        PtrStepSzb suppressed, cudaStream_t stream);
 }
 
 namespace imgproc {
@@ -328,13 +329,20 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
     }
 
-    void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
+    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, Stream& s) const
     {
-        cudaMemset(count.data, 0, sizeof(Detection));
+        if (s)
+            s.enqueueMemSet(objects, 0);
+        else
+            cudaMemset(objects.data, 0, sizeof(Detection));
+
         cudaSafeCall( cudaGetLastError());
+
         device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
         = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
-        invoker(roi, hogluv, objects, count, downscales, stream);
+
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        invoker(roi, hogluv, objects, downscales, stream);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
@@ -356,6 +364,26 @@ struct cv::gpu::SCascade::Fields
         integrate(fh, fw, s);
     }
 
+    void suppress(GpuMat& objects, Stream& s)
+    {
+        GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
+
+        if (s)
+        {
+            s.enqueueMemSet(overlaps, 0);
+            s.enqueueMemSet(suppressed, 0);
+        }
+        else
+        {
+            overlaps.setTo(0);
+            suppressed.setTo(0);
+        }
+
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::icf::suppress(objects, overlaps, ndetections, suppressed, stream);
+    }
+
 private:
 
     typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
@@ -442,17 +470,7 @@ private:
         }
     }
 
-#include <iostream>
 public:
-    void suppress(GpuMat& ndetections, GpuMat& objects)
-    {
-        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
-        overlaps.setTo(0);
-        suppressed.setTo(0);
-
-        device::icf::suppress(objects, overlaps, ndetections, suppressed);
-        // std::cout << cv::Mat(overlaps) << std::endl;
-    }
 
     // scales range
     float minScale;
@@ -547,20 +565,18 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     }
     else
     {
-        colored.copyTo(flds.hogluv);
+        if (s)
+            s.enqueueCopy(colored, flds.hogluv);
+        else
+            colored.copyTo(flds.hogluv);
     }
 
-    GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
-
-    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
-    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    flds.detect(rois, tmp, objects, stream);
+    flds.detect(rois, objects, s);
 
     if (rejCriteria != NO_REJECT)
     {
-        flds.suppress(tmp, objects);
+        GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
+        flds.suppress(objects, s);
         flds.suppressed.copyTo(spr);
     }
 }
index e36c289..da97d41 100644 (file)
@@ -47,7 +47,7 @@
 using cv::gpu::GpuMat;
 
 // show detection results on input image with cv::imshow
-#define SHOW_DETECTIONS
+// #define SHOW_DETECTIONS
 
 #if defined SHOW_DETECTIONS
 # define SHOW(res)           \
@@ -99,21 +99,9 @@ namespace {
         return std::string(s);
     }
 
-    static std::string getImageName(int level)
-    {
-        time_t rawtime;
-        struct tm * timeinfo;
-        char buffer [80];
-
-        time ( &rawtime );
-        timeinfo = localtime ( &rawtime );
-
-        strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
-        return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
-    }
-
     static void print(std::ostream &out, const Detection& d)
     {
+#if defined SHOW_DETECTIONS
         out << "\x1b[32m[ detection]\x1b[0m ("
             << std::setw(4)  << d.x
             << " "
@@ -125,11 +113,32 @@ namespace {
             << ") "
             << std::setw(12) << d.confidence
             <<  std::endl;
+#else
+        (void)out; (void)d;
+#endif
     }
 
     static void printTotal(std::ostream &out, int detbytes)
     {
+#if defined SHOW_DETECTIONS
         out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
+#else
+        (void)out; (void)detbytes;
+#endif
+    }
+
+#if defined SHOW_DETECTIONS
+    static std::string getImageName(int level)
+    {
+        time_t rawtime;
+        struct tm * timeinfo;
+        char buffer [80];
+
+        time ( &rawtime );
+        timeinfo = localtime ( &rawtime );
+
+        strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
+        return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
     }
 
     static void writeResult(const cv::Mat& result, const int level)
@@ -138,6 +147,7 @@ namespace {
         cv::imwrite(path, result);
         std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
     }
+#endif
 }
 
 typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;