From bd3179bda8035f3f349678aad2d1ae0510c52870 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 26 Nov 2012 17:53:25 +0400 Subject: [PATCH] fix CUDA support for streams for NMS; refactor tests --- modules/gpu/src/cuda/icf-sc.cu | 13 ++++---- modules/gpu/src/icf.hpp | 2 +- modules/gpu/src/softcascade.cpp | 62 ++++++++++++++++++++++------------- modules/gpu/test/test_softcascade.cpp | 38 +++++++++++++-------- 4 files changed, 71 insertions(+), 44 deletions(-) diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu index 12b511f..e323799 100644 --- a/modules/gpu/src/cuda/icf-sc.cu +++ b/modules/gpu/src/cuda/icf-sc.cu @@ -134,7 +134,8 @@ namespace icf { } } - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed) + void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, + PtrStepSzb suppressed, cudaStream_t stream) { int block = 192; int grid = 1; @@ -146,7 +147,7 @@ namespace icf { overlap<<>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0)); collect<<>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1); - // if (!stream) + if (!stream) { cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); @@ -330,15 +331,15 @@ __global__ void soft_cascade(const CascadeInvoker invoker, Detection* ob template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const + PtrStepSz objects, const int downscales, const cudaStream_t& stream) const { int fw = roi.rows; int fh = roi.cols; dim3 grid(fw, fh / Policy::STA_Y, downscales); - uint* ctr = (uint*)(counter.ptr(0)); - Detection* det = (Detection*)objects.ptr(); + uint* ctr = (uint*)(objects.ptr(0)); + Detection* det = ((Detection*)objects.ptr(0)) + 1; uint max_det = objects.cols / sizeof(Detection); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); @@ -363,7 +364,7 @@ void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& } template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const; + PtrStepSz objects, const int downscales, const cudaStream_t& stream) const; } }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 454ac30..06f99f2 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -147,7 +147,7 @@ struct CascadeInvoker int scales; void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const; + const int downscales, const cudaStream_t& stream = 0) const; template __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 5324e2e..37e7e3f 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -86,7 +86,8 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins, cudaStream_t stream); - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed); + void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, + PtrStepSzb suppressed, cudaStream_t stream); } namespace imgproc { @@ -328,13 +329,20 @@ struct cv::gpu::SCascade::Fields leaves.upload(hleaves); } - void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const + void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, Stream& s) const { - cudaMemset(count.data, 0, sizeof(Detection)); + if (s) + s.enqueueMemSet(objects, 0); + else + cudaMemset(objects.data, 0, sizeof(Detection)); + cudaSafeCall( cudaGetLastError()); + device::icf::CascadeInvoker invoker = device::icf::CascadeInvoker(levels, stages, nodes, leaves); - invoker(roi, hogluv, objects, count, downscales, stream); + + cudaStream_t stream = StreamAccessor::getStream(s); + invoker(roi, hogluv, objects, downscales, stream); } void preprocess(const cv::gpu::GpuMat& colored, Stream& s) @@ -356,6 +364,26 @@ struct cv::gpu::SCascade::Fields integrate(fh, fw, s); } + void suppress(GpuMat& objects, Stream& s) + { + GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); + ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps); + + if (s) + { + s.enqueueMemSet(overlaps, 0); + s.enqueueMemSet(suppressed, 0); + } + else + { + overlaps.setTo(0); + suppressed.setTo(0); + } + + cudaStream_t stream = StreamAccessor::getStream(s); + device::icf::suppress(objects, overlaps, ndetections, suppressed, stream); + } + private: typedef std::vector::const_iterator octIt_t; @@ -442,17 +470,7 @@ private: } } -#include public: - void suppress(GpuMat& ndetections, GpuMat& objects) - { - ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps); - overlaps.setTo(0); - suppressed.setTo(0); - - device::icf::suppress(objects, overlaps, ndetections, suppressed); - // std::cout << cv::Mat(overlaps) << std::endl; - } // scales range float minScale; @@ -547,20 +565,18 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ } else { - colored.copyTo(flds.hogluv); + if (s) + s.enqueueCopy(colored, flds.hogluv); + else + colored.copyTo(flds.hogluv); } - GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); - - GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); - objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); - cudaStream_t stream = StreamAccessor::getStream(s); - - flds.detect(rois, tmp, objects, stream); + flds.detect(rois, objects, s); if (rejCriteria != NO_REJECT) { - flds.suppress(tmp, objects); + GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); + flds.suppress(objects, s); flds.suppressed.copyTo(spr); } } diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index e36c289..da97d41 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -47,7 +47,7 @@ using cv::gpu::GpuMat; // show detection results on input image with cv::imshow -#define SHOW_DETECTIONS +// #define SHOW_DETECTIONS #if defined SHOW_DETECTIONS # define SHOW(res) \ @@ -99,21 +99,9 @@ namespace { return std::string(s); } - static std::string getImageName(int level) - { - time_t rawtime; - struct tm * timeinfo; - char buffer [80]; - - time ( &rawtime ); - timeinfo = localtime ( &rawtime ); - - strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo); - return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; - } - static void print(std::ostream &out, const Detection& d) { +#if defined SHOW_DETECTIONS out << "\x1b[32m[ detection]\x1b[0m (" << std::setw(4) << d.x << " " @@ -125,11 +113,32 @@ namespace { << ") " << std::setw(12) << d.confidence << std::endl; +#else + (void)out; (void)d; +#endif } static void printTotal(std::ostream &out, int detbytes) { +#if defined SHOW_DETECTIONS out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl; +#else + (void)out; (void)detbytes; +#endif + } + +#if defined SHOW_DETECTIONS + static std::string getImageName(int level) + { + time_t rawtime; + struct tm * timeinfo; + char buffer [80]; + + time ( &rawtime ); + timeinfo = localtime ( &rawtime ); + + strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo); + return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; } static void writeResult(const cv::Mat& result, const int level) @@ -138,6 +147,7 @@ namespace { cv::imwrite(path, result); std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl; } +#endif } typedef ::testing::TestWithParam > SCascadeTestRoi; -- 2.7.4