From 0cbf9eb22a264493da9ebf2e1101af60534cc12c Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sat, 10 Nov 2012 03:59:09 +0400 Subject: [PATCH] add support for CUDA streams --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 +- modules/gpu/perf/perf_softcascade.cpp | 42 ++++++++++++++++++- modules/gpu/src/cuda/integral_image.cu | 1 - modules/gpu/src/cuda/isf-sc.cu | 28 ++++++++----- modules/gpu/src/icf.hpp | 2 +- modules/gpu/src/softcascade.cpp | 72 +++++++++++++++++++-------------- modules/gpu/test/test_softcascade.cpp | 39 ++++++++++++++++++ 7 files changed, 140 insertions(+), 46 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 4fc6179..8f327f2 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1577,7 +1577,7 @@ public: virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const; - void genRoi(InputArray roi, OutputArray mask) const; + void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; private: diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index 1e62af8..3e82cc5 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -284,4 +284,44 @@ RUN_GPU(SCascadeTest, detectOnIntegral) SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SCascadeTest, detectOnIntegral) \ No newline at end of file +NO_CPU(SCascadeTest, detectOnIntegral) + +GPU_PERF_TEST_P(SCascadeTest, detectStream, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) +{ } + +RUN_GPU(SCascadeTest, detectStream) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois; + rois.setTo(1); + + cv::gpu::Stream s; + + cascade.genRoi(rois, trois, s); + + cascade.detect(colored, trois, objectBoxes, s); + + TEST_CYCLE() + { + cascade.detect(colored, trois, objectBoxes, s); + } + + cudaDeviceSynchronize(); + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectStream) \ No newline at end of file diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 5bd35bd..200960b 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -444,7 +444,6 @@ namespace cv { namespace gpu { namespace device } // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed - // ToDo: partial dy void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz buffer, PtrStepSz integral, int blockStep, cudaStream_t stream) { diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index ee9a9f6..0de2d8e 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -71,7 +71,7 @@ namespace icf { } void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, - const int fw, const int fh, const int bins) + const int fw, const int fh, const int bins, cudaStream_t stream ) { const uchar* mag = (const uchar*)hogluv.ptr(fh * bins); uchar* hog = (uchar*)hogluv.ptr(); @@ -80,9 +80,12 @@ namespace icf { dim3 block(32, 8); dim3 grid(fw / 32, fh / 8); - magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh); - cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh); + if (!stream) + { + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); + } } texture thogluv; @@ -305,7 +308,7 @@ namespace icf { template<> void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale) const + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const { int fw = 160; int fh = 120; @@ -325,22 +328,25 @@ namespace icf { if (scale == -1) { - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); } else { if (scale >= downscales) - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); else - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); } - cudaSafeCall( cudaGetLastError()); - cudaSafeCall( cudaDeviceSynchronize()); + if (!stream) + { + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } } } }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index d829012..60df558 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -139,7 +139,7 @@ struct CascadeInvoker const float* leaves; void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales, const int csale = -1) const; + PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const; }; } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 02481ed..fdde261 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -54,7 +54,7 @@ bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;} void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); } void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); } -void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); } +void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); } void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } @@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device { namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, - const int fw, const int fh, const int bins); + const int fw, const int fh, const int bins, cudaStream_t stream); } namespace imgproc { @@ -341,27 +341,30 @@ struct cv::gpu::SCascade::Fields } - void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const + void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const { cudaMemset(count.data, 0, sizeof(Detection)); cudaSafeCall( cudaGetLastError()); - invoker(roi, hogluv, objects, count, downscales, scale); + invoker(roi, hogluv, objects, count, downscales, scale, stream); } - void preprocess(const cv::gpu::GpuMat& colored) + void preprocess(const cv::gpu::GpuMat& colored, Stream& s) { - cudaMemset(plane.data, 0, plane.step * plane.rows); + if (s) + s.enqueueMemSet(plane, 0); + else + cudaMemset(plane.data, 0, plane.step * plane.rows); static const int fw = Fields::FRAME_WIDTH; static const int fh = Fields::FRAME_HEIGHT; GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh)); - cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); - createHogBins(gray); + cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s); + createHogBins(gray ,s); - createLuvBins(colored); + createLuvBins(colored, s); - integrate(); + integrate(s); } private: @@ -386,7 +389,7 @@ private: return res; } - void createHogBins(const cv::gpu::GpuMat& gray) + void createHogBins(const cv::gpu::GpuMat& gray, Stream& s) { static const int fw = Fields::FRAME_WIDTH; static const int fh = Fields::FRAME_HEIGHT; @@ -394,35 +397,38 @@ private: GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1); + cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s); + cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s); GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); - cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); + cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); // normolize magnitude to uchar interval and angles to 6 bins - GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag); - cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); + cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag, 1, -1, s); + cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); //create uchar magnitude GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh)); - nmag.convertTo(cmag, CV_8UC1); + if (s) + s.enqueueConvert(nmag, cmag, CV_8UC1); + else + nmag.convertTo(cmag, CV_8UC1); - device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS); + cudaStream_t stream = StreamAccessor::getStream(s); + device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS, stream); } - void createLuvBins(const cv::gpu::GpuMat& colored) + void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s) { static const int fw = Fields::FRAME_WIDTH; static const int fh = Fields::FRAME_HEIGHT; - cv::gpu::cvtColor(colored, luv, CV_BGR2Luv); + cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); std::vector splited; for(int i = 0; i < Fields::LUV_BINS; ++i) @@ -430,17 +436,18 @@ private: splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); } - cv::gpu::split(luv, splited); + cv::gpu::split(luv, splited, s); } - void integrate() + void integrate( Stream& s) { int fw = Fields::FRAME_WIDTH; int fh = Fields::FRAME_HEIGHT; GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS)); - cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0); + cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s); + cudaStream_t stream = StreamAccessor::getStream(s); + device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream); } public: @@ -482,6 +489,8 @@ public: GpuMat leaves; GpuMat levels; + GpuMat sobelBuf; + device::icf::CascadeInvoker invoker; enum { BOOST = 0 }; @@ -516,6 +525,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); + GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); + // we guess user knows about shrincage // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); @@ -525,14 +536,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ { // only this window size allowed CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); - flds.preprocess(colored); + flds.preprocess(colored, s); } else { colored.copyTo(flds.hogluv); } - GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); @@ -556,7 +566,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ { // only this window size allowed CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); - flds.preprocess(colored); + flds.preprocess(colored, s); } else { @@ -572,15 +582,15 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ flds.detect(level, rois, tmp, objects, stream); } -void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const +void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const { const GpuMat roi = _roi.getGpuMat(); _mask.create( roi.cols / 4, roi.rows / 4, roi.type() ); GpuMat mask = _mask.getGpuMat(); cv::gpu::GpuMat tmp; - cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - cv::gpu::transpose(tmp, mask); + cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream); + cv::gpu::transpose(tmp, mask, stream); } void cv::gpu::SCascade::read(const FileNode& fn) diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index f26c44f..cfae940 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -330,4 +330,43 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral, ASSERT_EQ( a ,1024); } + +GPU_TEST_P(SCascadeTestAll, detectStream, + ALL_DEVICES + ) +{ + cv::gpu::setDevice(GetParam().deviceID()); + std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; + cv::gpu::SCascade cascade; + + cv::FileStorage fs(xml, cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); + ASSERT_FALSE(coloredCpu.empty()); + + GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(0); + GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); + sub.setTo(cv::Scalar::all(1)); + + cv::gpu::Stream s; + + cv::gpu::GpuMat trois; + cascade.genRoi(rois, trois, s); + + cascade.detect(colored, trois, objectBoxes, s); + + cudaDeviceSynchronize(); + + typedef cv::gpu::SCascade::Detection Detection; + cv::Mat detections(objectBoxes); + int a = *(detections.ptr(0)); + ASSERT_EQ(a ,2460); +} + + #endif \ No newline at end of file -- 2.7.4