From: Vladislav Vinogradov Date: Tue, 30 Dec 2014 12:37:14 +0000 (+0300) Subject: added stream parameter to all cudaimgproc routines X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~2746^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f50a0612254be1116140886ba733fa4861737038;p=platform%2Fupstream%2Fopencv.git added stream parameter to all cudaimgproc routines --- diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp index 7aa74aa..52bfcef 100644 --- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp +++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp @@ -240,8 +240,9 @@ CV_EXPORTS Ptr createCLAHE(double clipLimit = 40.0, Size tileGridSi @param nLevels Number of computed levels. nLevels must be at least 2. @param lowerLevel Lower boundary value of the lowest level. @param upperLevel Upper boundary value of the greatest level. +@param stream Stream for the asynchronous version. */ -CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel); +CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); /** @brief Calculates a histogram with evenly distributed bins. @@ -281,15 +282,17 @@ public: /** @brief Finds edges in an image using the @cite Canny86 algorithm. @param image Single-channel 8-bit input image. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray edges) = 0; + virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0; /** @overload @param dx First derivative of image in the vertical direction. Support only CV_32S type. @param dy First derivative of image in the horizontal direction. Support only CV_32S type. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0; + virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0; virtual void setLowThreshold(double low_thresh) = 0; virtual double getLowThreshold() const = 0; @@ -336,18 +339,20 @@ public: \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of the image). \f$\theta\f$ is the line rotation angle in radians ( \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ). + @param stream Stream for the asynchronous version. @sa HoughLines */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory. @param d_lines Result of cuda::HoughLinesDetector::detect . @param h_lines Output host array. @param h_votes Optional output array for line's votes. + @param stream Stream for the asynchronous version. */ - virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0; + virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -391,10 +396,11 @@ public: @param lines Output vector of lines. Each line is represented by a 4-element vector \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected line segment. + @param stream Stream for the asynchronous version. @sa HoughLinesP */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -435,10 +441,11 @@ public: @param src 8-bit, single-channel grayscale input image. @param circles Output vector of found circles. Each vector is encoded as a 3-element floating-point vector \f$(x, y, radius)\f$ . + @param stream Stream for the asynchronous version. @sa HoughCircles */ - virtual void detect(InputArray src, OutputArray circles) = 0; + virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0; virtual void setDp(float dp) = 0; virtual float getDp() const = 0; @@ -553,8 +560,9 @@ public: positions). @param mask Optional region of interest. If the image is not empty (it needs to have the type CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0; + virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0; }; /** @brief Creates implementation for cuda::CornersDetector . @@ -590,7 +598,7 @@ as src . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. It maps each point of the source image into another point. As a result, you have a new color and new position of each point. @@ -610,7 +618,7 @@ src size. The type is CV_16SC2 . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. @sa cuda::meanShiftFiltering */ @@ -626,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts @param sr Color window radius. @param minsize Minimum segment size. Smaller segments are merged. @param criteria Termination criteria. See TermCriteria. +@param stream Stream for the asynchronous version. */ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), + Stream& stream = Stream::Null()); /////////////////////////// Match Template //////////////////////////// diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp index eed4a28..1e52bd2 100644 --- a/modules/cudaimgproc/src/canny.cpp +++ b/modules/cudaimgproc/src/canny.cpp @@ -53,16 +53,16 @@ Ptr cv::cuda::createCannyEdgeDetector(double, double, int, bo namespace canny { - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream); - void edgesHysteresisLocal(PtrStepSzi map, short2* st1); + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream); - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2); + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream); - void getEdges(PtrStepSzi map, PtrStepSzb dst); + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream); } namespace @@ -76,8 +76,8 @@ namespace old_apperture_size_ = -1; } - void detect(InputArray image, OutputArray edges); - void detect(InputArray dx, InputArray dy, OutputArray edges); + void detect(InputArray image, OutputArray edges, Stream& stream); + void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream); void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; } double getLowThreshold() const { return low_thresh_; } @@ -111,7 +111,7 @@ namespace private: void createBuf(Size image_size); - void CannyCaller(GpuMat& edges); + void CannyCaller(GpuMat& edges, Stream& stream); double low_thresh_; double high_thresh_; @@ -128,7 +128,7 @@ namespace int old_apperture_size_; }; - void CannyImpl::detect(InputArray _image, OutputArray _edges) + void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream) { GpuMat image = _image.getGpuMat(); @@ -150,24 +150,24 @@ namespace image.locateROI(wholeSize, ofs); GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step); - canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); } else { #ifndef HAVE_OPENCV_CUDAFILTERS throw_no_cuda(); #else - filterDX_->apply(image, dx_); - filterDY_->apply(image, dy_); + filterDX_->apply(image, dx_, stream); + filterDY_->apply(image, dy_, stream); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); #endif } - CannyCaller(edges); + CannyCaller(edges, stream); } - void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges) + void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream) { GpuMat dx = _dx.getGpuMat(); GpuMat dy = _dy.getGpuMat(); @@ -176,8 +176,8 @@ namespace CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() ); CV_Assert( deviceSupports(SHARED_ATOMICS) ); - dx.copyTo(dx_); - dy.copyTo(dy_); + dx.copyTo(dx_, stream); + dy.copyTo(dy_, stream); if (low_thresh_ > high_thresh_) std::swap(low_thresh_, high_thresh_); @@ -187,9 +187,9 @@ namespace _edges.create(dx.size(), CV_8UC1); GpuMat edges = _edges.getGpuMat(); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); - CannyCaller(edges); + CannyCaller(edges, stream); } void CannyImpl::createBuf(Size image_size) @@ -215,16 +215,16 @@ namespace ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_); } - void CannyImpl::CannyCaller(GpuMat& edges) + void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream) { map_.setTo(Scalar::all(0)); - canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_)); + canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_), StreamAccessor::getStream(stream)); - canny::edgesHysteresisLocal(map_, st1_.ptr()); + canny::edgesHysteresisLocal(map_, st1_.ptr(), StreamAccessor::getStream(stream)); - canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr()); + canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr(), StreamAccessor::getStream(stream)); - canny::getEdges(map_, edges); + canny::getEdges(map_, edges, StreamAccessor::getStream(stream)); } } diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index 3d770e1..e0ba515 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -120,7 +120,7 @@ namespace canny mag(y, x) = norm(dxVal, dyVal); } - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); @@ -131,30 +131,31 @@ namespace canny if (L2Grad) { L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } else { L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { if (L2Grad) { L2 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } else { L1 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } } } @@ -217,17 +218,18 @@ namespace canny map(y, x) = edge_type; } - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); bindTexture(&tex_mag, mag); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -328,20 +330,21 @@ namespace canny } } - void edgesHysteresisLocal(PtrStepSzi map, short2* st1) + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(16, 16); const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - edgesHysteresisLocalKernel<<>>(map, st1); + edgesHysteresisLocalKernel<<>>(map, st1); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -441,27 +444,30 @@ namespace canny } } - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2) + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); while (count > 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(128); const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); + edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); count = min(count, map.cols * map.rows); @@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device namespace canny { - void getEdges(PtrStepSzi map, PtrStepSzb dst) + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream) { - transform(map, dst, GetEdges(), WithOutMask(), 0); + transform(map, dst, GetEdges(), WithOutMask(), stream); } } diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 2436650..162ee46 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -68,7 +68,7 @@ namespace GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance, int blockSize, bool useHarrisDetector, double harrisK); - void detect(InputArray image, OutputArray corners, InputArray mask = noArray()); + void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream); private: int maxCorners_; @@ -96,8 +96,11 @@ namespace cuda::createMinEigenValCorner(srcType, blockSize, 3); } - void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask) + void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::gfft; GpuMat image = _image.getGpuMat(); diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index a965242..e942e9e 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -53,7 +53,7 @@ void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); cv::Ptr cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr(); } -void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); } +void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); } @@ -460,7 +460,7 @@ namespace }; } -void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel) +void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream) { const int kind = _levels.kind(); @@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr(), nLevels, lowerLevel, upperLevel) ); if (kind == _InputArray::CUDA_GPU_MAT) - _levels.getGpuMatRef().upload(host_levels); + _levels.getGpuMatRef().upload(host_levels, stream); } namespace hist diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp index 3f9b933..6bdaf16 100644 --- a/modules/cudaimgproc/src/hough_circles.cpp +++ b/modules/cudaimgproc/src/hough_circles.cpp @@ -74,7 +74,7 @@ namespace public: HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles); - void detect(InputArray src, OutputArray circles); + void detect(InputArray src, OutputArray circles, Stream& stream); void setDp(float dp) { dp_ = dp; } float getDp() const { return dp_; } @@ -154,8 +154,11 @@ namespace filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1); } - void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles) + void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_circles; diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp index b9f159a..7b9c082 100644 --- a/modules/cudaimgproc/src/hough_lines.cpp +++ b/modules/cudaimgproc/src/hough_lines.cpp @@ -75,8 +75,8 @@ namespace { } - void detect(InputArray src, OutputArray lines); - void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); + void detect(InputArray src, OutputArray lines, Stream& stream); + void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -125,8 +125,11 @@ namespace GpuMat result_; }; - void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; @@ -170,7 +173,7 @@ namespace result_.copyTo(lines); } - void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes) + void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream) { GpuMat d_lines = _d_lines.getGpuMat(); @@ -184,12 +187,18 @@ namespace CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 ); - d_lines.row(0).download(h_lines); + if (stream) + d_lines.row(0).download(h_lines, stream); + else + d_lines.row(0).download(h_lines); if (h_votes.needed()) { GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr(1)); - d_votes.download(h_votes); + if (stream) + d_votes.download(h_votes, stream); + else + d_votes.download(h_votes); } } } diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp index 2434f6d..e3e34ec 100644 --- a/modules/cudaimgproc/src/hough_segments.cpp +++ b/modules/cudaimgproc/src/hough_segments.cpp @@ -79,7 +79,7 @@ namespace { } - void detect(InputArray src, OutputArray lines); + void detect(InputArray src, OutputArray lines, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -128,8 +128,11 @@ namespace GpuMat result_; }; - void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; using namespace cv::cuda::device::hough_segments; diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp index ad58198..54926f3 100644 --- a/modules/cudaimgproc/src/mssegmentation.cpp +++ b/modules/cudaimgproc/src/mssegmentation.cpp @@ -43,7 +43,7 @@ #if !defined HAVE_CUDA || defined(CUDA_DISABLER) -void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); } +void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); } #else @@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs) } // anonymous namespace -void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria) +void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream) { GpuMat src = _src.getGpuMat(); @@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, // Perform mean shift procedure and obtain region and spatial maps GpuMat d_rmap, d_spmap; - cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria); + cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream); + + stream.waitForCompletion(); + Mat rmap(d_rmap); Mat spmap(d_spmap);