From 0e1005ca92c2e5a979f4b9e92f8649ec8d02fd10 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 28 Nov 2012 15:40:00 +0400 Subject: [PATCH] optimize hog bin computing --- modules/gpu/include/opencv2/gpu/gpu.hpp | 25 +++++++- modules/gpu/src/cuda/icf-sc.cu | 90 +++++++++++++++++++++------ modules/gpu/src/softcascade.cpp | 104 ++++++++++++++++++++++++++------ 3 files changed, 178 insertions(+), 41 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index bdb9f8c..7003c8f 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1534,6 +1534,28 @@ class CV_EXPORTS SCascade : public Algorithm { public: + enum { GENERIC = 1, SEPARABLE = 2}; + class CV_EXPORTS Preprocessor + { + public: + + // Appends specified number of HOG first-order features integrals into given vector. + // Param frame is an input 3-channel bgr image. + // Param channels is a GPU matrix of integrals. + // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution. + virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0; + + // Creates a specific preprocessor implementation. + // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum + // Param bins is a number of HOG-like channels. + // Param method is a channel computing method. + static cv::Ptr create(const int shrinkage, const int bins, const int method = GENERIC); + + + protected: + Preprocessor(); + }; + // Representation of detectors result. struct CV_EXPORTS Detection { @@ -1576,9 +1598,6 @@ public: // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; - // Preprocesing only - virtual void preprocess(InputArray image, OutputArray channels, Stream& stream = Stream::Null()) const; - // Convert ROI matrix into the suitable for detect method. // Param roi is an input matrix of the same size as the image. // There non zero value mean that detector should be executed in this point. diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu index f59b08e..812be7a 100644 --- a/modules/gpu/src/cuda/icf-sc.cu +++ b/modules/gpu/src/cuda/icf-sc.cu @@ -99,6 +99,7 @@ namespace icf { cudaSafeCall(cudaDeviceSynchronize()); } + template __device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy) { const float angle_quantum = M_PI / 6.f; @@ -110,36 +111,82 @@ namespace icf { return static_cast(angle * angle_scaling) % 6; } - texture tgray; - - __global__ void magnitude_d(PtrStepSzb mag) + template<> + __device__ __forceinline__ int fast_angle_bin(const float& dy, const float& dx) { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + int index = 0; + + float max_dot = fabs(dx); + + { + const float dot_product = fabs(dx * 0.8660254037844386f + dy * 0.5f); - const float dx_a = tex2D(tgray, x + 1, y), - dx_b = tex2D(tgray, x - 1, y), - dx = dx_a - dx_b, + if(dot_product > max_dot) + { + max_dot = dot_product; + index = 1; + } + } + { + const float dot_product = fabs(dy * 0.8660254037844386f + dx * 0.5f); - dy_a = tex2D(tgray, x, y + 1), - dy_b = tex2D(tgray, x, y - 1), - dy = dy_a - dy_b; + if(dot_product > max_dot) + { + max_dot = dot_product; + index = 2; + } + } + { + int i = 3; + float2 bin_vector_i; + bin_vector_i.x = ::cos(i * (M_PI / 6.f)); + bin_vector_i.y = ::sin(i * (M_PI / 6.f)); + const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y); + if(dot_product > max_dot) + { + max_dot = dot_product; + index = i; + } + } + { + const float dot_product = fabs(dx * (-0.4999999999999998f) + dy * 0.8660254037844387f); + if(dot_product > max_dot) + { + max_dot = dot_product; + index = 4; + } + } + { + const float dot_product = fabs(dx * (-0.8660254037844387f) + dy * 0.49999999999999994f); + if(dot_product > max_dot) + { + max_dot = dot_product; + index = 5; + } + } + return index; + } - const float magnitude_scaling = 1.0f/ sqrtf(2); + texture tgray; - const float magnitude = sqrtf((dx * dx) + (dy * dy)) * magnitude_scaling; - const uchar magnitude_u8 = static_cast(magnitude); + template + __global__ void gray2hog(PtrStepSzb mag) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - mag( 480 * 6 + y, x) = magnitude_u8; + const float dx = tex2D(tgray, x + 1, y + 0) - tex2D(tgray, x - 1, y - 0); + const float dy = tex2D(tgray, x + 0, y + 1) - tex2D(tgray, x - 0, y - 1); - int angle_channel_index; + const float magnitude = sqrtf((dx * dx) + (dy * dy)) * (1.0f / sqrtf(2)); + const uchar cmag = static_cast(magnitude); - angle_channel_index = fast_angle_bin(dy, dx); - mag( 480 * angle_channel_index + y, x) = magnitude_u8; + mag( 480 * 6 + y, x) = cmag; + mag( 480 * fast_angle_bin(dy, dx) + y, x) = cmag; } - void magnitude(const PtrStepSzb& gray, PtrStepSzb mag) + void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins) { dim3 block(32, 8); dim3 grid(gray.cols / 32, gray.rows / 8); @@ -147,7 +194,10 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, tgray, gray.data, desc, gray.cols, gray.rows, gray.step) ); - magnitude_d<<>>(mag); + if (bins == 6) + gray2hog<<>>(mag); + else + gray2hog<<>>(mag); cudaSafeCall(cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 1d32736..2c7d42b 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -57,6 +57,11 @@ void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_n void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } +cv::gpu::SCascade::Preprocessor::Preprocessor() { throw_nogpu(); } + +void cv::gpu::SCascade::Preprocessor::create(const int, const int, const int) { throw_nogpu(); } + + #else #include @@ -90,7 +95,7 @@ namespace icf { PtrStepSzb suppressed, cudaStream_t stream); void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv); - void magnitude(const PtrStepSzb& gray, PtrStepSzb mag); + void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins); } namespace imgproc { @@ -609,34 +614,97 @@ void cv::gpu::SCascade::read(const FileNode& fn) Algorithm::read(fn); } +// namespace { + +// void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& luv /*integral*/) +// { +// cv::gpu::GpuMat bgr; +// cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1); + +// cv::gpu::GpuMat gray, /*luv,*/ shrunk, buffer; +// luv.create(bgr.rows * 10, bgr.cols, CV_8UC1); +// luv.setTo(0); + +// cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); +// cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7))); + +// cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3)); +// cv::gpu::device::icf::bgr2Luv(bgr, __luv); + +// // cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA); +// // cv::gpu::integralBuffered(shrunk, integral, buffer); +// } +// } + namespace { -void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& integral) +using cv::InputArray; +using cv::OutputArray; +using cv::gpu::Stream; +using cv::gpu::GpuMat; + +struct GenricPreprocessor : public cv::gpu::SCascade::Preprocessor { - cv::gpu::GpuMat bgr; - cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1); + GenricPreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {} - cv::gpu::GpuMat gray, luv, shrunk, buffer; - luv.create(bgr.rows * 10, bgr.cols, CV_8UC1); - luv.setTo(0); + virtual void apply(InputArray /*frame*/, OutputArray /*channels*/, Stream& /*s*/ = Stream::Null()) + { - cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); - cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7))); + } - cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3)); - cv::gpu::device::icf::bgr2Luv(bgr, __luv); +private: + const int shrinkage; + const int bins; +}; - cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA); - cv::gpu::integralBuffered(shrunk, integral, buffer); -} +inline void setZero(cv::gpu::GpuMat& m, Stream& s) +{ + if (s) + s.enqueueMemSet(m, 0); + else + m.setTo(0); } -void cv::gpu::SCascade::preprocess(InputArray _bgr, OutputArray _channels, Stream& stream) const +struct SeparablePreprocessor : public cv::gpu::SCascade::Preprocessor { - CV_Assert(fields); - (void)stream; - const GpuMat bgr = _bgr.getGpuMat(), channels = _channels.getGpuMat(); + SeparablePreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {} + + virtual void apply(InputArray _frame, OutputArray _channels, Stream& s = Stream::Null()) + { + const GpuMat frame = _frame.getGpuMat(); + cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0); + + _channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); + GpuMat channels = _channels.getGpuMat(); + setZero(channels, s); + + cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); + cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); + + cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3)); + cv::gpu::device::icf::bgr2Luv(bgr, luv); + } + +private: + const int shrinkage; + const int bins; + + GpuMat bgr; + GpuMat gray; +}; + } +cv::gpu::SCascade::Preprocessor::Preprocessor(){} + +cv::Ptr cv::gpu::SCascade::Preprocessor::create(const int s, const int b, const int m) +{ + CV_Assert(m == SEPARABLE || m == GENERIC); + + if (m == GENERIC) + return cv::Ptr(new GenricPreprocessor(s, b)); + + return cv::Ptr(new SeparablePreprocessor(s, b)); +} #endif \ No newline at end of file -- 2.7.4