From: marina.kolpakova <marina.kolpakova@itseez.com>
Date: Wed, 28 Nov 2012 11:40:00 +0000 (+0400)
Subject: optimize hog bin computing
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~4052^2~27
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0e1005ca92c2e5a979f4b9e92f8649ec8d02fd10;p=platform%2Fupstream%2Fopencv.git

optimize hog bin computing
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index bdb9f8c..7003c8f 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1534,6 +1534,28 @@ class CV_EXPORTS SCascade : public Algorithm
 {
 public:
 
+    enum { GENERIC = 1, SEPARABLE = 2};
+    class CV_EXPORTS Preprocessor
+    {
+    public:
+
+        // Appends specified number of HOG first-order features integrals into given vector.
+        // Param frame is an input 3-channel bgr image.
+        // Param channels is a GPU matrix of integrals.
+        // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution.
+        virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
+
+        // Creates a specific preprocessor implementation.
+        // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum
+        // Param bins is a number of HOG-like channels.
+        // Param method is a channel computing method.
+        static cv::Ptr<Preprocessor> create(const int shrinkage, const int bins, const int method = GENERIC);
+
+
+    protected:
+        Preprocessor();
+    };
+
     // Representation of detectors result.
     struct CV_EXPORTS Detection
     {
@@ -1576,9 +1598,6 @@ public:
     // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
 
-    // Preprocesing only
-    virtual void preprocess(InputArray image, OutputArray channels, Stream& stream = Stream::Null()) const;
-
     // Convert ROI matrix into the suitable for detect method.
     // Param roi is an input matrix of the same size as the image.
     //    There non zero value mean that detector should be executed in this point.
diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index f59b08e..812be7a 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -99,6 +99,7 @@ namespace icf {
         cudaSafeCall(cudaDeviceSynchronize());
     }
 
+    template<bool isDefaultNum>
     __device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy)
     {
         const float angle_quantum = M_PI / 6.f;
@@ -110,36 +111,82 @@ namespace icf {
         return static_cast<int>(angle * angle_scaling) % 6;
     }
 
-    texture<uchar,  cudaTextureType2D, cudaReadModeElementType> tgray;
-
-    __global__ void magnitude_d(PtrStepSzb mag)
+    template<>
+    __device__ __forceinline__ int fast_angle_bin<true>(const float& dy, const float& dx)
     {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int index = 0;
+
+        float max_dot = fabs(dx);
+
+        {
+            const float dot_product = fabs(dx * 0.8660254037844386f + dy * 0.5f);
 
-        const float dx_a = tex2D(tgray, x + 1, y),
-                    dx_b = tex2D(tgray, x - 1, y),
-                    dx = dx_a - dx_b,
+            if(dot_product > max_dot)
+            {
+                max_dot = dot_product;
+                index = 1;
+            }
+        }
+        {
+            const float dot_product = fabs(dy * 0.8660254037844386f + dx * 0.5f);
 
-                    dy_a = tex2D(tgray, x, y + 1),
-                    dy_b = tex2D(tgray, x, y - 1),
-                    dy = dy_a - dy_b;
+            if(dot_product > max_dot)
+            {
+                max_dot = dot_product;
+                index = 2;
+            }
+        }
+        {
+            int i = 3;
+            float2 bin_vector_i;
+            bin_vector_i.x = ::cos(i * (M_PI / 6.f));
+            bin_vector_i.y = ::sin(i * (M_PI / 6.f));
 
+            const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y);
+            if(dot_product > max_dot)
+            {
+                max_dot = dot_product;
+                index = i;
+            }
+        }
+        {
+            const float dot_product = fabs(dx * (-0.4999999999999998f) + dy * 0.8660254037844387f);
+            if(dot_product > max_dot)
+            {
+                max_dot = dot_product;
+                index = 4;
+            }
+        }
+        {
+            const float dot_product = fabs(dx * (-0.8660254037844387f) + dy * 0.49999999999999994f);
+            if(dot_product > max_dot)
+            {
+                max_dot = dot_product;
+                index = 5;
+            }
+        }
+        return index;
+    }
 
-        const float magnitude_scaling = 1.0f/ sqrtf(2);
+    texture<uchar,  cudaTextureType2D, cudaReadModeElementType> tgray;
 
-        const float magnitude = sqrtf((dx * dx) + (dy * dy)) * magnitude_scaling;
-        const uchar magnitude_u8 = static_cast<uchar>(magnitude);
+    template<bool isDefaultNum>
+    __global__ void gray2hog(PtrStepSzb mag)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        mag( 480 * 6 + y, x) = magnitude_u8;
+        const float dx = tex2D(tgray, x + 1, y + 0) - tex2D(tgray, x - 1, y - 0);
+        const float dy = tex2D(tgray, x + 0, y + 1) - tex2D(tgray, x - 0, y - 1);
 
-        int angle_channel_index;
+        const float magnitude = sqrtf((dx * dx) + (dy * dy)) * (1.0f / sqrtf(2));
+        const uchar cmag = static_cast<uchar>(magnitude);
 
-        angle_channel_index = fast_angle_bin(dy, dx);
-        mag( 480 * angle_channel_index + y, x) = magnitude_u8;
+        mag( 480 * 6 + y, x) = cmag;
+        mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
     }
 
-    void magnitude(const PtrStepSzb& gray, PtrStepSzb mag)
+    void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins)
     {
         dim3 block(32, 8);
         dim3 grid(gray.cols / 32, gray.rows / 8);
@@ -147,7 +194,10 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
         cudaSafeCall( cudaBindTexture2D(0, tgray, gray.data, desc, gray.cols, gray.rows, gray.step) );
 
-        magnitude_d<<<grid, block>>>(mag);
+        if (bins == 6)
+            gray2hog<true><<<grid, block>>>(mag);
+        else
+            gray2hog<false><<<grid, block>>>(mag);
 
         cudaSafeCall(cudaDeviceSynchronize());
     }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 1d32736..2c7d42b 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -57,6 +57,11 @@ void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_n
 
 void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 
+cv::gpu::SCascade::Preprocessor::Preprocessor() { throw_nogpu(); }
+
+void cv::gpu::SCascade::Preprocessor::create(const int, const int, const int) { throw_nogpu(); }
+
+
 #else
 
 #include <icf.hpp>
@@ -90,7 +95,7 @@ namespace icf {
         PtrStepSzb suppressed, cudaStream_t stream);
 
     void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv);
-    void magnitude(const PtrStepSzb& gray, PtrStepSzb mag);
+    void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins);
 }
 
 namespace imgproc {
@@ -609,34 +614,97 @@ void cv::gpu::SCascade::read(const FileNode& fn)
     Algorithm::read(fn);
 }
 
+// namespace {
+
+// void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& luv /*integral*/)
+// {
+//     cv::gpu::GpuMat bgr;
+//     cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1);
+
+//     cv::gpu::GpuMat gray, /*luv,*/ shrunk, buffer;
+//     luv.create(bgr.rows * 10, bgr.cols, CV_8UC1);
+//     luv.setTo(0);
+
+//     cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
+//     cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7)));
+
+//     cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3));
+//     cv::gpu::device::icf::bgr2Luv(bgr, __luv);
+
+//     // cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA);
+//     // cv::gpu::integralBuffered(shrunk, integral, buffer);
+// }
+// }
+
 namespace {
 
-void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& integral)
+using cv::InputArray;
+using cv::OutputArray;
+using cv::gpu::Stream;
+using cv::gpu::GpuMat;
+
+struct GenricPreprocessor : public cv::gpu::SCascade::Preprocessor
 {
-    cv::gpu::GpuMat bgr;
-    cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1);
+    GenricPreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {}
 
-    cv::gpu::GpuMat gray, luv, shrunk, buffer;
-    luv.create(bgr.rows * 10, bgr.cols, CV_8UC1);
-    luv.setTo(0);
+    virtual void apply(InputArray /*frame*/, OutputArray /*channels*/, Stream& /*s*/ = Stream::Null())
+    {
 
-    cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
-    cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7)));
+    }
 
-    cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3));
-    cv::gpu::device::icf::bgr2Luv(bgr, __luv);
+private:
+    const int shrinkage;
+    const int bins;
+};
 
-    cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA);
-    cv::gpu::integralBuffered(shrunk, integral, buffer);
-}
+inline void setZero(cv::gpu::GpuMat& m, Stream& s)
+{
+    if (s)
+        s.enqueueMemSet(m, 0);
+    else
+        m.setTo(0);
 }
 
-void cv::gpu::SCascade::preprocess(InputArray _bgr, OutputArray _channels, Stream& stream) const
+struct SeparablePreprocessor : public cv::gpu::SCascade::Preprocessor
 {
-    CV_Assert(fields);
-    (void)stream;
-    const GpuMat bgr = _bgr.getGpuMat(), channels = _channels.getGpuMat();
+    SeparablePreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {}
+
+    virtual void apply(InputArray _frame, OutputArray _channels, Stream& s = Stream::Null())
+    {
+        const GpuMat frame = _frame.getGpuMat();
+        cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
+
+        _channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
+        GpuMat channels = _channels.getGpuMat();
+        setZero(channels, s);
+
+        cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
+        cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
+
+        cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
+        cv::gpu::device::icf::bgr2Luv(bgr, luv);
+    }
+
+private:
+    const int shrinkage;
+    const int bins;
+
+    GpuMat bgr;
+    GpuMat gray;
+};
+
 }
 
+cv::gpu::SCascade::Preprocessor::Preprocessor(){}
+
+cv::Ptr<cv::gpu::SCascade::Preprocessor> cv::gpu::SCascade::Preprocessor::create(const int s, const int b, const int m)
+{
+    CV_Assert(m == SEPARABLE || m == GENERIC);
+
+    if (m == GENERIC)
+        return cv::Ptr<cv::gpu::SCascade::Preprocessor>(new GenricPreprocessor(s, b));
+
+    return cv::Ptr<cv::gpu::SCascade::Preprocessor>(new SeparablePreprocessor(s, b));
+}
 
 #endif
\ No newline at end of file