From 0898c3c651ef6196a2287d61ef9d6644b5653743 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 18:24:48 +0400
Subject: [PATCH] kernel policy

---
 modules/gpu/src/cuda/isf-sc.cu  | 62 +++++++++++------------------------------
 modules/gpu/src/icf.hpp         | 27 ++++++++++++++++++
 modules/gpu/src/softcascade.cpp | 44 ++++++-----------------------
 3 files changed, 52 insertions(+), 81 deletions(-)
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f74673c..74e47ba 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -303,21 +303,16 @@ namespace icf {
     }
 #endif
 
-    void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
+    template<>
+    void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
     {
         int fw = 160;
         int fh = 120;
 
         dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, downscales);
+        dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
 
-        const Level* l = (const Level*)levels.ptr();
-        const Octave* oct = ((const Octave*)octaves.ptr());
-        const float* st = (const float*)stages.ptr();
-        const Node* nd = (const Node*)nodes.ptr();
-        const float* lf = (const float*)leaves.ptr();
         uint* ctr = (uint*)counter.ptr();
         Detection* det = (Detection*)objects.ptr();
         uint max_det = objects.cols / sizeof(Detection);
@@ -328,44 +323,21 @@ namespace icf {
         cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
         cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
 
-        test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
-        cudaSafeCall( cudaGetLastError());
-
-        grid = dim3(fw, fh / 8, 47 - downscales);
-        test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
-        cudaSafeCall( cudaGetLastError());
-        cudaSafeCall( cudaDeviceSynchronize());
-    }
-
-    void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
-        const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
-    {
-        int fw = 160;
-        int fh = 120;
-
-        dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, 1);
-
-        const Level* l = (const Level*)levels.ptr();
-        const Octave* oct = ((const Octave*)octaves.ptr());
-        const float* st = (const float*)stages.ptr();
-        const Node* nd = (const Node*)nodes.ptr();
-        const float* lf = (const float*)leaves.ptr();
-        uint* ctr = (uint*)counter.ptr();
-        Detection* det = (Detection*)objects.ptr();
-        uint max_det = objects.cols / sizeof(Detection);
-
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
+        if (scale == -1)
+        {
+            test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
+            cudaSafeCall( cudaGetLastError());
 
-        if (scale >= downscales)
-            test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+            grid = dim3(fw, fh / 8, 47 - downscales);
+            test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
+        }
         else
-            test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+        {
+            if (scale >= downscales)
+                test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+            else
+                test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+        }
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index a103341..06c8114 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -124,6 +124,33 @@ struct __align__(16) Detection
     : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 
+struct CascadePolicy
+{
+    enum {STA_X = 32, STA_Y = 8};
+};
+
+template<typename Policy>
+struct CascadeInvoker
+{
+    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}
+    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
+                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
+    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
+       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+    {}
+
+    const Level*  levels;
+    const Octave* octaves;
+
+    const float*  stages;
+
+    const Node*   nodes;
+    const float*  leaves;
+
+    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter, const int downscales, const int csale = -1) const;
+};
+
 }
 }}}
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c93949f..f25c5a3 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -69,29 +69,6 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
-
-    void detect(const PtrStepSzb& rois,
-                const PtrStepSzb& levels,
-                const PtrStepSzb& octaves,
-                const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,
-                const PtrStepSzf& leaves,
-                const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects,
-                PtrStepSzi counter,
-                const int downscales);
-
-    void detectAtScale(const int scale,
-                       const PtrStepSzb& rois,
-                       const PtrStepSzb& levels,
-                       const PtrStepSzb& octaves,
-                       const PtrStepSzf& stages,
-                       const PtrStepSzb& nodes,
-                       const PtrStepSzf& leaves,
-                       const PtrStepSzi& hogluv,
-                       PtrStepSz<uchar4> objects,
-                       PtrStepSzi counter,
-                       const int downscales);
 }
 namespace imgproc
 {
@@ -150,6 +127,8 @@ struct cv::gpu::SoftCascade::Filds
 
     std::vector<float> scales;
 
+    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
+
     static const int shrinkage = 4;
 
     enum { BOOST = 0 };
@@ -166,17 +145,11 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
-    {
-        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
-    }
-
-    void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
-            detCounter, downscales);
+        // device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker(levels, octaves, stages, nodes, leaves);
+        invoker(roi, hogluv, objects, detCounter, downscales, scale);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored)
@@ -439,6 +412,8 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
 
+    invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+
     return true;
 }
 
@@ -569,10 +544,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     flds.preprocess(colored);
 
-    if (specificScale == -1)
-        flds.detect(rois,objects, 0);
-    else
-        flds.detectAtScale(specificScale, rois, objects, 0);
+    flds.detect(specificScale, rois, objects, 0);
 
     cv::Mat out(flds.detCounter);
     int ndetections = *(out.data);
-- 
2.7.4