From a743eca076c6a6c1b35d7cbe8ff9258dad5b7a6f Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Tue, 26 Jun 2012 12:15:19 +0000
Subject: [PATCH] LBP features: GPU representation

---
 modules/gpu/include/opencv2/gpu/gpu.hpp    |  1 +
 modules/gpu/src/cascadeclassifier.cpp      | 48 ++++++++++++++++++++++--------
 modules/gpu/src/cuda/lbp.cu                | 33 ++++++++++++--------
 modules/gpu/src/opencv2/gpu/device/lbp.hpp | 43 ++++++++++++++++++++++++--
 4 files changed, 98 insertions(+), 27 deletions(-)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 3bca642..aa3e368 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1460,6 +1460,7 @@ private:
     GpuMat nodes_mat;
     GpuMat leaves_mat;
     GpuMat subsets_mat;
+    GpuMat features_mat;
 
     // current integral image
     GpuMat integral;
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 9abc453..e959342 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -99,10 +99,9 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()
 
 void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size desired)
 {
-    integral.create(desired.width + 1, desired.height + 1, CV_32FC1);
+    integral.create(desired.width + 1, desired.height + 1, CV_32SC1);
 }
 
-
 bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const
 {
     return stage_mat.empty();
@@ -132,6 +131,8 @@ bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string& classifierAsXml)
 #define GPU_CC_WEAK_CLASSIFIERS     "weakClassifiers"
 #define GPU_CC_INTERNAL_NODES       "internalNodes"
 #define GPU_CC_LEAF_VALUES          "leafValues"
+#define GPU_CC_FEATURES             "features"
+#define GPU_CC_RECT                 "rect"
 
 bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
 {
@@ -225,6 +226,22 @@ bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
                 cl_leaves.push_back((float)*iIt);
         }
     }
+    fn = root[GPU_CC_FEATURES];
+    if( fn.empty() )
+        return false;
+    std::vector<char> features;
+    features.reserve(fn.size() * 4);
+    FileNodeIterator f_it = fn.begin(), f_end = fn.end();
+    for (; f_it != f_end; ++f_it)
+    {
+        FileNode rect = fn[GPU_CC_RECT];
+        FileNodeIterator r_it = rect.begin();
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+    }
+
     // copy data structures on gpu
     stage_mat = cv::gpu::GpuMat(1, (int)stages.size() * sizeof(Stage), CV_8UC1);
     stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, &(stages[0]) ));
@@ -241,6 +258,9 @@ bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
     subsets_mat = cv::gpu::GpuMat(1, (int)subsets.size(), CV_32SC1);
     stage_mat.upload(cv::Mat(subsets));
 
+    features_mat = cv::gpu::GpuMat(1, (int)features.size(), CV_8UC1);
+    features_mat.upload(cv::Mat(features));
+
     return true;
 }
 
@@ -270,22 +290,25 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
+        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db features,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
     }
 }}}
 
 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, double scaleFactor, int minNeighbors /*, Size minSize=Size()*/)
 {
     CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
-    CV_Assert(empty());
+    CV_Assert(!empty());
 
     const int defaultObjSearchNum = 100;
 
-    if( !objects.empty() && objects.depth() == CV_32S)
-        objects.reshape(4, 1);
-    else
-        objects.create(1 , defaultObjSearchNum, CV_32SC4);
+    // if( !objects.empty() && objects.depth() == CV_32S)
+    //     objects.reshape(4, 1);
+    // else
+    //     objects.create(1 , defaultObjSearchNum, CV_32SC4);
+
+    // temp solution
+    objects.create(image.rows, image.cols, CV_32SC4);
 
     scaledImageBuffer.create(image.size(), image.type());
 
@@ -302,14 +325,13 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         // TODO: min max object sizes cheching
         cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, INTER_NEAREST);
         //prepare image for evaluation
-        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32FC1);
+        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1);
         cv::gpu::integral(scaledImageBuffer, integral);
 
         int step = (factor <= 2.) + 1;
-        int stripCount = 1, stripSize = processingRectSize.height;
 
-        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat,
-         integral, processingRectSize.width, processingRectSize.height, step, subsetSize, objects, minNeighbors);
+        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat, features_mat,
+         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, minNeighbors);
     }
     // TODO: reject levels
 
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index 61f67d4..888bf3b 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -46,13 +46,14 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, float step, int subsetSize, DevMem2D_<int4> objects)
+        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2D_< ::cv::gpu::device::ClNode> nodes,
+            const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2D_<uchar4> features, const DevMem2Di integral, float step, int subsetSize, DevMem2D_<int4> objects, float scale, int clWidth, int clHeight)
         {
-            unsigned int x = threadIdx.x;
-            unsigned int y = blockIdx.x;
+            unsigned int x = threadIdx.x * step;
+            unsigned int y = blockIdx.x  * step;
             int nodeOfs = 0, leafOfs = 0;
-            ::cv::gpu::device::Feature feature;
+            ::cv::gpu::device::Feature evaluator;
 
             for (int s = 0; s < stages.cols; s++ )
             {
@@ -61,7 +62,9 @@ namespace cv { namespace gpu { namespace device
                 for (int w = 0; w < stage.ntrees; w++)
                 {
                     ::cv::gpu::device::ClNode node = nodes(0, nodeOfs);
-                    char c = feature();// TODO: inmplement it
+                    uchar4 feature = features(0, node.featureIdx);
+
+                    uchar c = evaluator(y, x, feature, integral);
                     const int subsetIdx = (nodeOfs * subsetSize);
                     int idx = subsetIdx + ((c >> 5) & ( 1 << (c & 31)) ? leafOfs : leafOfs + 1);
                     sum += leaves(0, subsets(0, idx) );
@@ -70,21 +73,27 @@ namespace cv { namespace gpu { namespace device
                 }
 
                 if (sum < stage.threshold)
-                    return; // nothing matched
-                return;//mathed
+                    return;
             }
-
+            int4 rect;
+            rect.x = roundf(x * scale);
+            rect.y = roundf(y * scale);
+            rect.z = roundf(clWidth * scale);
+            rect.w = roundf(clHeight * scale);
+            objects(blockIdx.x, threadIdx.x) = rect;
         }
 
-        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
+        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db bnodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db bfeatures,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
         {
             printf("CascadeClassify");
             int blocks = ceilf(workHeight / (float)step);
             int threads = ceilf(workWidth / (float)step);
             DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages);
+            DevMem2D_<uchar4> features = (DevMem2D_<uchar4>)bfeatures;
+            DevMem2D_< ::cv::gpu::device::ClNode> nodes = DevMem2D_< ::cv::gpu::device::ClNode>(bnodes);
 
-            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, integral, step, subsetSize, objects);
+            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, features, integral, step, subsetSize, objects, scale, clWidth, clHeight);
         }
     }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index ede48bf..debda53 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -82,9 +82,48 @@ namespace cv { namespace gpu { namespace device {
     {
         __device__ __forceinline__ Feature(const Feature& other) {(void)other;}
         __device__ __forceinline__ Feature() {}
-        __device__ __forceinline__ char operator() ()//(volatile int* ptr, int offset)
+
+        //feature as uchar x, y - left top, z,w - right bottom
+        __device__ __forceinline__ uchar operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
         {
-            return char(0);
+            int x_off = 2 * feature.z;
+            int y_off = 2 * feature.w;
+
+            // load feature key points
+            int anchors[16];
+            anchors[0]  = integral(y + feature.y, x + feature.x);
+            anchors[1]  = integral(y + feature.y, x + feature.z);
+            anchors[2]  = integral(y + feature.y, x + x_off + feature.x);
+            anchors[3]  = integral(y + feature.y, x + x_off + feature.z);
+
+            anchors[4]  = integral(y + feature.w, x + feature.x);
+            anchors[5]  = integral(y + feature.w, x + feature.z);
+            anchors[6]  = integral(y + feature.w, x + x_off + feature.x);
+            anchors[7]  = integral(y + feature.w, x + x_off + feature.z);
+
+            anchors[8]  = integral(y + y_off + feature.y, x + feature.x);
+            anchors[9]  = integral(y + y_off + feature.y, x + feature.z);
+            anchors[10] = integral(y + y_off + feature.y, x + x_off + feature.x);
+            anchors[11] = integral(y + y_off + feature.y, x + x_off + feature.z);
+
+            anchors[12] = integral(y + y_off + feature.w, x + feature.x);
+            anchors[13] = integral(y + y_off + feature.w, x + feature.z);
+            anchors[14] = integral(y + y_off + feature.w, x + x_off + feature.x);
+            anchors[15] = integral(y + y_off + feature.w, x + x_off + feature.z);
+
+            // calculate feature
+            int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];
+
+            uchar response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
+                            |(( (anchors[ 1] - anchors[ 2] - anchors[ 5] + anchors[ 6]) >= sum )? 64  : 0)
+                            |(( (anchors[ 2] - anchors[ 3] - anchors[ 6] + anchors[ 7]) >= sum )? 32  : 0)
+                            |(( (anchors[ 6] - anchors[ 7] - anchors[10] + anchors[11]) >= sum )? 16  : 0)
+                            |(( (anchors[10] - anchors[11] - anchors[14] + anchors[15]) >= sum )? 8   : 0)
+                            |(( (anchors[ 9] - anchors[10] - anchors[13] + anchors[14]) >= sum )? 4   : 0)
+                            |(( (anchors[ 8] - anchors[ 9] - anchors[12] + anchors[13]) >= sum )? 2   : 0)
+                            |(( (anchors[ 4] - anchors[ 5] - anchors[ 8] + anchors[ 9]) >= sum )? 1   : 0);
+
+            return response;
         }
 
     };
-- 
2.7.4