added hypothesis filtration
authorMarina Kolpakova <no@email>
Wed, 4 Jul 2012 04:51:09 +0000 (04:51 +0000)
committerMarina Kolpakova <no@email>
Wed, 4 Jul 2012 04:51:09 +0000 (04:51 +0000)
modules/gpu/src/cascadeclassifier.cpp
modules/gpu/src/cuda/lbp.cu
modules/gpu/src/opencv2/gpu/device/lbp.hpp

index 2057f1a..8932667 100644 (file)
@@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace device
 {\r
     namespace lbp\r
     {\r
-        classifyStump(const DevMem2Db mstages,\r
+        void classifyStump(const DevMem2Db mstages,\r
                       const int nstages,\r
                       const DevMem2Di mnodes,\r
                       const DevMem2Df mleaves,\r
@@ -289,16 +289,19 @@ namespace cv { namespace gpu { namespace device
                       int subsetSize,\r
                       DevMem2D_<int4> objects,\r
                       unsigned int* classified);\r
+\r
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses);\r
     }\r
 }}}\r
 \r
 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,\r
-                                                        double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/)\r
+                                                        double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)\r
 {\r
     CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );\r
     CV_Assert(!empty());\r
 \r
     const int defaultObjSearchNum = 100;\r
+    const float grouping_eps = 0.2;\r
 \r
     if( !objects.empty() && objects.depth() == CV_32S)\r
         objects.reshape(4, 1);\r
@@ -340,11 +343,14 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,\r
         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, dclassified);\r
     }\r
-        cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);\r
-        std::cout  << *classified << "Results:    " << cv::Mat(objects).row(0).colRange(0, *classified) << std::endl;\r
-    // TODO: reject levels\r
 \r
-    return 0;\r
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);\r
+    GpuMat candidates(1, *classified, objects.type(), objects.ptr());\r
+    // std::cout  << *classified << " Results: " << cv::Mat(candidates) << std::endl;\r
+\r
+    if (groupThreshold <= 0  || objects.empty())\r
+        return 0;\r
+    return cv::gpu::device::lbp::connectedConmonents(candidates, groupThreshold, grouping_eps, dclassified);\r
 }\r
 \r
 // ============ old fashioned haar cascade ==============================================//\r
index b07ecad..5c273b3 100644 (file)
@@ -41,6 +41,8 @@
 //M*/
 
 #include <opencv2/gpu/device/lbp.hpp>
+#include <opencv2/gpu/device/vec_traits.hpp>
+#include <opencv2/gpu/device/saturate_cast.hpp>
 
 namespace cv { namespace gpu { namespace device
 {
@@ -89,13 +91,83 @@ namespace cv { namespace gpu { namespace device
             objects(0, res) = rect;
         }
 
-        classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
+        // Groups the n candidate rectangles into classes (partition() with predicate Pr), replaces
+        // every class by the average of its members and keeps only the classes that
+        //   * have more than groupThreshold members, and
+        //   * are not contained (within a grouping_eps margin) in another, better supported class.
+        // The surviving rectangles are written to the front of candidates and their number is stored
+        // in *nclasses. The int4 components are treated as (x, y, width, height), as implied by the
+        // containment arithmetic below.
+        //
+        // Launch requirements: one block with blockDim.x == n and n * (sizeof(int) + sizeof(int4))
+        // bytes of dynamic shared memory (n labels followed by n rectangle accumulators).
+        template<typename Pr>
+        __global__ void disjoin(int4* candidates, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            using cv::gpu::device::VecTraits;
+            unsigned int tid = threadIdx.x;
+            extern __shared__ int sbuff[];
+            // output counter; kept apart from labels[] so that it never aliases a class counter
+            __shared__ unsigned int nfound;
+
+            int* labels = sbuff;
+            int* rrects = (int*)(sbuff + n);
+
+            Pr predicate(grouping_eps);
+            partition(candidates, n, labels, predicate);
+
+            rrects[tid * 4 + 0] = 0;
+            rrects[tid * 4 + 1] = 0;
+            rrects[tid * 4 + 2] = 0;
+            rrects[tid * 4 + 3] = 0;
+            if (tid == 0)
+                nfound = 0;
+            __syncthreads();
+
+            // sum up the members of every class in the accumulator slot addressed by the class label
+            int cls = labels[tid];
+            int4 own = candidates[tid];
+            atomicAdd(rrects + cls * 4 + 0, own.x);
+            atomicAdd(rrects + cls * 4 + 1, own.y);
+            atomicAdd(rrects + cls * 4 + 2, own.z);
+            atomicAdd(rrects + cls * 4 + 3, own.w);
+            // labels[tid] was read by this thread only, so it can be cleared before the barrier
+            labels[tid] = 0;
+            __syncthreads();
+
+            // labels[] is reused as the per-class member counter
+            atomicInc((unsigned int*)labels + cls, n);
+            // all increments have to land before any counter is read
+            __syncthreads();
+
+            int active = labels[tid];
+            int* r1 = rrects + tid * 4;
+            if (active)
+            {
+                // turn the accumulated sums into the average rectangle of the class
+                float s = 1.f / active;
+                r1[0] = saturate_cast<int>(r1[0] * s);
+                r1[1] = saturate_cast<int>(r1[1] * s);
+                r1[2] = saturate_cast<int>(r1[2] * s);
+                r1[3] = saturate_cast<int>(r1[3] * s);
+            }
+            // reached by every thread: the averaged rectangles of the other classes are read below
+            __syncthreads();
+
+            if (active && active > groupThreshold)
+            {
+                int n1 = active;
+                unsigned int j = 0;
+                for (; j < n; j++)
+                {
+                    int n2 = labels[j];
+                    if (!n2 || j == tid || n2 <= groupThreshold)
+                        continue;
+
+                    int* r2 = rrects + j * 4;
+
+                    int dx = saturate_cast<int>( r2[2] * grouping_eps );
+                    int dy = saturate_cast<int>( r2[3] * grouping_eps );
+
+                    // r1 lies inside r2 (enlarged by the margin) and r2 has the stronger support: drop r1
+                    if( r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
+                        r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
+                        (n2 > max(3, n1) || n1 < 3) )
+                        break;
+                }
+
+                // the loop ran to completion, i.e. no dominating class was found
+                if (j == n)
+                    candidates[atomicInc(&nfound, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+            }
+            __syncthreads();
+
+            // publish the number of rectangles written to candidates
+            if (tid == 0)
+                *nclasses = nfound;
+        }
+
+        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
                            const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                            DevMem2D_<int4> objects, unsigned int* classified)
         {
             int blocks  = ceilf(workHeight / (float)step);
             int threads = ceilf(workWidth / (float)step);
-            // printf("blocks %d, threads %d\n", blocks, threads);
 
             Stage* stages = (Stage*)(mstages.ptr());
             ClNode* nodes = (ClNode*)(mnodes.ptr());
@@ -106,5 +178,13 @@ namespace cv { namespace gpu { namespace device
             lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integral,
                 workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
         }
+
+        // Host-side launcher of the disjoin kernel: one block with one thread per candidate rectangle
+        // and dynamic shared memory for one int label plus one int4 accumulator per candidate.
+        // groupThreshold, grouping_eps and nclasses are forwarded to the kernel unchanged.
+        // Always returns 0.
+        // NOTE(review): candidates.cols must also fit into a single thread block and its shared
+        // memory; neither limit is checked here.
+        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        {
+            int threads = candidates.cols;
+            // nothing to group; a launch with an empty block is an invalid configuration
+            if (threads <= 0)
+                return 0;
+
+            size_t smem_amount = threads * (sizeof(int) + sizeof(int4));
+            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+            return 0;
+        }
     }
 }}}
\ No newline at end of file
index 3296ee3..2b620b5 100644 (file)
@@ -62,6 +62,50 @@ namespace lbp{
         int   featureIdx;
     };
 
+    // Binary predicate telling whether two rectangles are close enough to be put into the same group.
+    // The int4 components are used as (x, y, width, height), as implied by the corner arithmetic in
+    // operator(). The tolerance is half of the eps passed to the constructor, times the sum of the
+    // smaller width and the smaller height of the two rectangles.
+    struct InSameComponint
+    {
+    public:
+        // single precision literal: keeps the device code free of double arithmetic
+        __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps * 0.5f) {}
+        __device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
+
+        // Returns true when both the top-left and the bottom-right corners of r1 and r2 differ by no
+        // more than the tolerance along each axis.
+        __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
+        {
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w));
+
+            return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
+                && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
+        }
+        // half of the relative tolerance given to the constructor
+        float eps;
+    };
+
+    // Labels the n elements of vec so that two elements get the same label exactly when they are
+    // connected through a chain of pairs accepted by predicate (in either argument order).
+    // On return labels[i] is the smallest index of the group that element i belongs to.
+    //
+    // Must be called by all threads of a block with blockDim.x == n; labels must point to n ints
+    // that every thread of the block can access. The function contains block-wide barriers and
+    // finishes with one, so labels can be read right after the call.
+    template<typename Pr>
+    __device__ __forceinline__ void partition(int4* vec, unsigned int n, int* labels, Pr predicate)
+    {
+        // set when some label was lowered during the current sweep
+        __shared__ int changed;
+
+        unsigned tid = threadIdx.x;
+        int4 mine = vec[tid];
+        labels[tid] = tid;
+
+        // Min-label propagation: a single sweep does not carry a label along a chain of
+        // neighbours, so sweeps are repeated until no label changes any more.
+        do
+        {
+            // labels of the previous sweep are visible and everybody has tested 'changed'
+            __syncthreads();
+            if (tid == 0)
+                changed = 0;
+
+            int best = labels[tid];
+            for (unsigned int id = 0; id < n; id++)
+            {
+                if (tid != id && (predicate(mine, vec[id]) || predicate(vec[id], mine)))
+                    best = min(best, labels[id]);
+            }
+            // all reads of this sweep are done before any label is overwritten
+            __syncthreads();
+
+            if (best < labels[tid])
+            {
+                labels[tid] = best;
+                changed = 1;
+            }
+            __syncthreads();
+        }
+        while (changed); // same value for every thread, so the barriers above stay uniform
+    }
+
     struct LBP
     {
         __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
@@ -72,7 +116,6 @@ namespace lbp{
         {
             int x_off = 2 * feature.z;
             int y_off = 2 * feature.w;
-            // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w);
             feature.z += feature.x;
             feature.w += feature.y;
 
@@ -107,7 +150,7 @@ namespace lbp{
             anchors[14] = integral(y + y_off + feature.w, x + x_off + feature.x);
             anchors[15] = integral(y + y_off + feature.w, x + x_off + feature.z);
 
-            // calculate feature
+            // calculate response
             int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];
 
             int response =   (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)