From 2aacff4c39538a421338080158f00871f573aafa Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Thu, 12 Jul 2012 07:11:26 +0000
Subject: [PATCH] switched to the fixed size thread block

---
 modules/gpu/src/cascadeclassifier.cpp | 40 ++---------------------------------
 modules/gpu/src/cuda/lbp.cu           | 21 ++++++++----------
 2 files changed, 11 insertions(+), 50 deletions(-)
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index bf13a98..f8e585b 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -97,7 +97,6 @@ void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size frame)
 
         Ncv32u bufSize;
         ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        // printf("HERE!!!!!!!%d\n", bufSize);
         integralBuffer.create(1, bufSize, CV_8UC1);
     }
 }
@@ -329,8 +328,7 @@ namespace cv { namespace gpu { namespace device
                               int step,
                               int subsetSize,
                               DevMem2D_<int4> objects,
-                              unsigned int* classified,
-                              const int maxX);
+                              unsigned int* classified);
 
         int connectedConmonents(DevMem2D_<int4>  candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
         void bindIntegral(DevMem2Di integral);
@@ -375,10 +373,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
 
     double factor = 1;
 
-    for (; processingRectSize.width / step >= 256;)
+    for (; ;)
     {
-        // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
-        // if (factor > 2.0) break;
         if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
             break;
 
@@ -398,37 +394,6 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         step = (factor <= 2.) + 1;
 
         cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-        processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);
-
-        factor *= scaleFactor;
-        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
-    }
-
-    for (; /*processingRectSize.width / step >= 128*/;)
-    {
-        // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
-        // if (factor > 2.0) break;
-        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
-
-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
-            break;
-
-        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
-        //     continue;
-
-        GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-        GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
-        GpuMat currBuff = integralBuffer;
-
-        cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
-        cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
-
-        step = (factor <= 2.) + 1;
-
-        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
         processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
 
         factor *= scaleFactor;
@@ -441,7 +406,6 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     if (groupThreshold <= 0  || objects.empty())
         return 0;
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
-    // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
     cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
     cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index eab41b5..b8a6df8 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -212,14 +212,13 @@ namespace cv { namespace gpu { namespace device
             classifier(y, x, objects, maxN, n);
         }
 
-        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
+        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int maxX)
         {
-            int x = threadIdx.x * lines * classifier.step;
-            if (x >= maxX) return;
+            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = ftid / maxX;
+            int x = ftid - y * maxX;
 
-            int y = blockIdx.x * classifier.step / lines;
-
-            classifier(y, x, objects, maxN, n);
+            classifier(y * classifier.step, x * classifier.step, objects, maxN, n);
         }
 
         template<typename Pr>
@@ -304,16 +303,14 @@ namespace cv { namespace gpu { namespace device
         }
 
         void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
-                           const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
-                           int maxX)
+                           const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
         {
             const int THREADS_BLOCK = 256;
-            int blocks  = ceilf(workHeight / (float)step);
-            int threads = ceilf(workWidth / (float)step);
+            int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+            int blocks  = divUp(work_amount, THREADS_BLOCK);
 
             Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
-            int lines = divUp(threads, THREADS_BLOCK);
-            lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
+            lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
         }
 
         int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
-- 
2.7.4