Refactored GPU LBP cascade. Added support for big images. Fixed bug in connected...
author     Marina Kolpakova <no@email>
           Wed, 11 Jul 2012 12:22:22 +0000 (12:22 +0000)
committer  Marina Kolpakova <no@email>
           Wed, 11 Jul 2012 12:22:22 +0000 (12:22 +0000)
modules/gpu/src/cascadeclassifier.cpp
modules/gpu/src/cuda/lbp.cu
modules/gpu/src/opencv2/gpu/device/lbp.hpp

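The heart of the change in cascadeclassifier.cpp is a split of detectMultiScale's scale-pyramid loop into two phases: scales whose sliding-window rows need 256 or more threads go through a new fixed-block-size entry point, classifyStumpFixed, and only the remaining narrow scales use the original one-block-per-row classifyStump launch. A minimal host-side sketch of that control flow (detectAtScaleFixed, detectAtScale and computeProcessingRect are hypothetical stand-ins for the calls visible in the diff):

    // Sketch of the two-phase scale loop introduced below; not the literal code.
    int step = 2;
    double factor = 1.0;
    cv::Size processingRectSize(image.cols - NxM.width + 1, image.rows - NxM.height + 1);

    // Phase 1: rows too wide for a single thread block -> fixed 256-thread blocks.
    while (processingRectSize.width / step >= 256)
    {
        detectAtScaleFixed(factor, step, processingRectSize);  // classifyStumpFixed(...)
        factor *= scaleFactor;
        step = (factor <= 2.) + 1;   // 2 while factor <= 2, else 1, as in the diff
        processingRectSize = computeProcessingRect(factor);    // shrinks as factor grows
    }

    // Phase 2: width / step now fits in one block, so one block scans a whole row.
    while (processingRectSize.width > 0 && processingRectSize.height > 0)
    {
        detectAtScale(factor, step, processingRectSize);       // classifyStump(...)
        factor *= scaleFactor;
        processingRectSize = computeProcessingRect(factor);
    }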
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 5422dcf..09c1067 100644
@@ -315,7 +315,24 @@ namespace cv { namespace gpu { namespace device
                           DevMem2D_<int4> objects,
                           unsigned int* classified);
 
-        int connectedConmonents(DevMem2D_<int4>  candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void classifyStumpFixed(const DevMem2Db& mstages,
+                              const int nstages,
+                              const DevMem2Di& mnodes,
+                              const DevMem2Df& mleaves,
+                              const DevMem2Di& msubsets,
+                              const DevMem2Db& mfeatures,
+                              const int workWidth,
+                              const int workHeight,
+                              const int clWidth,
+                              const int clHeight,
+                              float scale,
+                              int step,
+                              int subsetSize,
+                              DevMem2D_<int4> objects,
+                              unsigned int* classified,
+                              const int maxX);
+
+        int connectedConmonents(DevMem2D_<int4>  candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
@@ -337,8 +354,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
     // GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
     // used for debug
-    // candidates.setTo(cv::Scalar::all(0));
-    // objects.setTo(cv::Scalar::all(0));
+    candidates.setTo(cv::Scalar::all(0));
+    objects.setTo(cv::Scalar::all(0));
     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();
 
@@ -349,16 +366,50 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     unsigned int* dclassified;
     cudaMalloc(&dclassified, sizeof(int));
     cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
-    int step;
+    int step = 2;
     cv::gpu::device::lbp::bindIntegral(integral);
 
-    for( double factor = 1; ; factor *= scaleFactor )
+    cv::Size scaledImageSize(image.cols, image.rows);
+    cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    cv::Size windowSize(NxM.width, NxM.height);
+
+    double factor = 1;
+
+    for (; processingRectSize.width / step >= 256;)
     {
+        // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
         // if (factor > 2.0) break;
-        cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
+            break;
+
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+            break;
+
+        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
+        //     continue;
+
+        GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
+        GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
+        GpuMat currBuff = integralBuffer;
+
+        cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
+        cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+
+        step = (factor <= 2.) + 1;
+
+        cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
+        processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);
 
+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    }
+
+    for (; /*processingRectSize.width / step >= 128*/;)
+    {
+        // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+        // if (factor > 2.0) break;
         if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
             break;
 
@@ -379,12 +430,19 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
 
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
         processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
+
+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
     }
 
     cv::gpu::device::lbp::unbindIntegral();
     if (groupThreshold <= 0  || objects.empty())
         return 0;
-    cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+    // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
+    cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
     cudaSafeCall( cudaDeviceSynchronize() );
     step = *classified;
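To make the 256-thread cutoff in the first loop concrete, a worked instance (the input size is illustrative, not from the commit):

    // Assume a 640x480 frame, a 24x24 classifier window and scaleFactor = 1.2.
    //   factor = 1.0: processingRectSize.width = 640 - 24 + 1 = 617, step = 2
    //                 617 / 2 = 308 >= 256  -> classifyStumpFixed handles this scale
    //   factor = 1.2: scaled width = cvRound(640 / 1.2) = 533, rect width = 510
    //                 510 / 2 = 255 < 256   -> control falls through to the second
    //                 loop, where one block of width/step threads per row is enough.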
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index ba2e294..eab41b5 100644
@@ -53,28 +53,27 @@ namespace cv { namespace gpu { namespace device
 
         struct LBP
         {
-            __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
-            __device__ __forceinline__ LBP() {}
+            __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+            __host__ __device__ __forceinline__ LBP() {}
 
-            //feature as uchar x, y - left top, z,w - right bottom
-            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
             {
                 int anchors[9];
 
                 anchors[0]  = tex2D(tintegral, tx, ty);
-                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[1]  = tex2D(tintegral, tx + fw, ty);
                 anchors[0] -= anchors[1];
-                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[2]  = tex2D(tintegral, tx + fw * 2, ty);
                 anchors[1] -= anchors[2];
-                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
 
                 ty += fh;
                 anchors[3]  = tex2D(tintegral, tx, ty);
-                anchors[4]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[4]  = tex2D(tintegral, tx + fw, ty);
                 anchors[3] -= anchors[4];
-                anchors[5]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[5]  = tex2D(tintegral, tx + fw * 2, ty);
                 anchors[4] -= anchors[5];
-                anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);
 
                 anchors[0] -= anchors[3];
                 anchors[1] -= anchors[4];
@@ -83,11 +82,11 @@ namespace cv { namespace gpu { namespace device
 
                 ty += fh;
                 anchors[6]  = tex2D(tintegral, tx, ty);
-                anchors[7]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[7]  = tex2D(tintegral, tx + fw, ty);
                 anchors[6] -= anchors[7];
-                anchors[8]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[8]  = tex2D(tintegral, tx + fw * 2, ty);
                 anchors[7] -= anchors[8];
-                anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);
 
                 anchors[3] -= anchors[6];
                 anchors[4] -= anchors[7];
@@ -109,11 +108,11 @@ namespace cv { namespace gpu { namespace device
 
                 ty += fh;
                 anchors[0]  = tex2D(tintegral, tx, ty);
-                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[1]  = tex2D(tintegral, tx + fw, ty);
                 anchors[0] -= anchors[1];
-                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[2]  = tex2D(tintegral, tx + fw * 2, ty);
                 anchors[1] -= anchors[2];
-                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
 
                 anchors[6] -= anchors[0];
                 anchors[7] -= anchors[1];
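The operator() rewrite above only renames featurez to fw (the LBP cell width, alongside fh for the cell height), but the anchor arithmetic is easier to follow with the underlying identity spelled out: each of the nine cell sums comes from four integral-image corners. A standalone sketch of that identity (not the kernel code, which reads the corners through the tintegral texture):

    // Sum of the w x h block at (x, y), given an integral image I where I(x, y)
    // holds the sum of all pixels above and to the left of (x, y).
    static inline int cellSum(const int* I, int stride, int x, int y, int w, int h)
    {
        return I[y * stride + x]                  // top-left corner
             + I[(y + h) * stride + (x + w)]      // bottom-right corner
             - I[y * stride + (x + w)]            // top-right corner
             - I[(y + h) * stride + x];           // bottom-left corner
    }

The kernel amortizes these lookups by sampling the 4x4 lattice of corners once and differencing rows and columns in place; the eight outer cell sums are then compared against the central one to form the 8-bit LBP code.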
@@ -142,54 +141,90 @@ namespace cv { namespace gpu { namespace device
              cudaSafeCall( cudaUnbindTexture(&tintegral));
         }
 
-        __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-           /* const int* integral,const int istep,  const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
-            const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
+        struct Classifier
         {
-            int x = threadIdx.x * step;
-            int y = blockIdx.x * step;
+            __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
+                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
+            : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
+              scale(_scale), step(_step), subsetSize(_subsetSize){}
 
-            int current_node = 0;
-            int current_leave = 0;
-
-            LBP evaluator;
-            for (int s = 0; s < nstages; s++ )
+            __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
             {
-                float sum = 0;
-                Stage stage = stages[s];
-                for (int t = 0; t < stage.ntrees; t++)
+                int current_node = 0;
+                int current_leave = 0;
+
+                for (int s = 0; s < nstages; ++s)
                 {
-                    ClNode node = nodes[current_node];
-
-                    uchar4 feature = features[node.featureIdx];
-                    int shift;
-                    int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
-                    int idx =  (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
-                    sum += leaves[idx];
-                    current_node += 1;
-                    current_leave += 2;
+                    float sum = 0;
+                    Stage stage = stages[s];
+                    for (int t = 0; t < stage.ntrees; t++)
+                    {
+                        ClNode node = nodes[current_node];
+                        uchar4 feature = features[node.featureIdx];
+
+                        int shift;
+                        int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+                        int idx =  (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+                        sum += leaves[idx];
+
+                        current_node += 1;
+                        current_leave += 2;
+                    }
+
+                    if (sum < stage.threshold)
+                        return;
                 }
-                if (sum < stage.threshold)
-                    return;
-            }
 
-            int4 rect;
-            rect.x = roundf(x * scale);
-            rect.y = roundf(y * scale);
-            rect.z = clWidth;
-            rect.w = clHeight;
+                int4 rect;
+                rect.x = roundf(x * scale);
+                rect.y = roundf(y * scale);
+                rect.z = clWidth;
+                rect.w = clHeight;
+
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-            int res = __atomicInc(n, 100U);
+                int res = __atomicInc(n, maxN);
 #else
-            int res = atomicInc(n, 100U);
+                int res = atomicInc(n, maxN);
 #endif
-            objects(0, res) = rect;
+                objects(0, res) = rect;
+            }
+
+            const Stage*  stages;
+            const ClNode* nodes;
+            const float* leaves;
+            const int* subsets;
+            const uchar4* features;
+
+            const int nstages;
+            const int clWidth;
+            const int clHeight;
+            const float scale;
+            const int step;
+            const int subsetSize;
+            const LBP evaluator;
+        };
+
+        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
+        {
+            int x = threadIdx.x * classifier.step;
+            int y = blockIdx.x * classifier.step;
+
+            classifier(y, x, objects, maxN, n);
+        }
+
+        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
+        {
+            int x = threadIdx.x * lines * classifier.step;
+            if (x >= maxX) return;
+
+            int y = blockIdx.x * classifier.step / lines;
+
+            classifier(y, x, objects, maxN, n);
         }
 
         template<typename Pr>
         __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
         {
-            using cv::gpu::device::VecTraits;
             unsigned int tid = threadIdx.x;
             extern __shared__ int sbuff[];
 
@@ -207,23 +242,26 @@ namespace cv { namespace gpu { namespace device
 
             int cls = labels[tid];
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-            __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-            __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-            __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-            __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
 #else
-            atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-            atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-            atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-            atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
 #endif
+            __syncthreads();
             labels[tid] = 0;
+
             __syncthreads();
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
             __atomicInc((unsigned int*)labels + cls, n);
 #else
             atomicInc((unsigned int*)labels + cls, n);
 #endif
+            __syncthreads();
             *nclasses = 0;
 
             int active = labels[tid];
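The extra __syncthreads() calls added above fence off the phases of disjoin's shared-memory bookkeeping: all per-class rectangle accumulation must finish before the labels array is recycled as a per-class counter, and all counting must finish before any thread reads its class population into active. A reduced model of the same pattern (illustrative, not the disjoin kernel):

    // One block; labels[] is reused as a counter array, so the phases need fences.
    __global__ void countPerClass(const int* cls, int* counts, unsigned int n)
    {
        unsigned int tid = threadIdx.x;
        extern __shared__ int labels[];

        labels[tid] = 0;
        __syncthreads();                             // all slots zeroed before counting
        atomicInc((unsigned int*)labels + cls[tid], n);
        __syncthreads();                             // all increments done before reads
        counts[tid] = labels[tid];                   // population of class index 'tid'
    }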
@@ -235,61 +273,54 @@ namespace cv { namespace gpu { namespace device
                 r1[1] = saturate_cast<int>(r1[1] * s);
                 r1[2] = saturate_cast<int>(r1[2] * s);
                 r1[3] = saturate_cast<int>(r1[3] * s);
+            }
+            __syncthreads();
 
-                int n1 = active;
-                __syncthreads();
-                unsigned int j = 0;
-                if( active > groupThreshold )
-                {
-                    for (j = 0; j < n; j++)
-                    {
-                        int n2 = labels[j];
-                        if(!n2 || j == tid || n2 <= groupThreshold )
-                        continue;
-
-                        int* r2 = rrects + j * 4;
-
-                        int dx = saturate_cast<int>( r2[2] * grouping_eps );
-                        int dy = saturate_cast<int>( r2[3] * grouping_eps );
-
-                        if( tid != j && r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
-                            r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
-                            (n2 > max(3, n1) || n1 < 3) )
-                            break;
-                    }
-                    if( j == n)
-                    {
+            if (active && active >= groupThreshold)
+            {
+                int* r1 = rrects + tid * 4;
+                int4 r_out;
+                r_out.x = r1[0];
+                r_out.y = r1[1];
+                r_out.z = r1[2];
+                r_out.w = r1[3];
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                        objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+                objects[__atomicInc(nclasses, n)] = r_out;
 #else
-                        objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+                int aidx = atomicInc(nclasses, n);
+                objects[aidx] = r_out;
 #endif
-                    }
-                }
             }
         }
 
         void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
-                           /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
-                           DevMem2D_<int4> objects, unsigned int* classified)
+                           const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
+        {
+            int blocks  = ceilf(workHeight / (float)step);
+            int threads = ceilf(workWidth / (float)step);
+
+            Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+            lbp_classify_stump<<<blocks, threads>>>(clr,  objects, objects.cols, classified);
+        }
+
+        void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+                           const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
+                           int maxX)
         {
+            const int THREADS_BLOCK = 256;
             int blocks  = ceilf(workHeight / (float)step);
             int threads = ceilf(workWidth / (float)step);
 
-            Stage* stages = (Stage*)(mstages.ptr());
-            ClNode* nodes = (ClNode*)(mnodes.ptr());
-            const float* leaves = mleaves.ptr();
-            const int* subsets = msubsets.ptr();
-            const uchar4* features = (uchar4*)(mfeatures.ptr());
-            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
-                workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
+            Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+            int lines = divUp(threads, THREADS_BLOCK);
+            lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
         }
 
-        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
         {
-            int threads = candidates.cols;
+            int threads = ncandidates;
             int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
             return 0;
         }
     }
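The "big images" part of the commit lives in the launch configuration of classifyStumpFixed just above: the original classifyStump spawns one thread per window column, which stops working once workWidth / step exceeds the per-block thread limit (512 on the oldest targets this module supports), so the fixed variant spreads each row over lines = divUp(threads, THREADS_BLOCK) blocks of 256 threads. A worked instance (numbers illustrative):

    // classifyStumpFixed launch arithmetic for a 1921-column processing rect, step = 1:
    //   threads = ceilf(1921 / 1.f) = 1921        // too many for one block
    //   lines   = divUp(1921, 256)  = 8           // blocks needed to cover one row
    //   grid    = blocks * lines blocks x 256 threads = 2048 thread slots per row
    // The trailing maxX argument lets the 2048 - 1921 surplus threads, whose x falls
    // past the row width, return early in lbp_classify_stump.

Passing ncandidates into connectedConmonents serves the same economy: disjoin is launched with one thread per detection actually produced, and its shared-memory budget (one int plus one int4 per thread) is sized from that count rather than from the full width of the candidate buffer.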
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index 69867c9..b3cf6dc 100644
@@ -61,6 +61,7 @@ __device__ __forceinline__ T __atomicInc(T* address, T val)
         count = tag | (count + 1);
         *address = count;
     } while (*address != count);
+
     return (count & TAG_MASK) - 1;
 }
 
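For reference, the contract __atomicInc mirrors is CUDA's hardware atomicInc: shared-memory atomics require sm_12 (global-memory atomics sm_11), so the __CUDA_ARCH__ < 120 guards in lbp.cu fall back to this software emulation on older targets. Its semantics, written as a host-side model (reference behaviour only, not the tag-based device code above):

    // atomicInc(address, val): store (old >= val ? 0 : old + 1), return the old value.
    static unsigned int atomicIncModel(unsigned int* address, unsigned int val)
    {
        unsigned int old = *address;
        *address = (old >= val) ? 0 : old + 1;
        return old;
    }

Used as in lbp_classify_stump, idx = atomicInc(n, maxN) hands out output slots 0, 1, 2, ... and wraps back to 0 once the count reaches maxN instead of indexing far past the objects buffer.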
@@ -85,6 +86,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
     {
         *address = count;
     } while (*address > count);
+
     return count;
 }
 
@@ -151,7 +153,6 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
             }
         }
         __syncthreads();
-        // printf("tid %d label %d\n", tid, labels[tid]);
     }
 } // lbp