added OpenCL optimization for LBP-based face detector

author Vadim Pisarevsky <vadim.pisarevsky@gmail.com>

Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)

committer Vadim Pisarevsky <vadim.pisarevsky@gmail.com>

Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)
author Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)
committer Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp

index 93225f1..07f9bde 100644 (file)
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -743,6 +743,14 @@ bool LBPEvaluator::setWindow( Point pt )
      pwin = &sum.at<int>(pt);
      return true;
  }
+    
+
+void LBPEvaluator::getUMats(std::vector<UMat>& bufs)
+{
+    bufs.clear();
+    bufs.push_back(usum);
+    bufs.push_back(ufbuf);
+}
  
  //----------------------------------------------  HOGEvaluator ---------------------------------------
  bool HOGEvaluator::Feature :: read( const FileNode& node )
@@ -1162,50 +1170,84 @@ bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processin
  bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
                                                     int yStep, double factor, Size sumSize0 )
  {
-    const int VECTOR_SIZE = 1;
-    Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
-    if( haar.empty() )
-        return false;
-
-    haar->setImage(_image, data.origWinSize, sumSize0);
-
-    if( cascadeKernel.empty() )
-    {
-        cascadeKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc,
-                             format("-D VECTOR_SIZE=%d", VECTOR_SIZE));
-        if( cascadeKernel.empty() )
-            return false;
-    }
-
+    int featureType = getFeatureType();
+    std::vector<UMat> bufs;
+    size_t globalsize[] = { processingRectSize.width/yStep, processingRectSize.height/yStep };
+    bool ok = false;
+    
      if( ustages.empty() )
      {
          copyVectorToUMat(data.stages, ustages);
          copyVectorToUMat(data.stumps, ustumps);
+        if( !data.subsets.empty() )
+            copyVectorToUMat(data.subsets, usubsets);
      }
  
-    std::vector<UMat> bufs;
-    haar->getUMats(bufs);
-    CV_Assert(bufs.size() == 3);
-
-    Rect normrect = haar->getNormRect();
-
-    //processingRectSize = Size(yStep, yStep);
-    size_t globalsize[] = { (processingRectSize.width/yStep + VECTOR_SIZE-1)/VECTOR_SIZE, processingRectSize.height/yStep };
-
-    cascadeKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
-                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
-                       ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
-
-                       // cascade classifier
-                       (int)data.stages.size(),
-                       ocl::KernelArg::PtrReadOnly(ustages),
-                       ocl::KernelArg::PtrReadOnly(ustumps),
+    if( featureType == FeatureEvaluator::HAAR )
+    {
+        Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
+        if( haar.empty() )
+            return false;
  
-                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                       processingRectSize,
-                       yStep, (float)factor,
-                       normrect, data.origWinSize, MAX_FACES);
-    bool ok = cascadeKernel.run(2, globalsize, 0, true);
+        haar->setImage(_image, data.origWinSize, sumSize0);
+        if( haarKernel.empty() )
+        {
+            haarKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            if( haarKernel.empty() )
+                return false;
+        }
+        
+        haar->getUMats(bufs);
+        Rect normrect = haar->getNormRect();
+
+        haarKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
+                        ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
+                        ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
+
+                        // cascade classifier
+                        (int)data.stages.size(),
+                        ocl::KernelArg::PtrReadOnly(ustages),
+                        ocl::KernelArg::PtrReadOnly(ustumps),
+
+                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                        processingRectSize,
+                        yStep, (float)factor,
+                        normrect, data.origWinSize, MAX_FACES);
+        ok = haarKernel.run(2, globalsize, 0, true);
+    }
+    else if( featureType == FeatureEvaluator::LBP )
+    {
+        Ptr<LBPEvaluator> lbp = featureEvaluator.dynamicCast<LBPEvaluator>();
+        if( lbp.empty() )
+            return false;
+        
+        lbp->setImage(_image, data.origWinSize, sumSize0);
+        if( lbpKernel.empty() )
+        {
+            lbpKernel.create("runLBPClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            if( lbpKernel.empty() )
+                return false;
+        }
+        
+        lbp->getUMats(bufs);
+        
+        int subsetSize = (data.ncategories + 31)/32;
+        lbpKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
+                        ocl::KernelArg::PtrReadOnly(bufs[1]), // optfeatures
+                        
+                        // cascade classifier
+                        (int)data.stages.size(),
+                        ocl::KernelArg::PtrReadOnly(ustages),
+                        ocl::KernelArg::PtrReadOnly(ustumps),
+                        ocl::KernelArg::PtrReadOnly(usubsets),
+                        subsetSize,
+                        
+                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                        processingRectSize,
+                        yStep, (float)factor,
+                        data.origWinSize, MAX_FACES);
+        ok = lbpKernel.run(2, globalsize, 0, true);
+    }
      //CV_Assert(ok);
      return ok;
  }
@@ -1254,6 +1296,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
                                                      double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                                      bool outputRejectLevels )
  {
+    int featureType = getFeatureType();
      Size imgsz = _image.size();
      int imgtype = _image.type();
  
@@ -1267,7 +1310,8 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
          maxObjectSize = imgsz;
  
      bool use_ocl = ocl::useOpenCL() &&
-        getFeatureType() == FeatureEvaluator::HAAR &&
+        (featureType == FeatureEvaluator::HAAR ||
+         featureType == FeatureEvaluator::LBP) &&
          !isOldFormatCascade() &&
          data.isStumpBased() &&
          maskGenerator.empty() &&
@@ -1593,7 +1637,8 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
  bool CascadeClassifierImpl::read_(const FileNode& root)
  {
      tryOpenCL = true;
-    cascadeKernel = ocl::Kernel();
+    haarKernel = ocl::Kernel();
+    lbpKernel = ocl::Kernel();
      ustages.release();
      ustumps.release();
      if( !data.read(root) )
diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp

index a0b2b55..3731344 100644 (file)
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -149,7 +149,7 @@ protected:
      Ptr<MaskGenerator> maskGenerator;
      UMat ugrayImage, uimageBuffer;
      UMat ufacepos, ustages, ustumps, usubsets;
-    ocl::Kernel cascadeKernel;
+    ocl::Kernel haarKernel, lbpKernel;
      bool tryOpenCL;
  
      Mutex mtx;
@@ -392,6 +392,7 @@ public:
  
      virtual bool setImage(InputArray image, Size _origWinSize, Size);
      virtual bool setWindow(Point pt);
+    virtual void getUMats(std::vector<UMat>& bufs);
  
      int operator()(int featureIdx) const
      { return optfeaturesPtr[featureIdx].calc(pwin); }
diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl

index 7428e89..3e0187e 100644 (file)
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -105,7 +105,7 @@ __kernel void runHaarClassifierStump(
      }
  }
  
-#if 0
+
  __kernel void runLBPClassifierStump(
      __global const int* sum,
      int sumstep, int sumoffset,
@@ -119,45 +119,48 @@ __kernel void runLBPClassifierStump(
  
      volatile __global int* facepos,
      int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int2 windowsize, int maxFaces)
  {
      int ix = get_global_id(0)*xyscale;
      int iy = get_global_id(1)*xyscale;
      sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);
      
      if( ix < imgsize.x && iy < imgsize.y )
      {
          int stageIdx;
          __global const Stump* stump = stumps;
+        __global const int* p = sum + mad24(iy, sumstep, ix);
          
          for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
          {
              int i, ntrees = stages[stageIdx].ntrees;
              float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++ )
+            for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
              {
                  float4 st = stump->st;
                  __global const OptLBPFeature* f = optfeatures + as_int(st.x);
                  int16 ofs = f->ofs;
                  
+                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+                ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
                  
+                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
                  
-                int4 ofs = f->ofs[0];
-                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
-                ofs = f->ofs[1];
-                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
-                if( weight.z > 0 )
-                {
-                    ofs = f->ofs[2];
-                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
-                }
+                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
                  
-                s += (sval < st.y*nf) ? st.z : st.w;
+                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+                
+                s += (bitsets[idx] & (1 << mask)) ? st.z : st.w;
              }
              
              if( s < stages[stageIdx].threshold )
-            break;
+                break;
          }
          
          if( stageIdx == nstages )
@@ -174,4 +177,4 @@ __kernel void runLBPClassifierStump(
          }
      }
  }
-#endif
+
author	Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
	Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)
committer	Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
	Mon, 23 Dec 2013 11:28:50 +0000 (15:28 +0400)
modules/objdetect/src/cascadedetect.cpp		patch \| blob \| history
modules/objdetect/src/cascadedetect.hpp		patch \| blob \| history
modules/objdetect/src/opencl/cascadedetect.cl		patch \| blob \| history