added recommended number of stripes to parallel_for_, modified some of the functions...

author Vadim Pisarevsky <vadim.pisarevsky@itseez.com>

Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)

committer Vadim Pisarevsky <vadim.pisarevsky@itseez.com>

Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)
author Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)
committer Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)
diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp

index 2c1ff04..5564983 100644 (file)
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -4614,11 +4614,11 @@ protected:
  class CV_EXPORTS ParallelLoopBody
  {
  public:
-    virtual void operator() (const Range& range) const = 0;
      virtual ~ParallelLoopBody();
+    virtual void operator() (const Range& range) const = 0;
  };
  
-CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);
+CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);
  
  /////////////////////////// Synchronization Primitives ///////////////////////////////
  
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp

index 7f6cda9..85ac58b 100644 (file)
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -80,87 +80,114 @@
  
  namespace cv
  {
-    ParallelLoopBody::~ParallelLoopBody() { }
+    class ParallelLoopBodyWrapper // adapts a ParallelLoopBody: takes ranges of stripe indices, forwards sub-ranges of the whole range
+    {
+    public:
+        ParallelLoopBodyWrapper(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        {
+            body = &_body; // stored by pointer: the wrapped body must outlive this wrapper
+            wholeRange = _r;
+            double len = wholeRange.end - wholeRange.start;
+            nstripes = MAX(cvRound(_nstripes < 0 ? len : MIN(MAX(_nstripes, 1.), len)), 1); // negative _nstripes => one stripe per element; floor of 1 because operator() divides by nstripes (empty range would give 0)
+        }
+        void operator()(const Range& sr) const // sr: half-open range of stripe indices, expected within [0, nstripes]
+        {
+            Range r;
+            r.start = (int)(wholeRange.start +
+                            ((size_t)sr.start*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes); // proportional boundary, rounded to nearest
+            r.end = sr.end >= nstripes ? wholeRange.end : (int)(wholeRange.start +
+                            ((size_t)sr.end*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes); // last stripe is pinned to the exact end of the range
+            (*body)(r);
+        }
+        Range stripeRange() const { return Range(0, nstripes); } // index range a backend should iterate over
  
-#ifdef HAVE_TBB
-    class TbbProxyLoopBody
+    protected:
+        const ParallelLoopBody* body;
+        Range wholeRange;
+        int nstripes;
+    };
+    
+    ParallelLoopBody::~ParallelLoopBody() {}
+
+#if defined HAVE_TBB
+    class ProxyLoopBody : public ParallelLoopBodyWrapper // TBB adapter: converts tbb::blocked_range<int> into Range
      {
      public:
-        TbbProxyLoopBody(const ParallelLoopBody& _body) :
-            body(&_body)
-        { }
+        ProxyLoopBody(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        : ParallelLoopBodyWrapper(_body, _r, _nstripes)
+        {}
  
          void operator ()(const tbb::blocked_range<int>& range) const
          {
-            body->operator()(Range(range.begin(), range.end()));
+            ParallelLoopBodyWrapper::operator()(Range(range.begin(), range.end())); // qualified call: this class's operator() hides the base Range overload
          }
-
-    private:
-        const ParallelLoopBody* body;
      };
-#endif // end HAVE_TBB
+#elif defined HAVE_GCD
  
-#ifdef HAVE_GCD
+    typedef ParallelLoopBodyWrapper ProxyLoopBody;
      static
      void block_function(void* context, size_t index)
      {
-        ParallelLoopBody* ptr_body = static_cast<ParallelLoopBody*>(context);
-        ptr_body->operator()(Range(index, index + 1));
+        ProxyLoopBody* ptr_body = static_cast<ProxyLoopBody*>(context);
+        (*ptr_body)(Range(index, index + 1));
      }
-#endif // HAVE_GCD
+#elif defined HAVE_CONCURRENCY    
+    class ProxyLoopBody : public ParallelLoopBodyWrapper // Concurrency adapter: called with a single stripe index
+    {
+    public:
+        ProxyLoopBody(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        : ParallelLoopBodyWrapper(_body, _r, _nstripes)
+        {}
+        
+        void operator ()(int i) const
+        {
+            ParallelLoopBodyWrapper::operator()(Range(i, i + 1)); // qualified call: operator()(int) hides the base Range overload; stripe i becomes [i, i+1)
+        }
+    }; // the terminating ';' is required after a class definition
+#else
+    typedef ParallelLoopBodyWrapper ProxyLoopBody;
+#endif
  
-    void parallel_for_(const Range& range, const ParallelLoopBody& body)
+    void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes)
      {
-#ifdef HAVE_TBB
+        ProxyLoopBody pbody(body, range, nstripes);
+        Range stripeRange = pbody.stripeRange();
+        
+#if defined HAVE_TBB
  
-        tbb::parallel_for(tbb::blocked_range<int>(range.start, range.end), TbbProxyLoopBody(body));
+        tbb::parallel_for(tbb::blocked_range<int>(stripeRange.start, stripeRange.end), pbody);
  
  #elif defined HAVE_CONCURRENCY
  
-        class ConcurrencyProxyLoopBody
-        {
-        public:
-            ConcurrencyProxyLoopBody(const ParallelLoopBody& body) : _body(body) {}
-
-            void operator ()(int i) const
-            {
-                _body(Range(i, i + 1));
-            }
-
-        private:
-            const ParallelLoopBody& _body;
-            ConcurrencyProxyLoopBody& operator=(const ConcurrencyProxyLoopBody&) {return *this;}
-        } proxy(body);
-
-        Concurrency::parallel_for(range.start, range.end, proxy);
+        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
  
  #elif defined HAVE_OPENMP
  
  #pragma omp parallel for schedule(dynamic)
-        for (int i = range.start; i < range.end; ++i)
-            body(Range(i, i + 1));
+        for (int i = stripeRange.start; i < stripeRange.end; ++i)
+            pbody(Range(i, i + 1));
  
  #elif defined HAVE_GCD
  
          dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-        dispatch_apply_f(range.end - range.start, concurrent_queue, &const_cast<ParallelLoopBody&>(body), block_function);
+        dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
  
  #elif defined HAVE_CSTRIPES
  
          parallel()
          {
-            int offset = range.start;
-            int len = range.end - offset;
+            int offset = stripeRange.start;
+            int len = stripeRange.end - offset;
              Range r(offset + CPX_RANGE_START(len), offset + CPX_RANGE_END(len));
-            body(r);
+            pbody(r);
              barrier();
          }
  
  #else
  
-        body(range);
+        pbody(stripeRange);
  
-#endif // end HAVE_TBB
+#endif
      }
  
  } // namespace cv
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp

index fd068d0..69be461 100644 (file)
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -187,7 +187,7 @@ private:
  template <typename Cvt>
  void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
  {
-    parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt));
+    parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
  }
  
  ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp

index e3374f0..6fb728a 100644 (file)
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -357,7 +357,7 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
      
      Range range(0, dsize.height);
      resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
  }
  
  
@@ -1222,7 +1222,7 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
      Range range(0, dsize.height);
      resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
          ssize, dsize, ksize, xmin, xmax);
-    parallel_for_(range, invoker);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
  }
  
  template <typename T, typename WT>
@@ -1381,7 +1381,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
      Range range(0, dst.rows);
      resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x, 
          scale_y, ofs, xofs);
-    parallel_for_(range, invoker);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
  }
  
  struct DecimateAlpha
@@ -2680,14 +2680,14 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
                            const Mat& _fxy, const void* _wtab,
                            int borderType, const Scalar& _borderValue);
  
-class remapInvoker :
+class RemapInvoker :
      public ParallelLoopBody
  {
  public:
-    remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1, 
+    RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1, 
                   const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
                   int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
-        ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
+        ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
          interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue), 
          planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
      {    
@@ -2697,9 +2697,9 @@ public:
      {
          int x, y, x1, y1;
          const int buf_size = 1 << 14;
-        int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
-        int bcols0 = std::min(buf_size/brows0, dst.cols);
-        brows0 = std::min(buf_size/bcols0, dst.rows);
+        int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
+        int bcols0 = std::min(buf_size/brows0, dst->cols);
+        brows0 = std::min(buf_size/bcols0, dst->rows);
      #if CV_SSE2
          bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
      #endif
@@ -2710,17 +2710,17 @@ public:
  
          for( y = range.start; y < range.end; y += brows0 )
          {
-            for( x = 0; x < dst.cols; x += bcols0 )
+            for( x = 0; x < dst->cols; x += bcols0 )
              {
                  int brows = std::min(brows0, range.end - y);
-                int bcols = std::min(bcols0, dst.cols - x);
-                Mat dpart(dst, Rect(x, y, bcols, brows));
+                int bcols = std::min(bcols0, dst->cols - x);
+                Mat dpart(*dst, Rect(x, y, bcols, brows));
                  Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
  
                  if( nnfunc )
                  {
-                    if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
-                        bufxy = map1(Rect(x, y, bcols, brows));
+                    if( m1->type() == CV_16SC2 && !m2->data ) // the data is already in the right format
+                        bufxy = (*m1)(Rect(x, y, bcols, brows));
                      else if( map_depth != CV_32F )
                      {
                          for( y1 = 0; y1 < brows; y1++ )
@@ -2738,14 +2738,14 @@ public:
                          }
                      }
                      else if( !planar_input )
-                        map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
+                        (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
                      else
                      {
                          for( y1 = 0; y1 < brows; y1++ )
                          {
                              short* XY = (short*)(bufxy.data + bufxy.step*y1);
-                            const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
-                            const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
+                            const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
+                            const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
                              x1 = 0;
  
                          #if CV_SSE2
@@ -2778,7 +2778,7 @@ public:
                              }
                          }
                      }
-                    nnfunc( src, dpart, bufxy, borderType, borderValue );
+                    nnfunc( *src, dpart, bufxy, borderType, borderValue );
                      continue;
                  }
  
@@ -2788,16 +2788,15 @@ public:
                      short* XY = (short*)(bufxy.data + bufxy.step*y1);
                      ushort* A = (ushort*)(bufa.data + bufa.step*y1);
  
-                    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
-                        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
+                    if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
                      {
-                        bufxy = m1->operator()(Rect(x, y, bcols, brows));
-                        bufa = m2->operator()(Rect(x, y, bcols, brows));
+                        bufxy = (*m1)(Rect(x, y, bcols, brows));
+                        bufa = (*m2)(Rect(x, y, bcols, brows));
                      }
                      else if( planar_input )
                      {
-                        const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
-                        const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
+                        const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
+                        const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
  
                          x1 = 0;
                      #if CV_SSE2
@@ -2850,7 +2849,7 @@ public:
                      }
                      else
                      {
-                        const float* sXY = (const float*)(map1.data + map1.step*(y+y1)) + x*2;
+                        const float* sXY = (const float*)(m1->data + m1->step*(y+y1)) + x*2;
  
                          for( x1 = 0; x1 < bcols; x1++ )
                          {
@@ -2863,15 +2862,14 @@ public:
                          }
                      }
                  }
-                ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
+                ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
              }
          }
      }
      
  private:
-    Mat src;
-    Mat dst;
-    Mat map1, map2;
+    const Mat* src;
+    Mat* dst;
      const Mat *m1, *m2;
      int interpolation, borderType;
      Scalar borderValue;
@@ -2961,8 +2959,8 @@ void cv::remap( InputArray _src, OutputArray _dst,
  
      const Mat *m1 = &map1, *m2 = &map2;
  
-    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
-        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
+    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || !map2.data)) ||
+        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || !map1.data)) )
      {
          if( map1.type() != CV_16SC2 )
              std::swap(m1, m2);
@@ -2974,11 +2972,10 @@ void cv::remap( InputArray _src, OutputArray _dst,
          planar_input = map1.channels() == 1;
      }
  
-    Range range(0, dst.rows);
-    remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation, 
+    RemapInvoker invoker(src, dst, m1, m2, interpolation, 
                           borderType, borderValue, planar_input, nnfunc, ifunc,
                           ctab);
-    parallel_for_(range, invoker);
+    parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
  }
  
  
@@ -3300,7 +3297,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
      Range range(0, dst.rows);
      warpAffineInvoker invoker(src, dst, interpolation, borderType,
                                borderValue, adelta, bdelta, M);
-    parallel_for_(range, invoker);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
  }
  
  
@@ -3430,7 +3427,7 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
  
      Range range(0, dst.rows);
      warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
-    parallel_for_(range, invoker);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
  }
  
  
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp

index 58264e2..92d40f5 100644 (file)
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -1919,7 +1919,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
      }
      
      BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
-    parallel_for_(Range(0, size.height), body);
+    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
  }
  
  
@@ -2189,7 +2189,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
      // parallel_for usage
  
      BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
-    parallel_for_(Range(0, size.height), body);
+    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
  }
  
  }
diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp

index c4b25aa..e2ef8fe 100644 (file)
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -664,13 +664,11 @@ getThreshVal_Otsu_8u( const Mat& _src )
  class ThresholdRunner : public ParallelLoopBody
  {
  public:
-    ThresholdRunner(Mat _src, Mat _dst, int _nStripes, double _thresh, double _maxval, int _thresholdType)
+    ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType)
      {
          src = _src;
          dst = _dst;
  
-        nStripes = _nStripes;
-
          thresh = _thresh;
          maxval = _maxval;
          thresholdType = _thresholdType;
@@ -678,13 +676,8 @@ public:
  
      void operator () ( const Range& range ) const
      {
-        int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows);
-        int row1 = range.end >= nStripes ? src.rows :
-            std::min(cvRound(range.end * src.rows / nStripes), src.rows);
-
-        /*if(0)
-            printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
-                   src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
+        int row0 = range.start;
+        int row1 = range.end;
  
          Mat srcStripe = src.rowRange(row0, row1);
          Mat dstStripe = dst.rowRange(row0, row1);
@@ -789,10 +782,9 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
      else
          CV_Error( CV_StsUnsupportedFormat, "" );
      
-    size_t nStripes = (src.total() + (1<<15)) >> 16;
-    nStripes = MAX(MIN(nStripes, (size_t)4), (size_t)1);
-    parallel_for_(Range(0, (int)nStripes),
-                  ThresholdRunner(src, dst, nStripes, thresh, maxval, type));
+    parallel_for_(Range(0, dst.rows),
+                  ThresholdRunner(src, dst, thresh, maxval, type),
+                  dst.total()/(double)(1<<16));
      return thresh;
  }
  
diff --git a/modules/video/src/bgfg_gmg.cpp b/modules/video/src/bgfg_gmg.cpp

index e3574b7..42d6fb1 100644 (file)
--- a/modules/video/src/bgfg_gmg.cpp
+++ b/modules/video/src/bgfg_gmg.cpp
@@ -298,7 +298,7 @@ namespace
  
  void cv::BackgroundSubtractorGMG::operator ()(InputArray _frame, OutputArray _fgmask, double newLearningRate)
  {
-    cv::Mat frame = _frame.getMat();
+    Mat frame = _frame.getMat();
  
      CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F);
      CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4);
@@ -313,16 +313,16 @@ void cv::BackgroundSubtractorGMG::operator ()(InputArray _frame, OutputArray _fg
          initialize(frame.size(), 0.0, frame.depth() == CV_8U ? 255.0 : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0);
  
      _fgmask.create(frameSize_, CV_8UC1);
-    cv::Mat fgmask = _fgmask.getMat();
+    Mat fgmask = _fgmask.getMat();
  
      GMG_LoopBody body(frame, fgmask, nfeatures_, colors_, weights_,
                        maxFeatures, learningRate, numInitializationFrames, quantizationLevels, backgroundPrior, decisionThreshold,
                        maxVal_, minVal_, frameNum_, updateBackgroundModel);
-    cv::parallel_for_(cv::Range(0, frame.rows), body);
+    parallel_for_(Range(0, frame.rows), body, frame.total()/(double)(1<<16));
  
      if (smoothingRadius > 0)
      {
-        cv::medianBlur(fgmask, buf_, smoothingRadius);
+        medianBlur(fgmask, buf_, smoothingRadius);
          cv::swap(fgmask, buf_);
      }
author	Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
	Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)
committer	Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
	Thu, 11 Oct 2012 18:37:14 +0000 (22:37 +0400)
modules/core/include/opencv2/core/core.hpp		patch \| blob \| history
modules/core/src/parallel.cpp		patch \| blob \| history
modules/imgproc/src/color.cpp		patch \| blob \| history
modules/imgproc/src/imgwarp.cpp		patch \| blob \| history
modules/imgproc/src/smooth.cpp		patch \| blob \| history
modules/imgproc/src/thresh.cpp		patch \| blob \| history
modules/video/src/bgfg_gmg.cpp		patch \| blob \| history