TBB version of calcHist
authorDaniil-Osokin <daniil.osokin@itseez.com>
Sat, 6 Oct 2012 14:06:29 +0000 (18:06 +0400)
committerDaniil Osokin <daniil.osokin@itseez.com>
Mon, 17 Dec 2012 07:01:31 +0000 (11:01 +0400)
modules/imgproc/src/histogram.cpp

index edcb240..89699f8 100644 (file)
@@ -165,11 +165,13 @@ static void histPrepareImages( const Mat* images, int nimages, const int* channe
         deltas[dims*2 + 1] = (int)(mask.step/mask.elemSize1());
     }
 
+#ifndef HAVE_TBB
     if( isContinuous )
     {
         imsize.width *= imsize.height;
         imsize.height = 1;
     }
+#endif
 
     if( !ranges )
     {
@@ -207,6 +209,538 @@ static void histPrepareImages( const Mat* images, int nimages, const int* channe
 
 
 ////////////////////////////////// C A L C U L A T E    H I S T O G R A M ////////////////////////////////////
+#ifdef HAVE_TBB
+enum {one = 1, two, three}; // array elements number
+
+template<typename T>
+class calcHist1D_Invoker
+{
+public:
+    calcHist1D_Invoker( const vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                        Mat& hist, const double* _uniranges, int sz, int dims,
+                        Size& imageSize )
+        : mask_(_ptrs[dims]),
+          mstep_(_deltas[dims*2 + 1]),
+          imageWidth_(imageSize.width),
+          histogramSize_(hist.size()), histogramType_(hist.type()),
+          globalHistogram_((tbb::atomic<int>*)hist.data)
+    {
+        p_[0] = ((T**)&_ptrs[0])[0];
+        step_[0] = (&_deltas[0])[1];
+        d_[0] = (&_deltas[0])[0];
+        a_[0] = (&_uniranges[0])[0];
+        b_[0] = (&_uniranges[0])[1];
+        size_[0] = sz;
+    }
+
+    void operator()( const BlockedRange& range ) const
+    {
+        T* p0 = p_[0] + range.begin() * (step_[0] + imageWidth_*d_[0]);
+        uchar* mask = mask_ + range.begin()*mstep_;
+
+        for( int row = range.begin(); row < range.end(); row++, p0 += step_[0] )
+        {
+            if( !mask_ )
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0] )
+                {
+                    int idx = cvFloor(*p0*a_[0] + b_[0]);
+                    if( (unsigned)idx < (unsigned)size_[0] )
+                    {
+                        globalHistogram_[idx].fetch_and_add(1);
+                    }
+                }
+            }
+            else
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0] )
+                {
+                    if( mask[x] )
+                    {
+                        int idx = cvFloor(*p0*a_[0] + b_[0]);
+                        if( (unsigned)idx < (unsigned)size_[0] )
+                        {
+                            globalHistogram_[idx].fetch_and_add(1);
+                        }
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+    }
+
+private:
+    T* p_[one];
+    uchar* mask_;
+    int step_[one];
+    int d_[one];
+    int mstep_;
+    double a_[one];
+    double b_[one];
+    int size_[one];
+    int imageWidth_;
+    Size histogramSize_;
+    int histogramType_;
+    tbb::atomic<int>* globalHistogram_;
+};
+
+template<typename T>
+class calcHist2D_Invoker
+{
+public:
+    calcHist2D_Invoker( const vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                        Mat& hist, const double* _uniranges, const int* size,
+                        int dims, Size& imageSize, size_t* hstep )
+        : mask_(_ptrs[dims]),
+          mstep_(_deltas[dims*2 + 1]),
+          imageWidth_(imageSize.width),
+          histogramSize_(hist.size()), histogramType_(hist.type()),
+          globalHistogram_(hist.data)
+    {
+        p_[0] = ((T**)&_ptrs[0])[0]; p_[1] = ((T**)&_ptrs[0])[1];
+        step_[0] = (&_deltas[0])[1]; step_[1] = (&_deltas[0])[3];
+        d_[0] = (&_deltas[0])[0];    d_[1] = (&_deltas[0])[2];
+        a_[0] = (&_uniranges[0])[0]; a_[1] = (&_uniranges[0])[2];
+        b_[0] = (&_uniranges[0])[1]; b_[1] = (&_uniranges[0])[3];
+        size_[0] = size[0];          size_[1] = size[1];
+        hstep_[0] = hstep[0];
+    }
+
+    void operator()(const BlockedRange& range) const
+    {
+        T* p0 = p_[0] + range.begin()*(step_[0] + imageWidth_*d_[0]);
+        T* p1 = p_[1] + range.begin()*(step_[1] + imageWidth_*d_[1]);
+        uchar* mask = mask_ + range.begin()*mstep_;
+
+        for( int row = range.begin(); row < range.end(); row++, p0 += step_[0], p1 += step_[1] )
+        {
+            if( !mask_ )
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1] )
+                {
+                    int idx0 = cvFloor(*p0*a_[0] + b_[0]);
+                    int idx1 = cvFloor(*p1*a_[1] + b_[1]);
+                    if( (unsigned)idx0 < (unsigned)size_[0] && (unsigned)idx1 < (unsigned)size_[1] )
+                        ( (tbb::atomic<int>*)(globalHistogram_ + hstep_[0]*idx0) )[idx1].fetch_and_add(1);
+                }
+            }
+            else
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1] )
+                {
+                    if( mask[x] )
+                    {
+                        int idx0 = cvFloor(*p0*a_[0] + b_[0]);
+                        int idx1 = cvFloor(*p1*a_[1] + b_[1]);
+                        if( (unsigned)idx0 < (unsigned)size_[0] && (unsigned)idx1 < (unsigned)size_[1] )
+                            ((tbb::atomic<int>*)(globalHistogram_ + hstep_[0]*idx0))[idx1].fetch_and_add(1);
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+    }
+
+private:
+    T* p_[two];
+    uchar* mask_;
+    int step_[two];
+    int d_[two];
+    int mstep_;
+    double a_[two];
+    double b_[two];
+    int size_[two];
+    const int imageWidth_;
+    size_t hstep_[one];
+    Size histogramSize_;
+    int histogramType_;
+    uchar* globalHistogram_;
+};
+
+
+template<typename T>
+class calcHist3D_Invoker
+{
+public:
+    calcHist3D_Invoker( const vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                        Size imsize, Mat& hist, const double* uniranges, int _dims,
+                        size_t* hstep, int* size )
+        : mask_(_ptrs[_dims]),
+          mstep_(_deltas[_dims*2 + 1]),
+          imageWidth_(imsize.width),
+          globalHistogram_(hist.data)
+    {
+        p_[0] = ((T**)&_ptrs[0])[0]; p_[1] = ((T**)&_ptrs[0])[1]; p_[2] = ((T**)&_ptrs[0])[2];
+        step_[0] = (&_deltas[0])[1]; step_[1] = (&_deltas[0])[3]; step_[2] = (&_deltas[0])[5];
+        d_[0] = (&_deltas[0])[0];    d_[1] = (&_deltas[0])[2];    d_[2] = (&_deltas[0])[4];
+        a_[0] = uniranges[0];        a_[1] = uniranges[2];        a_[2] = uniranges[4];
+        b_[0] = uniranges[1];        b_[1] = uniranges[3];        b_[2] = uniranges[5];
+        size_[0] = size[0];          size_[1] = size[1];          size_[2] = size[2];
+        hstep_[0] = hstep[0];        hstep_[1] = hstep[1];
+    }
+
+    void operator()( const BlockedRange& range ) const
+    {
+        T* p0 = p_[0] + range.begin()*(imageWidth_*d_[0] + step_[0]);
+        T* p1 = p_[1] + range.begin()*(imageWidth_*d_[1] + step_[1]);
+        T* p2 = p_[2] + range.begin()*(imageWidth_*d_[2] + step_[2]);
+        uchar* mask = mask_ + range.begin()*mstep_;
+
+        for( int i = range.begin(); i < range.end(); i++, p0 += step_[0], p1 += step_[1], p2 += step_[2] )
+        {
+            if( !mask_ )
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1], p2 += d_[2] )
+                {
+                    int idx0 = cvFloor(*p0*a_[0] + b_[0]);
+                    int idx1 = cvFloor(*p1*a_[1] + b_[1]);
+                    int idx2 = cvFloor(*p2*a_[2] + b_[2]);
+                    if( (unsigned)idx0 < (unsigned)size_[0] &&
+                            (unsigned)idx1 < (unsigned)size_[1] &&
+                            (unsigned)idx2 < (unsigned)size_[2] )
+                    {
+                        ( (tbb::atomic<int>*)(globalHistogram_ + hstep_[0]*idx0 + hstep_[1]*idx1) )[idx2].fetch_and_add(1);
+                    }
+                }
+            }
+            else
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1], p2 += d_[2] )
+                {
+                    if( mask[x] )
+                    {
+                        int idx0 = cvFloor(*p0*a_[0] + b_[0]);
+                        int idx1 = cvFloor(*p1*a_[1] + b_[1]);
+                        int idx2 = cvFloor(*p2*a_[2] + b_[2]);
+                        if( (unsigned)idx0 < (unsigned)size_[0] &&
+                                (unsigned)idx1 < (unsigned)size_[1] &&
+                                (unsigned)idx2 < (unsigned)size_[2] )
+                        {
+                            ( (tbb::atomic<int>*)(globalHistogram_ + hstep_[0]*idx0 + hstep_[1]*idx1) )[idx2].fetch_and_add(1);
+                        }
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+    }
+
+    static bool isFit( const Mat& histogram, const Size imageSize )
+    {
+        return ( imageSize.width * imageSize.height >= 320*240
+                 && histogram.total() >= 8*8*8 );
+    }
+
+private:
+    T* p_[three];
+    uchar* mask_;
+    int step_[three];
+    int d_[three];
+    const int mstep_;
+    double a_[three];
+    double b_[three];
+    int size_[three];
+    int imageWidth_;
+    size_t hstep_[two];
+    uchar* globalHistogram_;
+};
+
+class CalcHist1D_8uInvoker
+{
+public:
+    CalcHist1D_8uInvoker( const vector<uchar*>& ptrs, const vector<int>& deltas,
+                          Size imsize, Mat& hist, int dims, const vector<size_t>& tab,
+                          tbb::mutex* lock )
+        : mask_(ptrs[dims]),
+          mstep_(deltas[dims*2 + 1]),
+          imageWidth_(imsize.width),
+          imageSize_(imsize),
+          histSize_(hist.size()), histType_(hist.type()),
+          tab_((size_t*)&tab[0]),
+          histogramWriteLock_(lock),
+          globalHistogram_(hist.data)
+    {
+        p_[0] = (&ptrs[0])[0];
+        step_[0] = (&deltas[0])[1];
+        d_[0] = (&deltas[0])[0];
+    }
+
+    void operator()( const BlockedRange& range ) const
+    {
+        int localHistogram[256] = { 0, };
+        uchar* mask = mask_;
+        uchar* p0 = p_[0];
+        int x;
+        tbb::mutex::scoped_lock lock;
+
+        if( !mask_ )
+        {
+            int n = (imageWidth_ - 4) / 4 + 1;
+            int tail = imageWidth_ - n*4;
+
+            int xN = 4*n;
+            p0 += (xN*d_[0] + tail*d_[0] + step_[0]) * range.begin();
+        }
+        else
+        {
+            p0 += (imageWidth_*d_[0] + step_[0]) * range.begin();
+            mask += mstep_*range.begin();
+        }
+
+        for( int i = range.begin(); i < range.end(); i++, p0 += step_[0] )
+        {
+            if( !mask_ )
+            {
+                if( d_[0] == 1 )
+                {
+                    for( x = 0; x <= imageWidth_ - 4; x += 4 )
+                    {
+                        int t0 = p0[x], t1 = p0[x+1];
+                        localHistogram[t0]++; localHistogram[t1]++;
+                        t0 = p0[x+2]; t1 = p0[x+3];
+                        localHistogram[t0]++; localHistogram[t1]++;
+                    }
+                    p0 += x;
+                }
+                else
+                {
+                    for( x = 0; x <= imageWidth_ - 4; x += 4 )
+                    {
+                        int t0 = p0[0], t1 = p0[d_[0]];
+                        localHistogram[t0]++; localHistogram[t1]++;
+                        p0 += d_[0]*2;
+                        t0 = p0[0]; t1 = p0[d_[0]];
+                        localHistogram[t0]++; localHistogram[t1]++;
+                        p0 += d_[0]*2;
+                    }
+                }
+
+                for( ; x < imageWidth_; x++, p0 += d_[0] )
+                {
+                    localHistogram[*p0]++;
+                }
+            }
+            else
+            {
+                for( x = 0; x < imageWidth_; x++, p0 += d_[0] )
+                {
+                    if( mask[x] )
+                    {
+                        localHistogram[*p0]++;
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+
+        lock.acquire(*histogramWriteLock_);
+        for(int i = 0; i < 256; i++ )
+        {
+            size_t hidx = tab_[i];
+            if( hidx < OUT_OF_RANGE )
+            {
+                *(int*)((globalHistogram_ + hidx)) += localHistogram[i];
+            }
+        }
+        lock.release();
+    }
+
+    static bool isFit( const Mat& histogram, const Size imageSize )
+    {
+        return ( histogram.total() >= 8
+                && imageSize.width * imageSize.height >= 160*120 );
+    }
+
+private:
+    uchar* p_[one];
+    uchar* mask_;
+    int mstep_;
+    int step_[one];
+    int d_[one];
+    int imageWidth_;
+    Size imageSize_;
+    Size histSize_;
+    int histType_;
+    size_t* tab_;
+    tbb::mutex* histogramWriteLock_;
+    uchar* globalHistogram_;
+};
+
+class CalcHist2D_8uInvoker
+{
+public:
+    CalcHist2D_8uInvoker( const vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                          Size imsize, Mat& hist, int dims, const vector<size_t>& _tab,
+                          tbb::mutex* lock )
+        : mask_(_ptrs[dims]),
+          mstep_(_deltas[dims*2 + 1]),
+          imageWidth_(imsize.width),
+          histSize_(hist.size()), histType_(hist.type()),
+          tab_((size_t*)&_tab[0]),
+          histogramWriteLock_(lock),
+          globalHistogram_(hist.data)
+    {
+        p_[0] = (uchar*)(&_ptrs[0])[0]; p_[1] = (uchar*)(&_ptrs[0])[1];
+        step_[0] = (&_deltas[0])[1];    step_[1] = (&_deltas[0])[3];
+        d_[0] = (&_deltas[0])[0];       d_[1] = (&_deltas[0])[2];
+    }
+
+    void operator()( const BlockedRange& range ) const
+    {
+        uchar* p0 = p_[0] + range.begin()*(step_[0] + imageWidth_*d_[0]);
+        uchar* p1 = p_[1] + range.begin()*(step_[1] + imageWidth_*d_[1]);
+        uchar* mask = mask_ + range.begin()*mstep_;
+
+        Mat localHist = Mat::zeros(histSize_, histType_);
+        uchar* localHistData = localHist.data;
+        tbb::mutex::scoped_lock lock;
+
+        for(int i = range.begin(); i < range.end(); i++, p0 += step_[0], p1 += step_[1])
+        {
+            if( !mask_ )
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1] )
+                {
+                    size_t idx = tab_[*p0] + tab_[*p1 + 256];
+                    if( idx < OUT_OF_RANGE )
+                    {
+                        ++*(int*)(localHistData + idx);
+                    }
+                }
+            }
+            else
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1] )
+                {
+                    size_t idx;
+                    if( mask[x] && (idx = tab_[*p0] + tab_[*p1 + 256]) < OUT_OF_RANGE )
+                    {
+                        ++*(int*)(localHistData + idx);
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+
+        lock.acquire(*histogramWriteLock_);
+        for(int i = 0; i < histSize_.width*histSize_.height; i++)
+        {
+            ((int*)globalHistogram_)[i] += ((int*)localHistData)[i];
+        }
+        lock.release();
+    }
+
+    static bool isFit( const Mat& histogram, const Size imageSize )
+    {
+        return ( (histogram.total() > 4*4 &&  histogram.total() <= 116*116
+                  && imageSize.width * imageSize.height >= 320*240)
+                 || (histogram.total() > 116*116 && imageSize.width * imageSize.height >= 1280*720) );
+    }
+
+private:
+    uchar* p_[two];
+    uchar* mask_;
+    int step_[two];
+    int d_[two];
+    int mstep_;
+    int imageWidth_;
+    Size histSize_;
+    int histType_;
+    size_t* tab_;
+    tbb::mutex* histogramWriteLock_;
+    uchar* globalHistogram_;
+};
+
+class CalcHist3D_8uInvoker
+{
+public:
+    CalcHist3D_8uInvoker( const vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                          Size imsize, Mat& hist, int dims, const vector<size_t>& tab )
+        : mask_(_ptrs[dims]),
+          mstep_(_deltas[dims*2 + 1]),
+          histogramSize_(hist.size.p), histogramType_(hist.type()),
+          imageWidth_(imsize.width),
+          tab_((size_t*)&tab[0]),
+          globalHistogram_(hist.data)
+    {
+        p_[0] = (uchar*)(&_ptrs[0])[0]; p_[1] = (uchar*)(&_ptrs[0])[1]; p_[2] = (uchar*)(&_ptrs[0])[2];
+        step_[0] = (&_deltas[0])[1];    step_[1] = (&_deltas[0])[3];    step_[2] = (&_deltas[0])[5];
+        d_[0] = (&_deltas[0])[0];       d_[1] = (&_deltas[0])[2];       d_[2] = (&_deltas[0])[4];
+    }
+
+    void operator()( const BlockedRange& range ) const
+    {
+        uchar* p0 = p_[0] + range.begin()*(step_[0] + imageWidth_*d_[0]);
+        uchar* p1 = p_[1] + range.begin()*(step_[1] + imageWidth_*d_[1]);
+        uchar* p2 = p_[2] + range.begin()*(step_[2] + imageWidth_*d_[2]);
+        uchar* mask = mask_ + range.begin()*mstep_;
+
+        for(int i = range.begin(); i < range.end(); i++, p0 += step_[0], p1 += step_[1], p2 += step_[2] )
+        {
+            if( !mask_ )
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1], p2 += d_[2] )
+                {
+                    size_t idx = tab_[*p0] + tab_[*p1 + 256] + tab_[*p2 + 512];
+                    if( idx < OUT_OF_RANGE )
+                    {
+                        ( *(tbb::atomic<int>*)(globalHistogram_ + idx) ).fetch_and_add(1);
+                    }
+                }
+            }
+            else
+            {
+                for( int x = 0; x < imageWidth_; x++, p0 += d_[0], p1 += d_[1], p2 += d_[2] )
+                {
+                    size_t idx;
+                    if( mask[x] && (idx = tab_[*p0] + tab_[*p1 + 256] + tab_[*p2 + 512]) < OUT_OF_RANGE )
+                    {
+                        (*(tbb::atomic<int>*)(globalHistogram_ + idx)).fetch_and_add(1);
+                    }
+                }
+                mask += mstep_;
+            }
+        }
+    }
+
+    static bool isFit( const Mat& histogram, const Size imageSize )
+    {
+        return ( histogram.total() >= 128*128*128
+                 && imageSize.width * imageSize.width >= 320*240 );
+    }
+
+private:
+    uchar* p_[three];
+    uchar* mask_;
+    int mstep_;
+    int step_[three];
+    int d_[three];
+    int* histogramSize_;
+    int histogramType_;
+    int imageWidth_;
+    size_t* tab_;
+    uchar* globalHistogram_;
+};
+
+static void
+callCalcHist2D_8u( vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                   Size imsize, Mat& hist, int dims,  vector<size_t>& _tab )
+{
+    int grainSize = imsize.height / tbb::task_scheduler_init::default_num_threads();
+    tbb::mutex histogramWriteLock;
+
+    CalcHist2D_8uInvoker body(_ptrs, _deltas, imsize, hist, dims, _tab, &histogramWriteLock);
+    parallel_for(BlockedRange(0, imsize.height, grainSize), body);
+}
+
+static void
+callCalcHist3D_8u( vector<uchar*>& _ptrs, const vector<int>& _deltas,
+                   Size imsize, Mat& hist, int dims,  vector<size_t>& _tab )
+{
+    CalcHist3D_8uInvoker body(_ptrs, _deltas, imsize, hist, dims, _tab);
+    parallel_for(BlockedRange(0, imsize.height), body);
+}
+#endif
 
 template<typename T> static void
 calcHist_( vector<uchar*>& _ptrs, const vector<int>& _deltas,
@@ -234,6 +768,11 @@ calcHist_( vector<uchar*>& _ptrs, const vector<int>& _deltas,
 
         if( dims == 1 )
         {
+#ifdef HAVE_TBB
+            calcHist1D_Invoker<T> body(_ptrs, _deltas, hist, _uniranges, size[0], dims, imsize);
+            parallel_for(BlockedRange(0, imsize.height), body);
+            return;
+#endif
             double a = uniranges[0], b = uniranges[1];
             int sz = size[0], d0 = deltas[0], step0 = deltas[1];
             const T* p0 = (const T*)ptrs[0];
@@ -259,6 +798,11 @@ calcHist_( vector<uchar*>& _ptrs, const vector<int>& _deltas,
         }
         else if( dims == 2 )
         {
+#ifdef HAVE_TBB
+            calcHist2D_Invoker<T> body(_ptrs, _deltas, hist, _uniranges, size, dims, imsize, hstep);
+            parallel_for(BlockedRange(0, imsize.height), body);
+            return;
+#endif
             double a0 = uniranges[0], b0 = uniranges[1], a1 = uniranges[2], b1 = uniranges[3];
             int sz0 = size[0], sz1 = size[1];
             int d0 = deltas[0], step0 = deltas[1],
@@ -290,6 +834,14 @@ calcHist_( vector<uchar*>& _ptrs, const vector<int>& _deltas,
         }
         else if( dims == 3 )
         {
+#ifdef HAVE_TBB
+            if( calcHist3D_Invoker<T>::isFit(hist, imsize) )
+            {
+                calcHist3D_Invoker<T> body(_ptrs, _deltas, imsize, hist, uniranges, dims, hstep, size);
+                parallel_for(BlockedRange(0, imsize.height), body);
+                return;
+            }
+#endif
             double a0 = uniranges[0], b0 = uniranges[1],
                    a1 = uniranges[2], b1 = uniranges[3],
                    a2 = uniranges[4], b2 = uniranges[5];
@@ -441,8 +993,20 @@ calcHist_8u( vector<uchar*>& _ptrs, const vector<int>& _deltas,
 
     if( dims == 1 )
     {
+#ifdef HAVE_TBB
+        if( CalcHist1D_8uInvoker::isFit(hist, imsize) )
+        {
+            int treadsNumber = tbb::task_scheduler_init::default_num_threads();
+            int grainSize = imsize.height/treadsNumber;
+            tbb::mutex histogramWriteLock;
+
+            CalcHist1D_8uInvoker body(_ptrs, _deltas, imsize, hist, dims, _tab, &histogramWriteLock);
+            parallel_for(BlockedRange(0, imsize.height, grainSize), body);
+            return;
+        }
+#endif
         int d0 = deltas[0], step0 = deltas[1];
-        int matH[256] = {0};
+        int matH[256] = { 0, };
         const uchar* p0 = (const uchar*)ptrs[0];
 
         for( ; imsize.height--; p0 += step0, mask += mstep )
@@ -489,6 +1053,13 @@ calcHist_8u( vector<uchar*>& _ptrs, const vector<int>& _deltas,
     }
     else if( dims == 2 )
     {
+#ifdef HAVE_TBB
+        if( CalcHist2D_8uInvoker::isFit(hist, imsize) )
+        {
+            callCalcHist2D_8u(_ptrs, _deltas, imsize, hist, dims, _tab);
+            return;
+        }
+#endif
         int d0 = deltas[0], step0 = deltas[1],
             d1 = deltas[2], step1 = deltas[3];
         const uchar* p0 = (const uchar*)ptrs[0];
@@ -514,6 +1085,13 @@ calcHist_8u( vector<uchar*>& _ptrs, const vector<int>& _deltas,
     }
     else if( dims == 3 )
     {
+#ifdef HAVE_TBB
+        if( CalcHist3D_8uInvoker::isFit(hist, imsize) )
+        {
+            callCalcHist3D_8u(_ptrs, _deltas, imsize, hist, dims, _tab);
+            return;
+        }
+#endif
         int d0 = deltas[0], step0 = deltas[1],
             d1 = deltas[2], step1 = deltas[3],
             d2 = deltas[4], step2 = deltas[5];