From 30593ee55e684a8b2a179c4bbbb49ab7d16e14cb Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Mon, 3 Feb 2014 15:12:27 +0400
Subject: [PATCH] 3rd attempt to prepare patch with improved OpenCL kernels of
 CascadeClassifier.

---
 modules/core/src/ocl.cpp                         |   11 +-
 modules/objdetect/doc/cascade_classification.rst |  106 --
 modules/objdetect/include/opencv2/objdetect.hpp  |   23 -
 modules/objdetect/perf/opencl/perf_cascades.cpp  |    6 +-
 modules/objdetect/src/cascadedetect.cpp          | 1117 ++++++++++------------
 modules/objdetect/src/cascadedetect.hpp          |  246 ++---
 modules/objdetect/src/opencl/cascadedetect.cl    |  671 +++++++++++--
 modules/objdetect/test/test_cascadeandhog.cpp    |    1 +
 samples/cpp/ufacedetect.cpp                      |    9 +-
 9 files changed, 1183 insertions(+), 1007 deletions(-)
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index e45f06a..8e6817b 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -615,7 +615,7 @@ static void* initOpenCLAndLoad(const char* funcname)
             initialized = true;
             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
             if( g_haveOpenCL )
-                fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath);
+                fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
             else
                 fprintf(stderr, "Failed to load OpenCL runtime\n");
         }
@@ -1335,11 +1335,13 @@ inline bool operator < (const HashKey& h1, const HashKey& h2)
     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
 }
 
-static bool g_isOpenCLInitialized = false;
-static bool g_isOpenCLAvailable = false;
 
 bool haveOpenCL()
 {
+#ifdef HAVE_OPENCL
+    static bool g_isOpenCLInitialized = false;
+    static bool g_isOpenCLAvailable = false;
+
     if (!g_isOpenCLInitialized)
     {
         try
@@ -1354,6 +1356,9 @@ bool haveOpenCL()
         g_isOpenCLInitialized = true;
     }
     return g_isOpenCLAvailable;
+#else
+    return false;
+#endif
 }
 
 bool useOpenCL()
diff --git a/modules/objdetect/doc/cascade_classification.rst b/modules/objdetect/doc/cascade_classification.rst
index b10887b..11c9906 100644
--- a/modules/objdetect/doc/cascade_classification.rst
+++ b/modules/objdetect/doc/cascade_classification.rst
@@ -32,112 +32,6 @@ The following reference is for the detection part only. There is a separate appl
 .. [Lienhart02] Rainer Lienhart and Jochen Maydt. An Extended Set of Haar-like Features for Rapid Object Detection. IEEE ICIP 2002, Vol. 1, pp. 900-903, Sep. 2002. This paper, as well as the extended technical report, can be retrieved at http://www.multimedia-computing.de/mediawiki//images/5/52/MRL-TR-May02-revised-Dec02.pdf
 
 
-FeatureEvaluator
-----------------
-.. ocv:class:: FeatureEvaluator
-
-Base class for computing feature values in cascade classifiers. ::
-
-    class CV_EXPORTS FeatureEvaluator
-    {
-    public:
-        enum { HAAR = 0, LBP = 1 }; // supported feature types
-        virtual ~FeatureEvaluator(); // destructor
-        virtual bool read(const FileNode& node);
-        virtual Ptr<FeatureEvaluator> clone() const;
-        virtual int getFeatureType() const;
-
-        virtual bool setImage(const Mat& img, Size origWinSize);
-        virtual bool setWindow(Point p);
-
-        virtual double calcOrd(int featureIdx) const;
-        virtual int calcCat(int featureIdx) const;
-
-        static Ptr<FeatureEvaluator> create(int type);
-    };
-
-
-FeatureEvaluator::read
---------------------------
-Reads parameters of features from the ``FileStorage`` node.
-
-.. ocv:function:: bool FeatureEvaluator::read(const FileNode& node)
-
-    :param node: File node from which the feature parameters are read.
-
-
-
-FeatureEvaluator::clone
----------------------------
-Returns a full copy of the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::clone() const
-
-
-
-FeatureEvaluator::getFeatureType
-------------------------------------
-Returns the feature type (``HAAR`` or ``LBP`` for now).
-
-.. ocv:function:: int FeatureEvaluator::getFeatureType() const
-
-
-FeatureEvaluator::setImage
-------------------------------
-Assigns an image to feature evaluator.
-
-.. ocv:function:: bool FeatureEvaluator::setImage(InputArray img, Size origWinSize, Size sumSize)
-
-    :param img: Matrix of the type ``CV_8UC1`` containing an image where the features are computed.
-
-    :param origWinSize: Size of training images.
-
-    :param sumSize: The requested size of integral images (so if the integral image is smaller, it resides in the top-left corner of the larger image of requested size). Because the features are represented using offsets from the image origin, using the same sumSize for all scales helps to avoid constant readjustments of the features to different scales.
-
-The method assigns an image, where the features will be computed, to the feature evaluator.
-
-
-
-FeatureEvaluator::setWindow
--------------------------------
-Assigns a window in the current image where the features will be computed.
-
-.. ocv:function:: bool FeatureEvaluator::setWindow(Point p)
-
-    :param p: Upper left point of the window where the features are computed. Size of the window is equal to the size of training images.
-
-FeatureEvaluator::calcOrd
------------------------------
-Computes the value of an ordered (numerical) feature.
-
-.. ocv:function:: double FeatureEvaluator::calcOrd(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed value of an ordered feature.
-
-
-
-FeatureEvaluator::calcCat
------------------------------
-Computes the value of a categorical feature.
-
-.. ocv:function:: int FeatureEvaluator::calcCat(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed label of a categorical feature, which is the value from [0,... (number of categories - 1)].
-
-
-FeatureEvaluator::create
-----------------------------
-Constructs the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::create(int type)
-
-    :param type: Type of features evaluated by cascade (``HAAR`` or ``LBP`` for now).
-
-
 CascadeClassifier
 -----------------
 .. ocv:class:: CascadeClassifier
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index b9ba2b9..07f1cb9 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -121,29 +121,6 @@ CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::ve
                                             std::vector<double>& foundScales,
                                             double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
 
-class CV_EXPORTS FeatureEvaluator
-{
-public:
-    enum { HAAR = 0,
-           LBP  = 1,
-           HOG  = 2
-         };
-
-    virtual ~FeatureEvaluator();
-
-    virtual bool read(const FileNode& node);
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const;
-
-    virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point p);
-
-    virtual double calcOrd(int featureIdx) const;
-    virtual int calcCat(int featureIdx) const;
-
-    static Ptr<FeatureEvaluator> create(int type);
-};
-
 template<> CV_EXPORTS void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const;
 
 enum { CASCADE_DO_CANNY_PRUNING    = 1,
diff --git a/modules/objdetect/perf/opencl/perf_cascades.cpp b/modules/objdetect/perf/opencl/perf_cascades.cpp
index bf600a0..8a310ef 100644
--- a/modules/objdetect/perf/opencl/perf_cascades.cpp
+++ b/modules/objdetect/perf/opencl/perf_cascades.cpp
@@ -24,14 +24,14 @@ OCL_PERF_TEST_P(Cascade_Image_MinSize, CascadeClassifier,
                                      string("cv/cascadeandhog/images/class57.png") ),
                     testing::Values(30, 64, 90) ) )
 {
-    const string cascasePath = get<0>(GetParam());
+    const string cascadePath = get<0>(GetParam());
     const string imagePath   = get<1>(GetParam());
     int min_size = get<2>(GetParam());
     Size minSize(min_size, min_size);
 
-    CascadeClassifier cc( getDataPath(cascasePath) );
+    CascadeClassifier cc( getDataPath(cascadePath) );
     if (cc.empty())
-        FAIL() << "Can't load cascade file: " << getDataPath(cascasePath);
+        FAIL() << "Can't load cascade file: " << getDataPath(cascadePath);
 
     Mat img = imread(getDataPath(imagePath), IMREAD_GRAYSCALE);
     if (img.empty())
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 46f2fe1..15a1ca2 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -46,71 +46,6 @@
 #include "opencv2/objdetect/objdetect_c.h"
 #include "opencl_kernels.hpp"
 
-#if defined (LOG_CASCADE_STATISTIC)
-struct Logger
-{
-    enum { STADIES_NUM = 20 };
-
-    int gid;
-    cv::Mat mask;
-    cv::Size sz0;
-    int step;
-
-
-    Logger() : gid (0), step(2) {}
-    void setImage(const cv::Mat& image)
-    {
-     if (gid == 0)
-         sz0 = image.size();
-
-      mask.create(image.rows, image.cols * (STADIES_NUM + 1) + STADIES_NUM, CV_8UC1);
-      mask = cv::Scalar(0);
-      cv::Mat roi = mask(cv::Rect(cv::Point(0,0), image.size()));
-      image.copyTo(roi);
-
-      printf("%d) Size = (%d, %d)\n", gid, image.cols, image.rows);
-
-      for(int i = 0; i < STADIES_NUM; ++i)
-      {
-          int x = image.cols + i * (image.cols + 1);
-          cv::line(mask, cv::Point(x, 0), cv::Point(x, mask.rows-1), cv::Scalar(255));
-      }
-
-      if (sz0.width/image.cols > 2 && sz0.height/image.rows > 2)
-          step = 1;
-    }
-
-    void setPoint(const cv::Point& p, int passed_stadies)
-    {
-        int cols = mask.cols / (STADIES_NUM + 1);
-
-        passed_stadies = -passed_stadies;
-        passed_stadies = (passed_stadies == -1) ? STADIES_NUM : passed_stadies;
-
-        unsigned char* ptr = mask.ptr<unsigned char>(p.y) + cols + 1 + p.x;
-        for(int i = 0; i < passed_stadies; ++i, ptr += cols + 1)
-        {
-            *ptr = 255;
-
-            if (step == 2)
-            {
-                ptr[1] = 255;
-                ptr[mask.step] = 255;
-                ptr[mask.step + 1] = 255;
-            }
-        }
-    };
-
-    void write()
-    {
-        char buf[4096];
-        sprintf(buf, "%04d.png", gid++);
-        cv::imwrite(buf, mask);
-    }
-
-} logger;
-#endif
-
 namespace cv
 {
 
@@ -121,7 +56,8 @@ template<typename _Tp> void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um
     Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um);
 }
 
-void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps, std::vector<int>* weights, std::vector<double>* levelWeights)
+void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps,
+                     std::vector<int>* weights, std::vector<double>* levelWeights)
 {
     if( groupThreshold <= 0 || rectList.empty() )
     {
@@ -426,7 +362,8 @@ void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& weights, int
     groupRectangles(rectList, groupThreshold, eps, &weights, 0);
 }
 //used for cascade detection algorithm for ROC-curve calculating
-void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels, std::vector<double>& levelWeights, int groupThreshold, double eps)
+void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels,
+                     std::vector<double>& levelWeights, int groupThreshold, double eps)
 {
     groupRectangles(rectList, groupThreshold, eps, &rejectLevels, &levelWeights);
 }
@@ -439,14 +376,138 @@ void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>&
 
 
 FeatureEvaluator::~FeatureEvaluator() {}
-bool FeatureEvaluator::read(const FileNode&) {return true;}
+
+// Base-class reset: stores the window size, zeroes localSize/lbufSize, empties scaleData.
+bool FeatureEvaluator::read(const FileNode&, Size _origWinSize)
+{
+    if (scaleData.empty())
+        scaleData = makePtr<std::vector<ScaleData> >();
+    scaleData->clear();  // no-op when the vector was just created
+    lbufSize = localSize = Size(0, 0);
+    origWinSize = _origWinSize;
+    return true;
+}
+
 Ptr<FeatureEvaluator> FeatureEvaluator::clone() const { return Ptr<FeatureEvaluator>(); }
 int FeatureEvaluator::getFeatureType() const {return -1;}
-bool FeatureEvaluator::setImage(InputArray, Size, Size) {return true;}
-bool FeatureEvaluator::setWindow(Point) { return true; }
-double FeatureEvaluator::calcOrd(int) const { return 0.; }
+bool FeatureEvaluator::setWindow(Point, int) { return true; }
+// Returns the three UMat buffers in bufs, in the order {uscaleData, usbuf, ufbuf};
+// sbuf is first copied into usbuf when sbufFlag lacks the USBUF_VALID bit.
+void FeatureEvaluator::getUMats(std::vector<UMat>& bufs)
+{
+    const bool usbufStale = (sbufFlag & USBUF_VALID) == 0;
+    if (usbufStale)
+    {
+        sbuf.copyTo(usbuf);
+        sbufFlag |= USBUF_VALID;
+    }
+    const UMat ordered[] = { uscaleData, usbuf, ufbuf };
+    bufs.assign(ordered, ordered + 3);
+}
+
+// Copies usbuf into sbuf and sets SBUF_VALID, unless sbufFlag already has that bit set.
+void FeatureEvaluator::getMats()
+{
+    if (sbufFlag & SBUF_VALID)
+        return;
+    usbuf.copyTo(sbuf);
+    sbufFlag |= SBUF_VALID;
+}
+
+float FeatureEvaluator::calcOrd(int) const { return 0.; }
 int FeatureEvaluator::calcCat(int) const { return 0; }
 
+// Packs one layer per scale into sbuf (left to right, in rows) and records each layer's
+// scale, ystep, size and offset in scaleData; sbufSize only grows. _scales must not be
+// empty. Returns true if the scale count, any scale value or sbufSize changed.
+bool FeatureEvaluator::updateScaleData( Size imgsz, const std::vector<float>& _scales )
+{
+    CV_Assert( !_scales.empty() ); // _scales[0] is read below
+    if( scaleData.empty() )
+        scaleData = makePtr<std::vector<ScaleData> >();
+    size_t i, nscales = _scales.size();
+    bool recalcOptFeatures = nscales != scaleData->size();
+    scaleData->resize(nscales);
+    int layer_dy = 0; // height of the first layer in the current row
+    Point layer_ofs(0,0);
+    Size prevBufSize = sbufSize;
+    sbufSize.width = std::max(sbufSize.width, (int)alignSize(cvRound(imgsz.width/_scales[0]) + 31, 32));
+    recalcOptFeatures = recalcOptFeatures || sbufSize.width != prevBufSize.width;
+
+    for( i = 0; i < nscales; i++ )
+    {
+        FeatureEvaluator::ScaleData& s = scaleData->at(i);
+        if( !recalcOptFeatures && fabs(s.scale - _scales[i]) > FLT_EPSILON*100*_scales[i] )
+            recalcOptFeatures = true;
+        float sc = _scales[i];
+        Size sz(cvRound(imgsz.width/sc), cvRound(imgsz.height/sc));
+        s.ystep = sc >= 2 ? 1 : 2;
+        s.scale = sc;
+        s.szi = Size(sz.width+1, sz.height+1);
+        // Open a row for the first layer (so layer_dy is its height, not 0) or when the row is full.
+        if( i == 0 || layer_ofs.x + s.szi.width > sbufSize.width )
+        {
+            layer_ofs = Point(0, layer_ofs.y + layer_dy);
+            layer_dy = s.szi.height;
+        }
+        s.layer_ofs = layer_ofs.y*sbufSize.width + layer_ofs.x;
+        layer_ofs.x += s.szi.width;
+    }
+    layer_ofs.y += layer_dy;
+    sbufSize.height = std::max(sbufSize.height, layer_ofs.y);
+    recalcOptFeatures = recalcOptFeatures || sbufSize.height != prevBufSize.height;
+    return recalcOptFeatures;
+}
+
+// Resizes _image to each scale in _scales and runs computeChannels() on every result; always returns true.
+bool FeatureEvaluator::setImage( InputArray _image, const std::vector<float>& _scales )
+{
+    Size imgsz = _image.size();
+    bool recalcOptFeatures = updateScaleData(imgsz, _scales);
+    // sz0: size of the resize scratch buffer, taken from scale 0 and never smaller than the current rbuf
+    size_t i, nscales = scaleData->size();
+    Size sz0 = scaleData->at(0).szi;
+    sz0 = Size(std::max(rbuf.cols, (int)alignSize(sz0.width, 16)), std::max(rbuf.rows, sz0.height));
+    // the scale layout changed: rebuild the optimized features and refresh uscaleData
+    if (recalcOptFeatures)
+    {
+        computeOptFeatures();
+        copyVectorToUMat(*scaleData, uscaleData);
+    }
+    // UMat input with a non-empty localSize goes through the UMat buffers, everything else through Mat
+    if (_image.isUMat() && localSize.area() > 0)
+    {
+        usbuf.create(sbufSize.height*nchannels, sbufSize.width, CV_32S);
+        urbuf.create(sz0, CV_8U);
+
+        for (i = 0; i < nscales; i++)
+        {
+            const ScaleData& s = scaleData->at(i);
+            UMat dst(urbuf, Rect(0, 0, s.szi.width - 1, s.szi.height - 1));
+            resize(_image, dst, dst.size(), 1. / s.scale, 1. / s.scale, INTER_LINEAR);
+            computeChannels((int)i, dst);
+        }
+        sbufFlag = USBUF_VALID;
+    }
+    else
+    {
+        Mat image = _image.getMat();
+        sbuf.create(sbufSize.height*nchannels, sbufSize.width, CV_32S);
+        rbuf.create(sz0, CV_8U);
+
+        for (i = 0; i < nscales; i++)
+        {
+            const ScaleData& s = scaleData->at(i);
+            Mat dst(s.szi.height - 1, s.szi.width - 1, CV_8U, rbuf.data);
+            resize(image, dst, dst.size(), 1. / s.scale, 1. / s.scale, INTER_LINEAR);
+            computeChannels((int)i, dst);
+        }
+        sbufFlag = SBUF_VALID;
+    }
+
+    return true;
+}
+
 //----------------------------------------------  HaarEvaluator ---------------------------------------
 
 bool HaarEvaluator::Feature :: read( const FileNode& node )
@@ -476,24 +537,32 @@ HaarEvaluator::HaarEvaluator()
 {
     optfeaturesPtr = 0;
     pwin = 0;
+    localSize = Size(4, 2);
+    lbufSize = Size(0, 0);
+    nchannels = 0;
 }
+
 HaarEvaluator::~HaarEvaluator()
 {
 }
 
-bool HaarEvaluator::read(const FileNode& node)
+bool HaarEvaluator::read(const FileNode& node, Size _origWinSize)
 {
+    if (!FeatureEvaluator::read(node, _origWinSize))
+        return false;
     size_t i, n = node.size();
     CV_Assert(n > 0);
     if(features.empty())
         features = makePtr<std::vector<Feature> >();
     if(optfeatures.empty())
         optfeatures = makePtr<std::vector<OptFeature> >();
+    if (optfeatures_lbuf.empty())
+        optfeatures_lbuf = makePtr<std::vector<OptFeature> >();
     features->resize(n);
     FileNodeIterator it = node.begin();
     hasTiltedFeatures = false;
     std::vector<Feature>& ff = *features;
-    sumSize0 = Size();
+    sbufSize = Size();
     ufbuf.release();
 
     for(i = 0; i < n; i++, ++it)
@@ -503,143 +572,148 @@ bool HaarEvaluator::read(const FileNode& node)
         if( ff[i].tilted )
             hasTiltedFeatures = true;
     }
+    nchannels = hasTiltedFeatures ? 3 : 2;
+    normrect = Rect(1, 1, origWinSize.width - 2, origWinSize.height - 2);
+
+    if (ocl::haveOpenCL())
+    {
+        String vname = ocl::Device::getDefault().vendor();
+        if (vname == "Advanced Micro Devices, Inc." ||
+            vname == "AMD")
+            localSize = Size(8, 8);
+        lbufSize = Size(origWinSize.width + localSize.width,
+                        origWinSize.height + localSize.height);
+        if (lbufSize.area() > 1024)
+            lbufSize = Size(0, 0);
+    }
+
     return true;
 }
 
 Ptr<FeatureEvaluator> HaarEvaluator::clone() const
 {
     Ptr<HaarEvaluator> ret = makePtr<HaarEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->optfeatures = optfeatures;
-    ret->optfeaturesPtr = optfeatures->empty() ? 0 : &(*(ret->optfeatures))[0];
-    ret->hasTiltedFeatures = hasTiltedFeatures;
-    ret->sum0 = sum0; ret->sqsum0 = sqsum0;
-    ret->sum = sum; ret->sqsum = sqsum;
-    ret->usum0 = usum0; ret->usqsum0 = usqsum0; ret->ufbuf = ufbuf;
-    ret->normrect = normrect;
-    memcpy( ret->nofs, nofs, 4*sizeof(nofs[0]) );
-    ret->pwin = pwin;
-    ret->varianceNormFactor = varianceNormFactor;
+    *ret = *this;
     return ret;
 }
 
-bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSize )
-{
-    Size imgsz = _image.size();
-    int cols = imgsz.width, rows = imgsz.height;
-
-    if (imgsz.width < origWinSize.width || imgsz.height < origWinSize.height)
-        return false;
-
-    origWinSize = _origWinSize;
-    normrect = Rect(1, 1, origWinSize.width-2, origWinSize.height-2);
 
-    int rn = _sumSize.height, cn = _sumSize.width, rn_scale = hasTiltedFeatures ? 2 : 1;
-    int sumStep, tofs = 0;
-    CV_Assert(rn >= rows+1 && cn >= cols+1);
+void HaarEvaluator::computeChannels(int scaleIdx, InputArray img)
+{
+    const ScaleData& s = scaleData->at(scaleIdx);
+    tofs = (int)sbufSize.area();
+    sqofs = hasTiltedFeatures ? tofs*2 : tofs;
 
-    if( _image.isUMat() )
+    if (img.isUMat())
     {
-        usum0.create(rn*rn_scale, cn, CV_32S);
-        usqsum0.create(rn, cn, CV_32S);
-        usum = UMat(usum0, Rect(0, 0, cols+1, rows+1));
-        usqsum = UMat(usqsum0, Rect(0, 0, cols, rows));
-
-        if( hasTiltedFeatures )
+        int sx = s.layer_ofs % sbufSize.width;
+        int sy = s.layer_ofs / sbufSize.width;
+        int sqy = sy + (sqofs / sbufSize.width);
+        UMat sum(usbuf, Rect(sx, sy, s.szi.width, s.szi.height));
+        UMat sqsum(usbuf, Rect(sx, sqy, s.szi.width, s.szi.height));
+        sqsum.flags = (sqsum.flags & ~UMat::DEPTH_MASK) | CV_32F;
+
+        if (hasTiltedFeatures)
         {
-            UMat utilted(usum0, Rect(0, _sumSize.height, cols+1, rows+1));
-            integral(_image, usum, noArray(), utilted, CV_32S);
-            tofs = (int)((utilted.offset - usum.offset)/sizeof(int));
+            int sty = sy + (tofs / sbufSize.width);
+            UMat tilted(usbuf, Rect(sx, sty, s.szi.width, s.szi.height));
+            integral(img, sum, sqsum, tilted, CV_32S, CV_32F);
         }
         else
         {
-            integral(_image, usum, noArray(), noArray(), CV_32S);
+            UMatData* u = sqsum.u;
+            integral(img, sum, sqsum, noArray(), CV_32S, CV_32F);
+            CV_Assert(sqsum.u == u && sqsum.size() == s.szi && sqsum.type()==CV_32F);
         }
-
-        sqrBoxFilter(_image, usqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        /*sqrBoxFilter(_image.getMat(), sqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        sqsum.copyTo(usqsum);*/
-        sumStep = (int)(usum.step/usum.elemSize());
     }
     else
     {
-        sum0.create(rn*rn_scale, cn, CV_32S);
-        sqsum0.create(rn, cn, CV_32S);
-        sum = sum0(Rect(0, 0, cols+1, rows+1));
-        sqsum = sqsum0(Rect(0, 0, cols, rows));
+        Mat sum(s.szi, CV_32S, sbuf.ptr<int>() + s.layer_ofs, sbuf.step);
+        Mat sqsum(s.szi, CV_32F, sum.ptr<int>() + sqofs, sbuf.step);
 
-        if( hasTiltedFeatures )
+        if (hasTiltedFeatures)
         {
-            Mat tilted = sum0(Rect(0, _sumSize.height, cols+1, rows+1));
-            integral(_image, sum, noArray(), tilted, CV_32S);
-            tofs = (int)((tilted.data - sum.data)/sizeof(int));
+            Mat tilted(s.szi, CV_32S, sum.ptr<int>() + tofs, sbuf.step);
+            integral(img, sum, sqsum, tilted, CV_32S, CV_32F);
         }
         else
-            integral(_image, sum, noArray(), noArray(), CV_32S);
-        sqrBoxFilter(_image, sqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        sumStep = (int)(sum.step/sum.elemSize());
+            integral(img, sum, sqsum, noArray(), CV_32S, CV_32F);
     }
+}
 
-    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sumStep );
+void HaarEvaluator::computeOptFeatures()
+{
+    tofs = (int)sbufSize.area(); // same value computeChannels() assigns; needed here because setImage() calls this first
+    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sbufSize.width );
 
     size_t fi, nfeatures = features->size();
     const std::vector<Feature>& ff = *features;
+    optfeatures->resize(nfeatures); // offsets for rows of sbufSize.width
+    optfeaturesPtr = &(*optfeatures)[0];
+    for( fi = 0; fi < nfeatures; fi++ )
+        optfeaturesPtr[fi].setOffsets( ff[fi], sbufSize.width, tofs );
+    optfeatures_lbuf->resize(nfeatures); // second set: rows of lbufSize.width when that is non-zero
 
-    if( sumSize0 != _sumSize )
-    {
-        optfeatures->resize(nfeatures);
-        optfeaturesPtr = &(*optfeatures)[0];
-        for( fi = 0; fi < nfeatures; fi++ )
-            optfeaturesPtr[fi].setOffsets( ff[fi], sumStep, tofs );
-    }
-    if( _image.isUMat() && (sumSize0 != _sumSize || ufbuf.empty()) )
-        copyVectorToUMat(*optfeatures, ufbuf);
-    sumSize0 = _sumSize;
+    for( fi = 0; fi < nfeatures; fi++ )
+        optfeatures_lbuf->at(fi).setOffsets(ff[fi], lbufSize.width > 0 ? lbufSize.width : sbufSize.width, tofs);
 
-    return true;
+    copyVectorToUMat(*optfeatures_lbuf, ufbuf); // only the lbuf variant is copied to ufbuf
 }
 
 
-bool  HaarEvaluator::setWindow( Point pt )
+bool HaarEvaluator::setWindow( Point pt, int scaleIdx )
 {
+    const ScaleData& s = getScaleData(scaleIdx);
+
     if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= sum.cols ||
-        pt.y + origWinSize.height >= sum.rows )
+        pt.x + origWinSize.width >= s.szi.width ||
+        pt.y + origWinSize.height >= s.szi.height )
         return false;
 
-    const int* p = &sum.at<int>(pt);
-    int valsum = CALC_SUM_OFS(nofs, p);
-    double valsqsum = sqsum.at<int>(pt.y + normrect.y, pt.x + normrect.x);
+    pwin = &sbuf.at<int>(pt) + s.layer_ofs;
+    const float* pq = (const float*)(pwin + sqofs);
+    int valsum = CALC_SUM_OFS(nofs, pwin);
+    float valsqsum = CALC_SUM_OFS(nofs, pq);
 
     double nf = (double)normrect.area() * valsqsum - (double)valsum * valsum;
     if( nf > 0. )
         nf = std::sqrt(nf);
     else
         nf = 1.;
-    varianceNormFactor = 1./nf;
-    pwin = p;
+    varianceNormFactor = (float)(1./nf);
 
     return true;
 }
 
+
+// Fills this optimized feature from _f: copies the three rectangle weights and sets
+// ofs[k][0..3] for each rectangle k using `step`. Tilted features go through
+// CV_TILTED_OFS with base offset _tofs, all others through CV_SUM_OFS with base 0.
+void HaarEvaluator::OptFeature::setOffsets( const Feature& _f, int step, int _tofs )
+{
+    const bool useTilted = _f.tilted;
+    for( int k = 0; k < 3; k++ )
+    {
+        weight[k] = _f.rect[k].weight;
+        if( useTilted )
+        {
+            CV_TILTED_OFS( ofs[k][0], ofs[k][1], ofs[k][2], ofs[k][3], _tofs, _f.rect[k].r, step );
+        }
+        else
+        {
+            CV_SUM_OFS( ofs[k][0], ofs[k][1], ofs[k][2], ofs[k][3], 0, _f.rect[k].r, step );
+        }
+    }
+}
+
 Rect HaarEvaluator::getNormRect() const
 {
     return normrect;
 }
 
-void HaarEvaluator::getUMats(std::vector<UMat>& bufs)
+int HaarEvaluator::getSquaresOffset() const
 {
-    bufs.clear();
-    bufs.push_back(usum);
-    bufs.push_back(usqsum);
-    bufs.push_back(ufbuf);
+    return sqofs;
 }
 
 //----------------------------------------------  LBPEvaluator -------------------------------------
@@ -655,15 +729,26 @@ LBPEvaluator::LBPEvaluator()
 {
     features = makePtr<std::vector<Feature> >();
     optfeatures = makePtr<std::vector<OptFeature> >();
+    scaleData = makePtr<std::vector<ScaleData> >();
 }
+
 LBPEvaluator::~LBPEvaluator()
 {
 }
 
-bool LBPEvaluator::read( const FileNode& node )
+bool LBPEvaluator::read( const FileNode& node, Size _origWinSize )
 {
+    if (!FeatureEvaluator::read(node, _origWinSize))
+        return false;
+    if(features.empty())
+        features = makePtr<std::vector<Feature> >();
+    if(optfeatures.empty())
+        optfeatures = makePtr<std::vector<OptFeature> >();
+    if (optfeatures_lbuf.empty())
+        optfeatures_lbuf = makePtr<std::vector<OptFeature> >();
+
     features->resize(node.size());
-    optfeaturesPtr = &(*optfeatures)[0];
+    optfeaturesPtr = 0;
     FileNodeIterator it = node.begin(), it_end = node.end();
     std::vector<Feature>& ff = *features;
     for(int i = 0; it != it_end; ++it, i++)
@@ -671,274 +756,92 @@ bool LBPEvaluator::read( const FileNode& node )
         if(!ff[i].read(*it))
             return false;
     }
+    nchannels = 1;
+    if (ocl::haveOpenCL())
+    {
+        const ocl::Device& device = ocl::Device::getDefault();
+        String vname = device.vendor();
+        if ((vname == "Advanced Micro Devices, Inc." ||
+            vname == "AMD") && !device.hostUnifiedMemory())
+            localSize = Size(8, 8);
+    }
     return true;
 }
 
 Ptr<FeatureEvaluator> LBPEvaluator::clone() const
 {
     Ptr<LBPEvaluator> ret = makePtr<LBPEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->optfeatures = optfeatures;
-    ret->optfeaturesPtr = ret->optfeatures.empty() ? 0 : &(*ret->optfeatures)[0];
-    ret->sum0 = sum0, ret->sum = sum;
-    ret->pwin = pwin;
+    *ret = *this;
     return ret;
 }
 
-bool LBPEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSize )
+void LBPEvaluator::computeChannels(int scaleIdx, InputArray _img)
 {
-    Size imgsz = _image.size();
-    int cols = imgsz.width, rows = imgsz.height;
-
-    if (imgsz.width < origWinSize.width || imgsz.height < origWinSize.height)
-        return false;
-
-    origWinSize = _origWinSize;
+    const ScaleData& s = scaleData->at(scaleIdx);
 
-    int rn = _sumSize.height, cn = _sumSize.width;
-    int sumStep;
-    CV_Assert(rn >= rows+1 && cn >= cols+1);
-
-    if( _image.isUMat() )
+    if (_img.isUMat())
     {
-        usum0.create(rn, cn, CV_32S);
-        usum = UMat(usum0, Rect(0, 0, cols+1, rows+1));
-
-        integral(_image, usum, noArray(), noArray(), CV_32S);
-        sumStep = (int)(usum.step/usum.elemSize());
+        int sx = s.layer_ofs % sbufSize.width;
+        int sy = s.layer_ofs / sbufSize.width;
+        UMat sum(usbuf, Rect(sx, sy, s.szi.width, s.szi.height));
+        integral(_img, sum, noArray(), noArray(), CV_32S);
     }
     else
     {
-        sum0.create(rn, cn, CV_32S);
-        sum = sum0(Rect(0, 0, cols+1, rows+1));
-
-        integral(_image, sum, noArray(), noArray(), CV_32S);
-        sumStep = (int)(sum.step/sum.elemSize());
-    }
-
-    size_t fi, nfeatures = features->size();
-    const std::vector<Feature>& ff = *features;
-
-    if( sumSize0 != _sumSize )
-    {
-        optfeatures->resize(nfeatures);
-        optfeaturesPtr = &(*optfeatures)[0];
-        for( fi = 0; fi < nfeatures; fi++ )
-            optfeaturesPtr[fi].setOffsets( ff[fi], sumStep );
+        Mat sum(s.szi, CV_32S, sbuf.ptr<int>() + s.layer_ofs, sbuf.step);
+        integral(_img, sum, noArray(), noArray(), CV_32S);
     }
-    if( _image.isUMat() && (sumSize0 != _sumSize || ufbuf.empty()) )
-        copyVectorToUMat(*optfeatures, ufbuf);
-    sumSize0 = _sumSize;
-
-    return true;
 }
 
-bool LBPEvaluator::setWindow( Point pt )
+void LBPEvaluator::computeOptFeatures()
 {
-    if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= sum.cols ||
-        pt.y + origWinSize.height >= sum.rows )
-        return false;
-    pwin = &sum.at<int>(pt);
-    return true;
-}
-
+    int sstep = sbufSize.width;
 
-void LBPEvaluator::getUMats(std::vector<UMat>& bufs)
-{
-    bufs.clear();
-    bufs.push_back(usum);
-    bufs.push_back(ufbuf);
-}
-
-//----------------------------------------------  HOGEvaluator ---------------------------------------
-bool HOGEvaluator::Feature :: read( const FileNode& node )
-{
-    FileNode rnode = node[CC_RECT];
-    FileNodeIterator it = rnode.begin();
-    it >> rect[0].x >> rect[0].y >> rect[0].width >> rect[0].height >> featComponent;
-    rect[1].x = rect[0].x + rect[0].width;
-    rect[1].y = rect[0].y;
-    rect[2].x = rect[0].x;
-    rect[2].y = rect[0].y + rect[0].height;
-    rect[3].x = rect[0].x + rect[0].width;
-    rect[3].y = rect[0].y + rect[0].height;
-    rect[1].width = rect[2].width = rect[3].width = rect[0].width;
-    rect[1].height = rect[2].height = rect[3].height = rect[0].height;
-    return true;
+    size_t fi, nfeatures = features->size();
+    const std::vector<Feature>& ff = *features;
+    optfeatures->resize(nfeatures);
+    optfeaturesPtr = nfeatures > 0 ? &(*optfeatures)[0] : 0; // never index element 0 of an empty vector
+    for( fi = 0; fi < nfeatures; fi++ )
+        optfeaturesPtr[fi].setOffsets( ff[fi], sstep ); // offsets are computed with step = sbufSize.width
+    copyVectorToUMat(*optfeatures, ufbuf);
 }
 
-HOGEvaluator::HOGEvaluator()
-{
-    features = makePtr<std::vector<Feature> >();
-}
 
-HOGEvaluator::~HOGEvaluator()
+void LBPEvaluator::OptFeature::setOffsets( const Feature& _f, int step )
 {
-}
+    Rect tr = _f.rect;
+    int w0 = tr.width;
+    int h0 = tr.height;
 
-bool HOGEvaluator::read( const FileNode& node )
-{
-    features->resize(node.size());
-    featuresPtr = &(*features)[0];
-    FileNodeIterator it = node.begin(), it_end = node.end();
-    for(int i = 0; it != it_end; ++it, i++)
-    {
-        if(!featuresPtr[i].read(*it))
-            return false;
-    }
-    return true;
+    CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
+    tr.x += 2*w0;
+    CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
+    tr.y += 2*h0;
+    CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
+    tr.x -= 2*w0;
+    CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
 }
 
-Ptr<FeatureEvaluator> HOGEvaluator::clone() const
-{
-    Ptr<HOGEvaluator> ret = makePtr<HOGEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->featuresPtr = &(*ret->features)[0];
-    ret->offset = offset;
-    ret->hist = hist;
-    ret->normSum = normSum;
-    return ret;
-}
 
-bool HOGEvaluator::setImage( InputArray _image, Size winSize, Size )
+bool LBPEvaluator::setWindow( Point pt, int scaleIdx )
 {
-    Mat image = _image.getMat();
-    int rows = image.rows + 1;
-    int cols = image.cols + 1;
-    origWinSize = winSize;
-    if( image.cols < origWinSize.width || image.rows < origWinSize.height )
-        return false;
-    hist.clear();
-    for( int bin = 0; bin < Feature::BIN_NUM; bin++ )
-    {
-        hist.push_back( Mat(rows, cols, CV_32FC1) );
-    }
-    normSum.create( rows, cols, CV_32FC1 );
+    CV_Assert(0 <= scaleIdx && scaleIdx < (int)scaleData->size());
+    const ScaleData& s = scaleData->at(scaleIdx);
 
-    integralHistogram( image, hist, normSum, Feature::BIN_NUM );
-
-    size_t featIdx, featCount = features->size();
-
-    for( featIdx = 0; featIdx < featCount; featIdx++ )
-    {
-        featuresPtr[featIdx].updatePtrs( hist, normSum );
-    }
-    return true;
-}
-
-bool HOGEvaluator::setWindow(Point pt)
-{
     if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= hist[0].cols-2 ||
-        pt.y + origWinSize.height >= hist[0].rows-2 )
+        pt.x + origWinSize.width >= s.szi.width ||
+        pt.y + origWinSize.height >= s.szi.height )
         return false;
-    offset = pt.y * ((int)hist[0].step/sizeof(float)) + pt.x;
+
+    pwin = &sbuf.at<int>(pt) + s.layer_ofs;
     return true;
 }
 
-void HOGEvaluator::integralHistogram(const Mat &img, std::vector<Mat> &histogram, Mat &norm, int nbins) const
-{
-    CV_Assert( img.type() == CV_8U || img.type() == CV_8UC3 );
-    int x, y, binIdx;
-
-    Size gradSize(img.size());
-    Size histSize(histogram[0].size());
-    Mat grad(gradSize, CV_32F);
-    Mat qangle(gradSize, CV_8U);
-
-    AutoBuffer<int> mapbuf(gradSize.width + gradSize.height + 4);
-    int* xmap = (int*)mapbuf + 1;
-    int* ymap = xmap + gradSize.width + 2;
-
-    const int borderType = (int)BORDER_REPLICATE;
-
-    for( x = -1; x < gradSize.width + 1; x++ )
-        xmap[x] = borderInterpolate(x, gradSize.width, borderType);
-    for( y = -1; y < gradSize.height + 1; y++ )
-        ymap[y] = borderInterpolate(y, gradSize.height, borderType);
-
-    int width = gradSize.width;
-    AutoBuffer<float> _dbuf(width*4);
-    float* dbuf = _dbuf;
-    Mat Dx(1, width, CV_32F, dbuf);
-    Mat Dy(1, width, CV_32F, dbuf + width);
-    Mat Mag(1, width, CV_32F, dbuf + width*2);
-    Mat Angle(1, width, CV_32F, dbuf + width*3);
-
-    float angleScale = (float)(nbins/CV_PI);
-
-    for( y = 0; y < gradSize.height; y++ )
-    {
-        const uchar* currPtr = img.data + img.step*ymap[y];
-        const uchar* prevPtr = img.data + img.step*ymap[y-1];
-        const uchar* nextPtr = img.data + img.step*ymap[y+1];
-        float* gradPtr = (float*)grad.ptr(y);
-        uchar* qanglePtr = (uchar*)qangle.ptr(y);
-
-        for( x = 0; x < width; x++ )
-        {
-            dbuf[x] = (float)(currPtr[xmap[x+1]] - currPtr[xmap[x-1]]);
-            dbuf[width + x] = (float)(nextPtr[xmap[x]] - prevPtr[xmap[x]]);
-        }
-        cartToPolar( Dx, Dy, Mag, Angle, false );
-        for( x = 0; x < width; x++ )
-        {
-            float mag = dbuf[x+width*2];
-            float angle = dbuf[x+width*3];
-            angle = angle*angleScale - 0.5f;
-            int bidx = cvFloor(angle);
-            angle -= bidx;
-            if( bidx < 0 )
-                bidx += nbins;
-            else if( bidx >= nbins )
-                bidx -= nbins;
-
-            qanglePtr[x] = (uchar)bidx;
-            gradPtr[x] = mag;
-        }
-    }
-    integral(grad, norm, grad.depth());
-
-    float* histBuf;
-    const float* magBuf;
-    const uchar* binsBuf;
-
-    int binsStep = (int)( qangle.step / sizeof(uchar) );
-    int histStep = (int)( histogram[0].step / sizeof(float) );
-    int magStep = (int)( grad.step / sizeof(float) );
-    for( binIdx = 0; binIdx < nbins; binIdx++ )
-    {
-        histBuf = (float*)histogram[binIdx].data;
-        magBuf = (const float*)grad.data;
-        binsBuf = (const uchar*)qangle.data;
-
-        memset( histBuf, 0, histSize.width * sizeof(histBuf[0]) );
-        histBuf += histStep + 1;
-        for( y = 0; y < qangle.rows; y++ )
-        {
-            histBuf[-1] = 0.f;
-            float strSum = 0.f;
-            for( x = 0; x < qangle.cols; x++ )
-            {
-                if( binsBuf[x] == binIdx )
-                    strSum += magBuf[x];
-                histBuf[x] = histBuf[-histStep + x] + strSum;
-            }
-            histBuf += histStep;
-            binsBuf += binsStep;
-            magBuf += magStep;
-        }
-    }
-}
 
 Ptr<FeatureEvaluator> FeatureEvaluator::create( int featureType )
 {
     return featureType == HAAR ? Ptr<FeatureEvaluator>(new HaarEvaluator) :
         featureType == LBP ? Ptr<FeatureEvaluator>(new LBPEvaluator) :
-        featureType == HOG ? Ptr<FeatureEvaluator>(new HOGEvaluator) :
         Ptr<FeatureEvaluator>();
 }
 
@@ -981,24 +884,21 @@ void CascadeClassifierImpl::read(const FileNode& node)
     read_(node);
 }
 
-int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, double& weight )
+int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, int scaleIdx, double& weight )
 {
-    CV_Assert( !oldCascade );
-
-    assert( data.featureType == FeatureEvaluator::HAAR ||
+    assert( !oldCascade &&
+           (data.featureType == FeatureEvaluator::HAAR ||
             data.featureType == FeatureEvaluator::LBP ||
-            data.featureType == FeatureEvaluator::HOG );
+            data.featureType == FeatureEvaluator::HOG) );
 
-    if( !evaluator->setWindow(pt) )
+    if( !evaluator->setWindow(pt, scaleIdx) )
         return -1;
-    if( data.isStumpBased() )
+    if( data.maxNodesPerTree == 1 )
     {
         if( data.featureType == FeatureEvaluator::HAAR )
             return predictOrderedStump<HaarEvaluator>( *this, evaluator, weight );
         else if( data.featureType == FeatureEvaluator::LBP )
             return predictCategoricalStump<LBPEvaluator>( *this, evaluator, weight );
-        else if( data.featureType == FeatureEvaluator::HOG )
-            return predictOrderedStump<HOGEvaluator>( *this, evaluator, weight );
         else
             return -2;
     }
@@ -1008,8 +908,6 @@ int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, do
             return predictOrdered<HaarEvaluator>( *this, evaluator, weight );
         else if( data.featureType == FeatureEvaluator::LBP )
             return predictCategorical<LBPEvaluator>( *this, evaluator, weight );
-        else if( data.featureType == FeatureEvaluator::HOG )
-            return predictOrdered<HOGEvaluator>( *this, evaluator, weight );
         else
             return -2;
     }
@@ -1036,14 +934,17 @@ Ptr<BaseCascadeClassifier::MaskGenerator> createFaceDetectionMaskGenerator()
 class CascadeClassifierInvoker : public ParallelLoopBody
 {
 public:
-    CascadeClassifierInvoker( CascadeClassifierImpl& _cc, Size _sz1, int _stripSize, int _yStep, double _factor,
-        std::vector<Rect>& _vec, std::vector<int>& _levels, std::vector<double>& _weights, bool outputLevels, const Mat& _mask, Mutex* _mtx)
+    CascadeClassifierInvoker( CascadeClassifierImpl& _cc, int _nscales, int _nstripes,
+                              const FeatureEvaluator::ScaleData* _scaleData,
+                              const int* _stripeSizes, std::vector<Rect>& _vec,
+                              std::vector<int>& _levels, std::vector<double>& _weights,
+                              bool outputLevels, const Mat& _mask, Mutex* _mtx)
     {
         classifier = &_cc;
-        processingRectSize = _sz1;
-        stripSize = _stripSize;
-        yStep = _yStep;
-        scalingFactor = _factor;
+        nscales = _nscales;
+        nstripes = _nstripes;
+        scaleData = _scaleData;
+        stripeSizes = _stripeSizes;
         rectangles = &_vec;
         rejectLevels = outputLevels ? &_levels : 0;
         levelWeights = outputLevels ? &_weights : 0;
@@ -1054,201 +955,203 @@ public:
     void operator()(const Range& range) const
     {
         Ptr<FeatureEvaluator> evaluator = classifier->featureEvaluator->clone();
+        double gypWeight = 0.;
+        Size origWinSize = classifier->data.origWinSize;
 
-        Size winSize(cvRound(classifier->data.origWinSize.width * scalingFactor),
-                     cvRound(classifier->data.origWinSize.height * scalingFactor));
-
-        int y1 = range.start * stripSize;
-        int y2 = std::min(range.end * stripSize, processingRectSize.height);
-        for( int y = y1; y < y2; y += yStep )
+        for( int scaleIdx = 0; scaleIdx < nscales; scaleIdx++ )
         {
-            for( int x = 0; x < processingRectSize.width; x += yStep )
+            const FeatureEvaluator::ScaleData& s = scaleData[scaleIdx];
+            float scalingFactor = s.scale;
+            int yStep = s.ystep;
+            int stripeSize = stripeSizes[scaleIdx];
+            int y0 = range.start*stripeSize;
+            Size szw = s.getWorkingSize(origWinSize);
+            int y1 = std::min(range.end*stripeSize, szw.height);
+            Size winSize(cvRound(origWinSize.width * scalingFactor),
+                         cvRound(origWinSize.height * scalingFactor));
+
+            for( int y = y0; y < y1; y += yStep )
             {
-                if ( (!mask.empty()) && (mask.at<uchar>(Point(x,y))==0)) {
-                    continue;
-                }
-
-                double gypWeight;
-                int result = classifier->runAt(evaluator, Point(x, y), gypWeight);
-
-#if defined (LOG_CASCADE_STATISTIC)
-
-                logger.setPoint(Point(x, y), result);
-#endif
-                if( rejectLevels )
+                for( int x = 0; x < szw.width; x += yStep )
                 {
-                    if( result == 1 )
-                        result =  -(int)classifier->data.stages.size();
-                    if( classifier->data.stages.size() + result == 0 )
+                    int result = classifier->runAt(evaluator, Point(x, y), scaleIdx, gypWeight);
+                    if( rejectLevels )
+                    {
+                        if( result == 1 )
+                            result = -(int)classifier->data.stages.size();
+                        if( classifier->data.stages.size() + result == 0 )
+                        {
+                            mtx->lock();
+                            rectangles->push_back(Rect(cvRound(x*scalingFactor),
+                                                       cvRound(y*scalingFactor),
+                                                       winSize.width, winSize.height));
+                            rejectLevels->push_back(-result);
+                            levelWeights->push_back(gypWeight);
+                            mtx->unlock();
+                        }
+                    }
+                    else if( result > 0 )
                     {
                         mtx->lock();
-                        rectangles->push_back(Rect(cvRound(x*scalingFactor), cvRound(y*scalingFactor), winSize.width, winSize.height));
-                        rejectLevels->push_back(-result);
-                        levelWeights->push_back(gypWeight);
+                        rectangles->push_back(Rect(cvRound(x*scalingFactor),
+                                                   cvRound(y*scalingFactor),
+                                                   winSize.width, winSize.height));
                         mtx->unlock();
                     }
+                    if( result == 0 )
+                        x += yStep;
                 }
-                else if( result > 0 )
-                {
-                    mtx->lock();
-                    rectangles->push_back(Rect(cvRound(x*scalingFactor), cvRound(y*scalingFactor),
-                                               winSize.width, winSize.height));
-                    mtx->unlock();
-                }
-                if( result == 0 )
-                    x += yStep;
             }
         }
     }
 
     CascadeClassifierImpl* classifier;
     std::vector<Rect>* rectangles;
-    Size processingRectSize;
-    int stripSize, yStep;
-    double scalingFactor;
+    int nscales, nstripes;
+    const FeatureEvaluator::ScaleData* scaleData;
+    const int* stripeSizes;
     std::vector<int> *rejectLevels;
     std::vector<double> *levelWeights;
+    std::vector<float> scales;
     Mat mask;
     Mutex* mtx;
 };
 
+
 struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } };
 struct getNeighbors { int operator ()(const CvAvgComp& e) const { return e.neighbors; } };
 
 
-bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processingRectSize,
-                                           int yStep, double factor, std::vector<Rect>& candidates,
-                                           std::vector<int>& levels, std::vector<double>& weights,
-                                           Size sumSize0, bool outputRejectLevels )
-{
-    if( !featureEvaluator->setImage(_image, data.origWinSize, sumSize0) )
-        return false;
-
-#if defined (LOG_CASCADE_STATISTIC)
-    logger.setImage(image);
-#endif
-
-    Mat currentMask;
-    if (maskGenerator) {
-        Mat image = _image.getMat();
-        currentMask=maskGenerator->generateMask(image);
-    }
-
-    std::vector<Rect> candidatesVector;
-    std::vector<int> rejectLevels;
-    std::vector<double> levelWeights;
-
-    int stripCount, stripSize;
-
-    const int PTS_PER_THREAD = 1000;
-    stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
-    stripCount = std::min(std::max(stripCount, 1), 100);
-    stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-
-    if( outputRejectLevels )
-    {
-        parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
-            candidatesVector, rejectLevels, levelWeights, true, currentMask, &mtx));
-        levels.insert( levels.end(), rejectLevels.begin(), rejectLevels.end() );
-        weights.insert( weights.end(), levelWeights.begin(), levelWeights.end() );
-    }
-    else
-    {
-         parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
-            candidatesVector, rejectLevels, levelWeights, false, currentMask, &mtx));
-    }
-    candidates.insert( candidates.end(), candidatesVector.begin(), candidatesVector.end() );
-
-#if defined (LOG_CASCADE_STATISTIC)
-    logger.write();
-#endif
-
-    return true;
-}
-
-
-bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
-                                                   int yStep, double factor, Size sumSize0 )
+bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
+                                                            std::vector<Rect>& candidates )
 {
     int featureType = getFeatureType();
     std::vector<UMat> bufs;
-    size_t globalsize[] = { processingRectSize.width/yStep, processingRectSize.height/yStep };
+    featureEvaluator->getUMats(bufs);
+    Size localsz = featureEvaluator->getLocalSize();
+    if( localsz.area() == 0 ) // evaluator reports an empty local size: give up on the OpenCL path
+        return false;
+    Size lbufSize = featureEvaluator->getLocalBufSize();
+    size_t localsize[] = { (size_t)localsz.width, (size_t)localsz.height }; // explicit casts: int -> size_t inside braces is a narrowing conversion
+    const int grp_per_CU = 12;
+    size_t globalsize[] = { grp_per_CU*ocl::Device::getDefault().maxComputeUnits()*localsize[0], localsize[1] };
     bool ok = false;
 
+    ufacepos.create(1, MAX_FACES*3+1, CV_32S);
+    UMat ufacepos_count(ufacepos, Rect(0, 0, 1, 1));
+    ufacepos_count.setTo(Scalar::all(0));
+
     if( ustages.empty() )
     {
         copyVectorToUMat(data.stages, ustages);
-        copyVectorToUMat(data.stumps, ustumps);
+        if (!data.stumps.empty())
+            copyVectorToUMat(data.stumps, unodes);
+        else
+            copyVectorToUMat(data.nodes, unodes);
+        copyVectorToUMat(data.leaves, uleaves);
         if( !data.subsets.empty() )
             copyVectorToUMat(data.subsets, usubsets);
     }
 
+    int nstages = (int)data.stages.size();
+
     if( featureType == FeatureEvaluator::HAAR )
     {
         Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
         if( haar.empty() )
             return false;
 
-        haar->setImage(_image, data.origWinSize, sumSize0);
         if( haarKernel.empty() )
         {
-            haarKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            String opts;
+            if (lbufSize.area())
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
+            else
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
+                              localsz.width, localsz.height, data.maxNodesPerTree);
+            haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
             if( haarKernel.empty() )
                 return false;
         }
 
-        haar->getUMats(bufs);
         Rect normrect = haar->getNormRect();
+        int sqofs = haar->getSquaresOffset();
+        int splitstage_ocl = 1;
 
-        haarKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
-                        ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
+        haarKernel.args((int)scales.size(),
+                        ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
+                        ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sum
                         ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
 
                         // cascade classifier
-                        (int)data.stages.size(),
+                        splitstage_ocl, nstages,
                         ocl::KernelArg::PtrReadOnly(ustages),
-                        ocl::KernelArg::PtrReadOnly(ustumps),
+                        ocl::KernelArg::PtrReadOnly(unodes),
+                        ocl::KernelArg::PtrReadOnly(uleaves),
 
                         ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                        processingRectSize,
-                        yStep, (float)factor,
-                        normrect, data.origWinSize, (int)MAX_FACES);
-        ok = haarKernel.run(2, globalsize, 0, true);
+                        normrect, sqofs, data.origWinSize, (int)MAX_FACES);
+        ok = haarKernel.run(2, globalsize, localsize, true);
     }
     else if( featureType == FeatureEvaluator::LBP )
     {
+        if (data.maxNodesPerTree > 1)
+            return false;
+        
         Ptr<LBPEvaluator> lbp = featureEvaluator.dynamicCast<LBPEvaluator>();
         if( lbp.empty() )
             return false;
 
-        lbp->setImage(_image, data.origWinSize, sumSize0);
         if( lbpKernel.empty() )
         {
-            lbpKernel.create("runLBPClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            String opts;
+            if (lbufSize.area())
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
+            else
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
+            lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
             if( lbpKernel.empty() )
                 return false;
         }
 
-        lbp->getUMats(bufs);
-
+        int splitstage_ocl = 1;
         int subsetSize = (data.ncategories + 31)/32;
-        lbpKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
-                        ocl::KernelArg::PtrReadOnly(bufs[1]), // optfeatures
+        lbpKernel.args((int)scales.size(),
+                       ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
+                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sum
+                       ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
+
+                       // cascade classifier
+                       splitstage_ocl, nstages,
+                       ocl::KernelArg::PtrReadOnly(ustages),
+                       ocl::KernelArg::PtrReadOnly(unodes),
+                       ocl::KernelArg::PtrReadOnly(usubsets),
+                       subsetSize,
+
+                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                       data.origWinSize, (int)MAX_FACES);
+
+        ok = lbpKernel.run(2, globalsize, localsize, true);
+    }
 
-                        // cascade classifier
-                        (int)data.stages.size(),
-                        ocl::KernelArg::PtrReadOnly(ustages),
-                        ocl::KernelArg::PtrReadOnly(ustumps),
-                        ocl::KernelArg::PtrReadOnly(usubsets),
-                        subsetSize,
+    if( ok )
+    {
+        Mat facepos = ufacepos.getMat(ACCESS_READ);
+        const int* fptr = facepos.ptr<int>();
+        int nfaces = fptr[0];
+        nfaces = std::min(nfaces, (int)MAX_FACES);
 
-                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                        processingRectSize,
-                        yStep, (float)factor,
-                        data.origWinSize, (int)MAX_FACES);
-        ok = lbpKernel.run(2, globalsize, 0, true);
+        for( int i = 0; i < nfaces; i++ )
+        {
+            const FeatureEvaluator::ScaleData& s = featureEvaluator->getScaleData(fptr[i*3 + 1]);
+            candidates.push_back(Rect(cvRound(fptr[i*3 + 2]*s.scale),
+                                      cvRound(fptr[i*3 + 3]*s.scale),
+                                      cvRound(data.origWinSize.width*s.scale),
+                                      cvRound(data.origWinSize.height*s.scale)));
+        }
     }
-    //CV_Assert(ok);
     return ok;
 }
 
@@ -1296,11 +1199,11 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
                                                     double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                                     bool outputRejectLevels )
 {
-    int featureType = getFeatureType();
     Size imgsz = _image.size();
-    int imgtype = _image.type();
-
-    Mat grayImage, imageBuffer;
+    
+    Mat grayImage;
+    UMat ugrayImage;
+    _InputArray gray;
 
     candidates.clear();
     rejectLevels.clear();
@@ -1309,120 +1212,86 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
     if( maxObjectSize.height == 0 || maxObjectSize.width == 0 )
         maxObjectSize = imgsz;
 
-    bool use_ocl = ocl::useOpenCL() &&
-        (featureType == FeatureEvaluator::HAAR ||
-         featureType == FeatureEvaluator::LBP) &&
-        ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
-        !isOldFormatCascade() &&
-        data.isStumpBased() &&
-        maskGenerator.empty() &&
-        !outputRejectLevels &&
-        tryOpenCL;
-
-    if( !use_ocl )
-    {
-        Mat image = _image.getMat();
-        if (maskGenerator)
-            maskGenerator->initializeMask(image);
-
-        grayImage = image;
-        if( CV_MAT_CN(imgtype) > 1 )
-        {
-            Mat temp;
-            cvtColor(grayImage, temp, COLOR_BGR2GRAY);
-            grayImage = temp;
-        }
+    bool use_ocl = tryOpenCL && ocl::useOpenCL() &&
+         featureEvaluator->getLocalSize().area() > 0 &&
+         ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
+         (data.minNodesPerTree == data.maxNodesPerTree) &&
+         !isOldFormatCascade() &&
+         maskGenerator.empty() &&
+         !outputRejectLevels;
 
-        imageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
-    }
-    else
+    /*if( use_ocl )
     {
-        UMat uimage = _image.getUMat();
-        if( CV_MAT_CN(imgtype) > 1 )
-            cvtColor(uimage, ugrayImage, COLOR_BGR2GRAY);
+        if (_image.channels() > 1)
+            cvtColor(_image, ugrayImage, COLOR_BGR2GRAY);
+        else if (_image.isUMat())
+            ugrayImage = _image.getUMat();
         else
-            uimage.copyTo(ugrayImage);
-        uimageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
+            _image.copyTo(ugrayImage);
+        gray = ugrayImage;
     }
-
-    Size sumSize0((imgsz.width + SUM_ALIGN) & -SUM_ALIGN, imgsz.height+1);
-
-    if( use_ocl )
+    else*/
     {
-        ufacepos.create(1, MAX_FACES*4 + 1, CV_32S);
-        UMat ufacecount(ufacepos, Rect(0,0,1,1));
-        ufacecount.setTo(Scalar::all(0));
+        if (_image.channels() > 1)
+            cvtColor(_image, grayImage, COLOR_BGR2GRAY);
+        else if (_image.isMat())
+            grayImage = _image.getMat();
+        else
+            _image.copyTo(grayImage);
+        gray = grayImage;
     }
 
+    std::vector<float> scales;
+    scales.reserve(1024);
+
     for( double factor = 1; ; factor *= scaleFactor )
     {
         Size originalWindowSize = getOriginalWindowSize();
 
         Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
-        Size scaledImageSize( cvRound( imgsz.width/factor ), cvRound( imgsz.height/factor ) );
-        Size processingRectSize( scaledImageSize.width - originalWindowSize.width,
-                                 scaledImageSize.height - originalWindowSize.height );
-
-        if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ||
+            windowSize.width > imgsz.width || windowSize.height > imgsz.height )
             break;
         if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
             continue;
+        scales.push_back((float)factor);
+    }
 
-        int yStep;
-        if( getFeatureType() == cv::FeatureEvaluator::HOG )
-        {
-            yStep = 4;
-        }
-        else
-        {
-            yStep = factor > 2. ? 1 : 2;
-        }
-
-        if( use_ocl )
-        {
-            UMat uscaledImage(uimageBuffer, Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-            resize( ugrayImage, uscaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
-
-            if( ocl_detectSingleScale( uscaledImage, processingRectSize, yStep, factor, sumSize0 ) )
-                continue;
-
-            /////// if the OpenCL branch has been executed but failed, fall back to CPU: /////
-
-            tryOpenCL = false; // for this cascade do not try OpenCL anymore
-
-            // since we may already have some partial results from OpenCL code (unlikely, but still),
-            // we just recursively call the function again, but with tryOpenCL==false it will
-            // go with CPU route, so there is no infinite recursion
-            detectMultiScaleNoGrouping( _image, candidates, rejectLevels, levelWeights,
-                                       scaleFactor, minObjectSize, maxObjectSize,
-                                       outputRejectLevels);
-            return;
-        }
-        else
-        {
-            Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
-            resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
+    if( !featureEvaluator->setImage(gray, scales) )
+        return;
 
-            if( !detectSingleScale( scaledImage, processingRectSize, yStep, factor, candidates,
-                                    rejectLevels, levelWeights, sumSize0, outputRejectLevels ) )
-                break;
-        }
-    }
+    // OpenCL code; on success the candidates have already been filled in
+    if( use_ocl && ocl_detectMultiScaleNoGrouping( scales, candidates ))
+        return;
+    if( use_ocl ) tryOpenCL = false; // disable OpenCL only after an attempted run failed, not when it was never tried
 
-    if( use_ocl && tryOpenCL )
+    // CPU code
+    featureEvaluator->getMats();
     {
-        Mat facepos = ufacepos.getMat(ACCESS_READ);
-        const int* fptr = facepos.ptr<int>();
-        int i, nfaces = fptr[0];
-        for( i = 0; i < nfaces; i++ )
+        Mat currentMask;
+        if (maskGenerator)
+            currentMask = maskGenerator->generateMask(gray.getMat());
+
+        size_t i, nscales = scales.size();
+        cv::AutoBuffer<int> stripeSizeBuf(nscales);
+        int* stripeSizes = stripeSizeBuf;
+        const FeatureEvaluator::ScaleData* s = &featureEvaluator->getScaleData(0);
+        Size szw = s->getWorkingSize(data.origWinSize);
+        int nstripes = cvCeil(szw.width/32.);
+        for( i = 0; i < nscales; i++ )
         {
-            candidates.push_back(Rect(fptr[i*4+1], fptr[i*4+2], fptr[i*4+3], fptr[i*4+4]));
+            szw = s[i].getWorkingSize(data.origWinSize);
+            stripeSizes[i] = std::max((szw.height/s[i].ystep + nstripes-1)/nstripes, 1)*s[i].ystep;
         }
+
+        CascadeClassifierInvoker invoker(*this, (int)nscales, nstripes, s, stripeSizes,
+                                         candidates, rejectLevels, levelWeights,
+                                         outputRejectLevels, currentMask, &mtx);
+        parallel_for_(Range(0, nstripes), invoker);
     }
 }
 
+
 void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
                                           std::vector<int>& rejectLevels,
                                           std::vector<double>& levelWeights,
@@ -1462,10 +1331,9 @@ void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rec
                                           double scaleFactor, int minNeighbors,
                                           int flags, Size minObjectSize, Size maxObjectSize)
 {
-    Mat image = _image.getMat();
     std::vector<int> fakeLevels;
     std::vector<double> fakeWeights;
-    detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor,
+    detectMultiScale( _image, objects, fakeLevels, fakeWeights, scaleFactor,
         minNeighbors, flags, minObjectSize, maxObjectSize );
 }
 
@@ -1550,6 +1418,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
     stumps.clear();
 
     FileNodeIterator it = fn.begin(), it_end = fn.end();
+    minNodesPerTree = INT_MAX;
     maxNodesPerTree = 0;
 
     for( int si = 0; it != it_end; si++, ++it )
@@ -1576,6 +1445,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
 
             DTree tree;
             tree.nodeCount = (int)internalNodes.size()/nodeStep;
+            minNodesPerTree = std::min(minNodesPerTree, tree.nodeCount);
             maxNodesPerTree = std::max(maxNodesPerTree, tree.nodeCount);
 
             classifiers.push_back(tree);
@@ -1613,7 +1483,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
         }
     }
 
-    if( isStumpBased() )
+    if( maxNodesPerTree == 1 )
     {
         int nodeOfs = 0, leafOfs = 0;
         size_t nstages = stages.size();
@@ -1641,7 +1511,8 @@ bool CascadeClassifierImpl::read_(const FileNode& root)
     haarKernel = ocl::Kernel();
     lbpKernel = ocl::Kernel();
     ustages.release();
-    ustumps.release();
+    unodes.release();
+    uleaves.release();
     if( !data.read(root) )
         return false;
 
@@ -1651,7 +1522,7 @@ bool CascadeClassifierImpl::read_(const FileNode& root)
     if( fn.empty() )
         return false;
 
-    return featureEvaluator->read(fn);
+    return featureEvaluator->read(fn, data.origWinSize);
 }
 
 template<> void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const
diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp
index 2d3f906..b568c90 100644
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -3,6 +3,72 @@
 namespace cv
 {
 
+class FeatureEvaluator // base class of the cascade feature evaluators (e.g. LBPEvaluator derives from it)
+{
+public:
+    enum // feature type identifiers, the values returned by getFeatureType()
+    {
+        HAAR = 0,
+        LBP  = 1,
+        HOG  = 2
+    };
+
+    struct ScaleData // per-scale record; the OpenCL kernels declare a struct with the same five fields
+    {
+        ScaleData() { scale = 0.f; layer_ofs = ystep = 0; }
+        Size getWorkingSize(Size winSize) const // szi minus winSize, each dimension clamped at 0
+        {
+            return Size(std::max(szi.width - winSize.width, 0),
+                        std::max(szi.height - winSize.height, 0));
+        }
+
+        float scale;
+        Size szi; // presumably the image size at this scale -- TODO confirm against updateScaleData()
+        int layer_ofs, ystep; // layer_ofs: offset added to the sum pointer for this scale (see the OpenCL kernel); ystep: window step
+    };
+
+    virtual ~FeatureEvaluator();
+    
+    virtual bool read(const FileNode& node, Size origWinSize);
+    virtual Ptr<FeatureEvaluator> clone() const;
+    virtual int getFeatureType() const;
+    int getNumChannels() const { return nchannels; }
+
+    virtual bool setImage(InputArray img, const std::vector<float>& scales); // callers treat a false result as "nothing to detect"
+    virtual bool setWindow(Point p, int scaleIdx); // presumably positions the evaluator at p within scale scaleIdx -- TODO confirm
+    const ScaleData& getScaleData(int scaleIdx) const // asserts that scaleIdx is in range, then returns that record
+    {
+        CV_Assert( 0 <= scaleIdx && scaleIdx < (int)scaleData->size());
+        return scaleData->at(scaleIdx);
+    }
+    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual void getMats();
+
+    Size getLocalSize() const { return localSize; }
+    Size getLocalBufSize() const { return lbufSize; }
+
+    virtual float calcOrd(int featureIdx) const; // ordered (float) feature value; HaarEvaluator overrides it
+    virtual int calcCat(int featureIdx) const; // categorical (int) feature value; LBPEvaluator overrides it
+
+    static Ptr<FeatureEvaluator> create(int type);
+
+protected:
+    enum { SBUF_VALID=1, USBUF_VALID=2 }; // bit values for sbufFlag -- presumably track which of sbuf/usbuf is current; TODO confirm
+    int sbufFlag;
+
+    bool updateScaleData( Size imgsz, const std::vector<float>& _scales );
+    virtual void computeChannels( int, InputArray ) {} // no-op here; HaarEvaluator and LBPEvaluator override it
+    virtual void computeOptFeatures() {} // no-op here; HaarEvaluator and LBPEvaluator override it
+
+    Size origWinSize, sbufSize, localSize, lbufSize;
+    int nchannels;
+    Mat sbuf, rbuf;
+    UMat urbuf, usbuf, ufbuf, uscaleData;
+
+    Ptr<std::vector<ScaleData> > scaleData; // one entry per scale, accessed through getScaleData()
+};
+
+
 class CascadeClassifierImpl : public BaseCascadeClassifier
 {
 public:
@@ -54,9 +120,8 @@ protected:
                             int yStep, double factor, std::vector<Rect>& candidates,
                             std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                             Size sumSize0, bool outputRejectLevels = false );
-    bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
-                                int yStep, double factor, Size sumSize0 );
-
+    bool ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
+                                         std::vector<Rect>& candidates );
 
     void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
                                     std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
@@ -72,6 +137,7 @@ protected:
     };
 
     friend class CascadeClassifierInvoker;
+    friend class SparseCascadeClassifierInvoker;
 
     template<class FEval>
     friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
@@ -85,7 +151,7 @@ protected:
     template<class FEval>
     friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
 
-    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight );
+    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, int scaleIdx, double& weight );
 
     class Data
     {
@@ -126,12 +192,10 @@ protected:
 
         bool read(const FileNode &node);
 
-        bool isStumpBased() const { return maxNodesPerTree == 1; }
-
         int stageType;
         int featureType;
         int ncategories;
-        int maxNodesPerTree;
+        int minNodesPerTree, maxNodesPerTree;
         Size origWinSize;
 
         std::vector<Stage> stages;
@@ -148,7 +212,7 @@ protected:
 
     Ptr<MaskGenerator> maskGenerator;
     UMat ugrayImage, uimageBuffer;
-    UMat ufacepos, ustages, ustumps, usubsets;
+    UMat ufacepos, ustages, unodes, uleaves, usubsets;
     ocl::Kernel haarKernel, lbpKernel;
     bool tryOpenCL;
 
@@ -268,7 +332,6 @@ public:
 
         enum { RECT_NUM = Feature::RECT_NUM };
         float calc( const int* pwin ) const;
-
         void setOffsets( const Feature& _f, int step, int tofs );
 
         int ofs[RECT_NUM][4];
@@ -278,35 +341,34 @@ public:
     HaarEvaluator();
     virtual ~HaarEvaluator();
 
-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize);
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }
 
-    virtual bool setImage(InputArray, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point pt);
-    virtual Rect getNormRect() const;
-    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual bool setWindow(Point p, int scaleIdx);
+    Rect getNormRect() const;
+    int getSquaresOffset() const;
 
-    double operator()(int featureIdx) const
+    float operator()(int featureIdx) const
     { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
-    virtual double calcOrd(int featureIdx) const
+    virtual float calcOrd(int featureIdx) const
     { return (*this)(featureIdx); }
 
 protected:
-    Size origWinSize, sumSize0;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
+
     Ptr<std::vector<Feature> > features;
     Ptr<std::vector<OptFeature> > optfeatures;
-    OptFeature* optfeaturesPtr; // optimization
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
     bool hasTiltedFeatures;
 
-    Mat sum0, sum, sqsum0, sqsum;
-    UMat usum0, usum, usqsum0, usqsum, ufbuf;
-
+    int tofs, sqofs;
+    Vec4i nofs;
     Rect normrect;
-    int nofs[4];
-
     const int* pwin;
-    double varianceNormFactor;
+    OptFeature* optfeaturesPtr; // optimization
+    float varianceNormFactor;
 };
 
 inline HaarEvaluator::Feature :: Feature()
@@ -336,28 +398,6 @@ inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
     return ret;
 }
 
-inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
-{
-    weight[0] = _f.rect[0].weight;
-    weight[1] = _f.rect[1].weight;
-    weight[2] = _f.rect[2].weight;
-
-    Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
-    if (_f.tilted)
-    {
-        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
-        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
-        CV_TILTED_PTRS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
-    }
-    else
-    {
-        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
-        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
-        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
-    }
-}
-
-
 //----------------------------------------------  LBPEvaluator -------------------------------------
 
 class LBPEvaluator : public FeatureEvaluator
@@ -367,7 +407,7 @@ public:
     {
         Feature();
         Feature( int x, int y, int _block_w, int _block_h  ) :
-            rect(x, y, _block_w, _block_h) {}
+                 rect(x, y, _block_w, _block_h) {}
 
         bool read(const FileNode& node );
 
@@ -386,27 +426,25 @@ public:
     LBPEvaluator();
     virtual ~LBPEvaluator();
 
-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize );
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::LBP; }
 
-    virtual bool setImage(InputArray image, Size _origWinSize, Size);
-    virtual bool setWindow(Point pt);
-    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual bool setWindow(Point p, int scaleIdx);
 
     int operator()(int featureIdx) const
     { return optfeaturesPtr[featureIdx].calc(pwin); }
     virtual int calcCat(int featureIdx) const
     { return (*this)(featureIdx); }
 protected:
-    Size origWinSize, sumSize0;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
+
     Ptr<std::vector<Feature> > features;
     Ptr<std::vector<OptFeature> > optfeatures;
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
     OptFeature* optfeaturesPtr; // optimization
 
-    Mat sum0, sum;
-    UMat usum0, usum, ufbuf;
-
     const int* pwin;
 };
 
@@ -436,98 +474,6 @@ inline int LBPEvaluator::OptFeature :: calc( const int* p ) const
            (CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0);
 }
 
-inline void LBPEvaluator::OptFeature :: setOffsets( const Feature& _f, int step )
-{
-    Rect tr = _f.rect;
-    CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
-    tr.x += 2*_f.rect.width;
-    CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
-    tr.y += 2*_f.rect.height;
-    CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
-    tr.x -= 2*_f.rect.width;
-    CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
-}
-
-//---------------------------------------------- HOGEvaluator -------------------------------------------
-
-class HOGEvaluator : public FeatureEvaluator
-{
-public:
-    struct Feature
-    {
-        Feature();
-        float calc( int offset ) const;
-        void updatePtrs( const std::vector<Mat>& _hist, const Mat &_normSum );
-        bool read( const FileNode& node );
-
-        enum { CELL_NUM = 4, BIN_NUM = 9 };
-
-        Rect rect[CELL_NUM];
-        int featComponent; //component index from 0 to 35
-        const float* pF[4]; //for feature calculation
-        const float* pN[4]; //for normalization calculation
-    };
-    HOGEvaluator();
-    virtual ~HOGEvaluator();
-    virtual bool read( const FileNode& node );
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
-    virtual bool setImage( InputArray image, Size winSize, Size );
-    virtual bool setWindow( Point pt );
-    double operator()(int featureIdx) const
-    {
-        return featuresPtr[featureIdx].calc(offset);
-    }
-    virtual double calcOrd( int featureIdx ) const
-    {
-        return (*this)(featureIdx);
-    }
-
-private:
-    virtual void integralHistogram( const Mat& srcImage, std::vector<Mat> &histogram, Mat &norm, int nbins ) const;
-
-    Size origWinSize;
-    Ptr<std::vector<Feature> > features;
-    Feature* featuresPtr;
-    std::vector<Mat> hist;
-    Mat normSum;
-    int offset;
-};
-
-inline HOGEvaluator::Feature :: Feature()
-{
-    rect[0] = rect[1] = rect[2] = rect[3] = Rect();
-    pF[0] = pF[1] = pF[2] = pF[3] = 0;
-    pN[0] = pN[1] = pN[2] = pN[3] = 0;
-    featComponent = 0;
-}
-
-inline float HOGEvaluator::Feature :: calc( int _offset ) const
-{
-    float res = CALC_SUM(pF, _offset);
-    float normFactor = CALC_SUM(pN, _offset);
-    res = (res > 0.001f) ? (res / ( normFactor + 0.001f) ) : 0.f;
-    return res;
-}
-
-inline void HOGEvaluator::Feature :: updatePtrs( const std::vector<Mat> &_hist, const Mat &_normSum )
-{
-    int binIdx = featComponent % BIN_NUM;
-    int cellIdx = featComponent / BIN_NUM;
-    Rect normRect = Rect( rect[0].x, rect[0].y, 2*rect[0].width, 2*rect[0].height );
-
-    const float* featBuf = (const float*)_hist[binIdx].data;
-    size_t featStep = _hist[0].step / sizeof(featBuf[0]);
-
-    const float* normBuf = (const float*)_normSum.data;
-    size_t normStep = _normSum.step / sizeof(normBuf[0]);
-
-    CV_SUM_PTRS( pF[0], pF[1], pF[2], pF[3], featBuf, rect[cellIdx], featStep );
-    CV_SUM_PTRS( pN[0], pN[1], pN[2], pN[3], normBuf, normRect, normStep );
-}
-
-
-
 
 //----------------------------------------------  predictor functions -------------------------------------
 
@@ -662,11 +608,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
     const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
     const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
 
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    float tmp = 0; // float accumulator -- float operations are quicker
-#else
-    double tmp = 0;
-#endif
+    float tmp = 0;
     for( int si = 0; si < nstages; si++ )
     {
         const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl
index 4a508ca..88b3624 100644
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -1,6 +1,18 @@
 ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
 ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
 
+//
+// the code has been derived from the OpenCL Haar cascade kernel by
+//
+//    Niko Li, newlife20080214@gmail.com
+//    Wang Weiyan, wangweiyanster@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
+//    Erping Pang, erping@multicorewareinc.com
+//
+
+
 typedef struct __attribute__((aligned(4))) OptHaarFeature
 {
     int4 ofs[3] __attribute__((aligned (4)));
@@ -20,6 +32,12 @@ typedef struct __attribute__((aligned(4))) Stump
 }
 Stump;
 
+typedef struct __attribute__((aligned(4))) Node // one internal tree node, used by runHaarClassifier when NODE_COUNT != 1
+{
+    int4 n __attribute__((aligned (4))); // x: feature index; y: threshold as float bits; z / w: next index when value < / >= threshold*nf (index <= 0 selects leaf -index)
+}
+Node;
+
 typedef struct __attribute__((aligned (4))) Stage
 {
     int first __attribute__((aligned (4)));
@@ -28,151 +46,614 @@ typedef struct __attribute__((aligned (4))) Stage
 }
 Stage;
 
-__kernel void runHaarClassifierStump(
+typedef struct __attribute__((aligned (4))) ScaleData // per-scale record; kernels index the array as scaleData[scaleIdx]
+{
+    float scale __attribute__((aligned (4)));
+    int szi_width __attribute__((aligned (4))); // kernels scan max(szi_width - windowsize.x, 0) window columns
+    int szi_height __attribute__((aligned (4))); // kernels scan max(szi_height - windowsize.y, 0) window rows
+    int layer_ofs __attribute__((aligned (4))); // added to the sum pointer to reach this scale's data
+    int ystep __attribute__((aligned (4))); // window step for this scale
+}
+ScaleData;
+
+#ifndef SUM_BUF_SIZE // size in ints of the local-memory copy of a `sum` tile; 0 makes the kernel read global memory directly
+#define SUM_BUF_SIZE 0
+#endif
+
+#ifndef NODE_COUNT // node slots per tree; 1 selects the Stump code path in runHaarClassifier
+#define NODE_COUNT 1
+#endif
+
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runHaarClassifier( // runs the Haar cascade over every scale; detections are appended to facepos as (scaleIdx, x, y)
+    int nscales, __global const ScaleData* scaleData, // scaleData[0..nscales-1], one record per scale
     __global const int* sum,
-    int sumstep, int sumoffset,
-    __global const int* sqsum,
-    int sqsumstep, int sqsumoffset,
+    int _sumstep, int sumoffset, // _sumstep is in bytes (converted to ints below); sumoffset is not used by this kernel
     __global const OptHaarFeature* optfeatures,
 
-    int nstages,
+    int splitstage, int nstages, // stages [0,splitstage): one work-item per window; [splitstage,nstages): work shared by the group
     __global const Stage* stages,
-    __global const Stump* stumps,
+    __global const Node* nodes, // tree nodes, NODE_COUNT per tree; reinterpreted as Stump[] when NODE_COUNT==1
+    __global const float* leaves0, // leaf values, NODE_COUNT+1 per tree; only read when NODE_COUNT!=1
 
     volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int4 normrect, int sqofs, int2 windowsize, int maxFaces) // sqofs: offset in ints from a sum element to the float squared-sum data
 {
-    int ix = get_global_id(0)*xyscale;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    int sumstep = (int)(_sumstep/sizeof(int));
+    int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x), // corner offsets of normrect in the global sum layout
+                        mad24(normrect.y, sumstep, normrect.x + normrect.z),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
+    int normarea = normrect.z * normrect.w;
+    float invarea = 1.f/normarea;
+    int lidx = ly*LOCAL_SIZE_X + lx; // linear index of this work-item inside its work-group
 
-    if( ix < imgsize.x && iy < imgsize.y )
+    #if SUM_BUF_SIZE > 0
+    int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x), // same corners, in the local tile layout
+                       mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
+    #else
+    int4 nofs = nofs0;
+    #endif
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lnf = (__local float*)(lcount + 1);
+    __local float* lpartsum = lnf + LOCAL_SIZE;
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+    // lstore layout: [ibuf: SUM_BUF_SIZE ints][lcount: 1 int][lnf: LOCAL_SIZE floats][lpartsum: LOCAL_SIZE floats][lbuf: LOCAL_SIZE shorts]
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
     {
-        int stageIdx;
-        __global const Stump* stump = stumps;
-
-        __global const int* psum = sum + mad24(iy, sumstep, ix);
-        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
-        int normarea = normrect.z * normrect.w;
-        float invarea = 1.f/normarea;
-        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
-                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
-        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
-        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        nf = nf > 0 ? nf : 1.f;
-
-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
+        // tiles are dealt round-robin: work-group g processes tiles g, g+ngroups, g+2*ngroups, ...
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
         {
-            int i, ntrees = stages[stageIdx].ntrees;
-            float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++ )
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+            __global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
+
+            if( ix0 >= worksize.x || iy0 >= worksize.y ) // depends only on the tile, so the whole work-group skips together
+                continue;
+            #if SUM_BUF_SIZE > 0
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
             {
-                float4 st = stump->st;
-                __global const OptHaarFeature* f = optfeatures + as_int(st.x);
-                float4 weight = f->weight;
-
-                int4 ofs = f->ofs[0];
-                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
-                ofs = f->ofs[1];
-                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
-                if( weight.z > 0 )
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            #endif
+            barrier(CLK_LOCAL_MEM_FENCE); // unconditional: publishes ibuf and orders the previous tile's lcount read before the reset below
+
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            // phase 1: each work-item runs stages [0, splitstage) for its own window position
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                #if NODE_COUNT==1
+                __global const Stump* stump = (__global const Stump*)nodes;
+                #else
+                __global const Node* node = nodes;
+                __global const float* leaves = leaves0;
+                #endif
+                #if SUM_BUF_SIZE > 0
+                __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* psum = psum1;
+                #endif
+
+                __global const float* psqsum = (__global const float*)(psum1 + sqofs);
+                float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
+                float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
+                float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f)); // normalization factor applied to every node threshold
+                nf = nf > 0 ? nf : 1.f;
+
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
                 {
-                    ofs = f->ofs[2];
-                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    #if NODE_COUNT==1
+                    for( i = 0; i < ntrees; i++ )
+                    {
+                        float4 st = stump[i].st;
+                        __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                        float4 weight = f->weight;
+
+                        int4 ofs = f->ofs[0];
+                        sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                        ofs = f->ofs[1];
+                        sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                        if( weight.z > 0 )
+                        {
+                            ofs = f->ofs[2];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                        }
+
+                        s += (sval < st.y*nf) ? st.z : st.w;
+                    }
+                    stump += ntrees;
+                    #else
+                    for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
+                    {
+                        int idx = 0;
+                        do
+                        {
+                            int4 n = node[idx].n;
+                            __global const OptHaarFeature* f = optfeatures + n.x;
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+
+                            sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            if( weight.z > 0 )
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                        }
+                        while(idx > 0);
+                        s += leaves[-idx]; // idx <= 0 here: its negation is the leaf index within this tree
+                    }
+                    #endif
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
                 }
 
-                s += (sval < st.y*nf) ? st.z : st.w;
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) ) // survivor; when ystep != 1 only even (ix, iy) are kept
+                {
+                    int count = atomic_inc(lcount);
+                    lbuf[count] = (int)(ix | (iy << 8)); // tile-local position: x in the low 8 bits, y above them
+                    lnf[count] = nf;
+                }
             }
 
-            if( s < stages[stageIdx].threshold )
-                break;
-        }
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ ) // phase 2: the group shares the trees of each surviving window
+            {
+                barrier(CLK_LOCAL_MEM_FENCE); // all atomic_inc(lcount) of phase 1 / the previous stage must land before the count is read
 
-        if( stageIdx == nstages )
-        {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
+                int nrects = lcount[0];
+                if( nrects == 0 ) // nrects is now identical for every work-item, so the whole group leaves together
+                    break;
+                barrier(CLK_LOCAL_MEM_FENCE); // every work-item has read the count; only now may it be reset
+                if( lidx == 0 )
+                    lcount[0] = 0;
+                {
+                    #if NODE_COUNT == 1
+                    __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
+                    #else
+                    __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
+                    __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
+                    #endif
+                    int nparts = LOCAL_SIZE / nrects; // work-items cooperating on each surviving window
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts; // trees handled by each of those work-items
+                    int nr = lidx / nparts; // index of the surviving window this work-item helps with
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f, nf = 0.f;
+
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+                        nf = lnf[nr];
+
+                        {
+                        int ntrees0 = ntrees_p*partidx;
+                        int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                        int ix1 = idxval & 255, iy1 = idxval >> 8;
+                        #if SUM_BUF_SIZE > 0
+                        __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                        #else
+                        __global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
+                        #endif
+
+                        #if NODE_COUNT == 1
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            float4 st = stump[i].st;
+                            __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+                            float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            //if( weight.z > 0 ) -- test disabled in this phase; when weight.z == 0 the term below is multiplied by 0
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            partsum += (sval < st.y*nf) ? st.z : st.w;
+                        }
+                        #else
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            int idx = 0;
+                            do
+                            {
+                                int4 n = node[i*NODE_COUNT + idx].n; // NODE_COUNT nodes per tree, same stride as phase 1
+                                __global const OptHaarFeature* f = optfeatures + n.x;
+                                float4 weight = f->weight;
+                                int4 ofs = f->ofs[0];
+
+                                float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                                ofs = f->ofs[1];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                                if( weight.z > 0 )
+                                {
+                                    ofs = f->ofs[2];
+                                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                }
+
+                                idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                            }
+                            while(idx > 0);
+                            partsum += leaves[i*(NODE_COUNT+1)-idx]; // NODE_COUNT+1 leaves per tree, same stride as phase 1
+                        }
+                        #endif
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    if( partidx == 0 ) // the first helper of each window adds up the partial sums and applies the stage threshold
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                            lnf[count] = nf;
+                        }
+                    }
+                }
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( stageIdx == nstages ) // the loop ran to completion: lbuf[0..lcount[0]) holds windows that passed every stage
             {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    int nfaces = atomic_inc(facepos); // facepos[0] counts every detection, including those dropped past maxFaces
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3; // records of 3 ints follow the counter
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
             }
         }
     }
 }
 
+#undef CALC_SUM_OFS_
+#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
 
-__kernel void runLBPClassifierStump(
+__kernel void runLBPClassifierStumpSimple(
+    int nscales, __global const ScaleData* scaleData,
     __global const int* sum,
-    int sumstep, int sumoffset,
+    int _sumstep, int sumoffset,
     __global const OptLBPFeature* optfeatures,
 
-    int nstages,
+    int splitstage, int nstages,
     __global const Stage* stages,
     __global const Stump* stumps,
     __global const int* bitsets,
     int bitsetSize,
 
     volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
     int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int local_size_x = get_local_size(0);
+    int local_size_y = get_local_size(1);
+    int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
+    int ngroups = get_num_groups(0)*get_num_groups(1);
+    int scaleIdx, tileIdx, stageIdx;
+    int startStage = 0, endStage = nstages;
+    int sumstep = (int)(_sumstep/sizeof(int));
 
-    if( ix < imgsize.x && iy < imgsize.y )
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
     {
-        int stageIdx;
-        __global const Stump* stump = stumps;
-        __global const int* p = sum + mad24(iy, sumstep, ix);
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
+                             (worksize.y/ystep + local_size_y-1)/local_size_y);
+        int totalTiles = ntiles.x*ntiles.y;
 
-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
         {
-            int i, ntrees = stages[stageIdx].ntrees;
-            float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
+            int iy = ((tileIdx / ntiles.x)*local_size_y + ly)*ystep;
+            int ix = ((tileIdx % ntiles.x)*local_size_x + lx)*ystep;
+
+            if( ix < worksize.x && iy < worksize.y )
             {
-                float4 st = stump->st;
-                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
-                int16 ofs = f->ofs;
+                __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;
 
-                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
-                ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+                for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
+                {
+                    int i, ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;
 
-                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
 
-                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
-                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
-                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
 
-                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
-                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
-                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
-                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
-                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
 
-                s += (bitsets[idx] & (1 << mask)) ? st.z : st.w;
-            }
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }
 
-            if( s < stages[stageIdx].threshold )
-                break;
+                if( stageIdx == nstages )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        face[0] = scaleIdx;
+                        face[1] = ix;
+                        face[2] = iy;
+                    }
+                }
+            }
         }
+    }
+}
+
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runLBPClassifierStump(
+    int nscales, __global const ScaleData* scaleData,
+    __global const int* sum,
+    int _sumstep, int sumoffset,
+    __global const OptLBPFeature* optfeatures,
+
+    int splitstage, int nstages,
+    __global const Stage* stages,
+    __global const Stump* stumps,
+    __global const int* bitsets,
+    int bitsetSize,
+
+    volatile __global int* facepos,
+    int2 windowsize, int maxFaces)
+{
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    int sumstep = (int)(_sumstep/sizeof(int));
+    int lidx = ly*LOCAL_SIZE_X + lx;
 
-        if( stageIdx == nstages )
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lpartsum = (__local float*)(lcount + 1);
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
+    {
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
+
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
         {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+
+            if( ix0 >= worksize.x || iy0 >= worksize.y )
+                continue;
+            #if SUM_BUF_SIZE > 0
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
             {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            #endif
+            // Unconditional sync: with SUM_BUF_SIZE == 0 nothing else stops work-item 0 from zeroing lcount before the others have read the previous tile's count.
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;
+                #if SUM_BUF_SIZE > 0
+                __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* p = psum0 + mad24(iy, sumstep, ix);
+                #endif
+                // Phase 1: every work-item runs stages [0, splitstage) for its own window (ix, iy) of the tile.
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
+                {
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;
+                        // ofs.s0..ofs.sf are the 4x4 corner offsets of a 3x3 cell grid; cval is the central cell (4).
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+                        // Neighbours 5, 8, 7, 6, 3 form the 5-bit position inside the bitset word selected by idx.
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
+                        // The tested bit decides whether st.z or st.w is added to the stage sum.
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }
+                // Windows that passed phase 1 are queued as packed (ix | iy << 8); when ystep != 1 only even (ix, iy) are kept.
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                {
+                    int count = atomic_inc(lcount);
+                    lbuf[count] = (int)(ix | (iy << 8));
+                }
+            }
+
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            {
+                // Sync first: the queue (lcount, lbuf) filled by the previous phase/iteration must be complete before any work-item reads the count.
+                barrier(CLK_LOCAL_MEM_FENCE);
+                int nrects = lcount[0];
+                barrier(CLK_LOCAL_MEM_FENCE); // every work-item holds the same nrects before lcount is reset below
+                if( nrects == 0 )
+                    break;
+                if( lidx == 0 )
+                    lcount[0] = 0;
+                {
+                    __global const Stump* stump = stumps + stages[stageIdx].first;
+                    __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
+                    int nparts = LOCAL_SIZE / nrects;
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts;
+                    int nr = lidx / nparts;
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f;
+                    // Work-items [nr*nparts, (nr+1)*nparts) share queued window nr; each sums its own slice [ntrees0, ntrees1) of the stage's stumps.
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+
+                        {
+                            int ntrees0 = ntrees_p*partidx;
+                            int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                            int ix1 = idxval & 255, iy1 = idxval >> 8;
+                            #if SUM_BUF_SIZE > 0
+                            __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                            #else
+                            __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
+                            #endif
+
+                            for( i = ntrees0; i < ntrees1; i++ )
+                            {
+                                float4 st = stump[i].st;
+                                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                                int16 ofs = f->ofs;
+
+                                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+                                    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+
+                                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+                                // Neighbours 5, 8, 7, 6, 3 form the 5-bit position inside the bitset word selected by idx.
+                                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
+
+                                partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
+                            }
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                    // The first work-item of each group adds up the partial sums and re-queues the window if the stage threshold is met.
+                    if( partidx == 0 )
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                        }
+                    }
+                }
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( stageIdx == nstages )
+            {
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
             }
         }
     }
diff --git a/modules/objdetect/test/test_cascadeandhog.cpp b/modules/objdetect/test/test_cascadeandhog.cpp
index a301099..e4637e8 100644
--- a/modules/objdetect/test/test_cascadeandhog.cpp
+++ b/modules/objdetect/test/test_cascadeandhog.cpp
@@ -257,6 +257,7 @@ int CV_DetectorTest::runTestCase( int detectorIdx, vector<vector<Rect> >& object
     string dataPath = ts->get_data_path(), detectorFilename;
     if( !detectorFilenames[detectorIdx].empty() )
         detectorFilename = dataPath + detectorFilenames[detectorIdx];
+    printf("detector %s\n", detectorFilename.c_str());
 
     for( int ii = 0; ii < (int)imageFilenames.size(); ++ii )
     {
diff --git a/samples/cpp/ufacedetect.cpp b/samples/cpp/ufacedetect.cpp
index a1726a5..5e13a82 100644
--- a/samples/cpp/ufacedetect.cpp
+++ b/samples/cpp/ufacedetect.cpp
@@ -231,9 +231,14 @@ void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
     smallImg.copyTo(canvas);
 
     double fps = getTickFrequency()/t;
+    static double avgfps = 0;
+    static int nframes = 0;
+    nframes++;
+    double alpha = nframes > 50 ? 0.01 : 1./nframes;
+    avgfps = avgfps*(1-alpha) + fps*alpha;
 
-    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
-            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
+    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
+            FONT_HERSHEY_SIMPLEX, 0.8, Scalar(0,255,0), 2);
 
     for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
     {
-- 
2.7.4