restored SSE2 and added AVX optimization of the old haar face detector
author Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
Tue, 4 Sep 2012 10:59:38 +0000 (14:59 +0400)
committer Vadim Pisarevsky <vadim.pisarevsky@itseez.com>
Tue, 4 Sep 2012 10:59:38 +0000 (14:59 +0400)
modules/core/src/system.cpp
modules/objdetect/src/haar.cpp

index 253d840..f85f78a 100644 (file)
@@ -178,7 +178,7 @@ struct HWFeatures
             f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
             f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
-            f.have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
+            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//requires both: CPU supports AVX (bit 28) and OS uses XSAVE/XRSTOR (bit 27)
         }
 
         return f;
index 983fdcf..3dd032f 100644 (file)
 
 #include "precomp.hpp"
 #include <stdio.h>
-/*
-#if CV_SSE2
+
+
+#if CV_SSE2 || CV_SSE3
 #   if !CV_SSE4_1 && !CV_SSE4_2
 #              define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
 #       define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) 
 #   endif
 #endif
 
-#if defined CV_ICC
-#      if defined CV_AVX
+#      if  CV_AVX
 #              define CV_HAAR_USE_AVX 1
 #      else 
-#              if  defined CV_SSE2 ||  defined CV_SSE4_1 || defined CV_SSE4_2
+#              if  CV_SSE2 || CV_SSE3 
 #                      define CV_HAAR_USE_SSE 1
-#              else 
-#                      define CV_HAAR_NO_SIMD 1
 #              endif
 #      endif
-#endif
-*/
+
 /* these settings affect the quality of detection: change with care */
 #define CV_ADJUST_FEATURES 1
 #define CV_ADJUST_WEIGHTS  0
@@ -636,34 +633,163 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
 }
 
 
+//AVX version of icvEvalHidHaarClassifier. Processes 8 consecutive CvHidHaarClassifiers per call and returns the sum of their outputs. Check AVX support before invocation!
+#ifdef CV_HAAR_USE_AVX 
+CV_INLINE
+double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
+                                                                       double variance_norm_factor, size_t p_offset )
+{
+       int  CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
+       char flags[8] = {0,0,0,0,0,0,0,0};
+       CvHidHaarTreeNode* nodes[8];
+       double res = 0; 
+       char exitConditionFlag = 0;
+       for(;;)
+       {
+               float  CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
+               nodes[0] = classifier    ->node + idxV[0];
+               nodes[1] = (classifier+1)->node + idxV[1];
+               nodes[2] = (classifier+2)->node + idxV[2];
+               nodes[3] = (classifier+3)->node + idxV[3];
+               nodes[4] = (classifier+4)->node + idxV[4];
+               nodes[5] = (classifier+5)->node + idxV[5];
+               nodes[6] = (classifier+6)->node + idxV[6];
+               nodes[7] = (classifier+7)->node + idxV[7];
+
+               __m256 t = _mm256_set1_ps(variance_norm_factor);
+               t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+
+               __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+                               calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+                               p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); 
+               __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, 
+                               nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); 
+               __m256 sum = _mm256_mul_ps(offset, weight);
+                       
+               offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+                               calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+                               calc_sum(nodes[0]->feature.rect[1],p_offset)); 
+               weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, 
+                               nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); 
+       
+               sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+                                       
+               if( nodes[0]->feature.rect[2].p0 )
+                       tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
+               if( nodes[1]->feature.rect[2].p0 )
+            tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
+               if( nodes[2]->feature.rect[2].p0 )
+            tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
+               if( nodes[3]->feature.rect[2].p0 )
+            tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
+               if( nodes[4]->feature.rect[2].p0 )
+                       tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
+               if( nodes[5]->feature.rect[2].p0 )
+            tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
+               if( nodes[6]->feature.rect[2].p0 )
+            tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
+               if( nodes[7]->feature.rect[2].p0 )
+            tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
+               
+               sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
+
+               __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left);
+               __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);
+
+               _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ ))));
+
+               for(int i = 0; i < 8; i++)
+               {
+                       if(idxV[i]<=0)
+                       {
+                               if(!flags[i])
+                               {
+                                       exitConditionFlag++;
+                                       flags[i]=1;
+                                       res+=((classifier+i)->alpha[-idxV[i]]);
+                               }
+                               idxV[i]=0;
+                       }
+               }
+               if(exitConditionFlag==8)
+                       return res;
+       }
+}
+#endif
+
 CV_INLINE
 double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
                                  double variance_norm_factor,
                                  size_t p_offset )
 {
     int idx = 0;
-    do
+       /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX 
+               if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
+               {
+                       double CV_DECL_ALIGNED(16) temp[2];             
+                       __m128d zero = _mm_setzero_pd();
+                       do
+                       {
+                               CvHidHaarTreeNode* node = classifier->node + idx;
+                               __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor);
+                               __m128d left = _mm_set1_pd(node->left);
+                               __m128d right = _mm_set1_pd(node->right);
+
+                               double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                               _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                               if( node->feature.rect[2].p0 )
+                                       _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+
+                               __m128d sum = _mm_set1_pd(_sum);
+                               t = _mm_cmplt_sd(sum, t);
+                               sum = _mm_blendv_pd(right, left, t);
+
+                               _mm_store_pd(temp, sum);
+                               idx = (int)temp[0];             
+                       }
+                       while(idx > 0 );
+                               
+               }
+               else
+       #endif*/
     {
-        CvHidHaarTreeNode* node = classifier->node + idx;
-        double t = node->threshold * variance_norm_factor;
+               do
+               {
+            CvHidHaarTreeNode* node = classifier->node + idx;
+            double t = node->threshold * variance_norm_factor;
 
-        double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-        sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                       double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                       sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
 
-        if( node->feature.rect[2].p0 )
-            sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+                       if( node->feature.rect[2].p0 )
+                               sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
 
-        idx = sum < t ? node->left : node->right;
+                       idx = sum < t ? node->left : node->right;
+               }
+               while( idx > 0 );
     }
-    while( idx > 0 );
     return classifier->alpha[-idx];
 }
 
 
+
 static int
 cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                                CvPoint pt, double& stage_sum, int start_stage )
 {
+       #ifdef CV_HAAR_USE_AVX 
+                       bool haveAVX = false;
+                       if(cv::checkHardwareSupport(CV_CPU_AVX))
+                               if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
+                               {
+                                       haveAVX = true;
+                               }                                       
+       #else 
+               #ifdef CV_HAAR_USE_SSE 
+                       bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
+               #endif
+       #endif
+
     int p_offset, pq_offset;
     int i, j;
     double mean, variance_norm_factor;
@@ -702,10 +828,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
         {
             stage_sum = 0.0;
 
+                       #ifdef CV_HAAR_USE_AVX 
+                       if(haveAVX)
+                       {
+                               for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
+                               {
+                                       stage_sum += icvEvalHidHaarClassifierAVX(
+                                               cascade->stage_classifier[i].classifier+j, 
+                                               variance_norm_factor, p_offset );
+                               }
+                       }
+                       #endif
             for( j = 0; j < ptr->count; j++ )
             {
-                stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j,
-                    variance_norm_factor, p_offset );
+                stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset );
             }
 
             if( stage_sum >= ptr->threshold )
@@ -723,99 +859,287 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
     }
     else if( cascade->isStumpBased )
     {
-        for( i = start_stage; i < cascade->count; i++ )
-        {
-#ifndef CV_HAAR_USE_SSE
-            stage_sum = 0.0;
-#else
-            __m128d stage_sum = _mm_setzero_pd();
-#endif
-
-            if( cascade->stage_classifier[i].two_rects )
-            {
-                for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-                {
-                    CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
-                    CvHidHaarTreeNode* node = classifier->node;
-
-#ifndef CV_HAAR_USE_SSE
-                    double t = node->threshold*variance_norm_factor;
-                    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-                    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-                    stage_sum += classifier->alpha[sum >= t];
-#else
-                    // ayasin - NHM perf optim. Avoid use of costly flaky jcc
-                    __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
-                    __m128d a = _mm_set_sd(classifier->alpha[0]);
-                    __m128d b = _mm_set_sd(classifier->alpha[1]);
-                    __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
-                                             calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
-                    t = _mm_cmpgt_sd(t, sum);
-                    stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
-#endif
-
-                }
-            }
-            else
-            {
-                for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-                {
-                    CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
-                    CvHidHaarTreeNode* node = classifier->node;
-#ifndef CV_HAAR_USE_SSE
-                    double t = node->threshold*variance_norm_factor;
-                    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-                    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-                    if( node->feature.rect[2].p0 )
-                        sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
-
-                    stage_sum += classifier->alpha[sum >= t];
-#else
-                    // ayasin - NHM perf optim. Avoid use of costly flaky jcc
-                    __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
-                    __m128d a = _mm_set_sd(classifier->alpha[0]);
-                    __m128d b = _mm_set_sd(classifier->alpha[1]);
-                    double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-                    _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-                    if( node->feature.rect[2].p0 )
-                        _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
-                    __m128d sum = _mm_set_sd(_sum);
-
-                    t = _mm_cmpgt_sd(t, sum);
-                    stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
-#endif
-                }
-            }
-
-#ifndef CV_HAAR_USE_SSE
-            if( stage_sum < cascade->stage_classifier[i].threshold )
-#else
-            __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
-            if( _mm_comilt_sd(stage_sum, i_threshold) )
-#endif
-                return -i;
-        }
-    }
-    else
+       #ifdef CV_HAAR_USE_AVX 
+                       if(haveAVX)
+                       {
+                               CvHidHaarClassifier* classifiers[8];
+                               CvHidHaarTreeNode* nodes[8];
+                               for( i = start_stage; i < cascade->count; i++ )
+                               {
+                                       stage_sum = 0.0;
+                                       int j = 0;
+                                       float  CV_DECL_ALIGNED(32) buf[8];
+                                       if( cascade->stage_classifier[i].two_rects )
+                                       {
+                                               for( ; j <= cascade->stage_classifier[i].count-8; j+=8 )
+                                               {
+                                                       //__m256 stage_sumPart = _mm256_setzero_ps(); 
+                                                       classifiers[0] = cascade->stage_classifier[i].classifier + j;
+                                                       nodes[0] = classifiers[0]->node;
+                                                       classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
+                                                       nodes[1] = classifiers[1]->node;
+                                                       classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
+                                                       nodes[2]= classifiers[2]->node;
+                                                       classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
+                                                       nodes[3] = classifiers[3]->node;
+                                                       classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
+                                                       nodes[4] = classifiers[4]->node;
+                                                       classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
+                                                       nodes[5] = classifiers[5]->node;
+                                                       classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
+                                                       nodes[6] = classifiers[6]->node;
+                                                       classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
+                                                       nodes[7] = classifiers[7]->node;
+
+                                                       __m256 t = _mm256_set1_ps(variance_norm_factor);
+                                                       t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+
+                                                       __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+                                                               calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+                                                               p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); 
+                                                       __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, 
+                                                               nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); 
+                                                       __m256 sum = _mm256_mul_ps(offset, weight);
+
+                                                       offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+                                                               calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+                                                               calc_sum(nodes[0]->feature.rect[1],p_offset)); 
+                                                       weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, 
+                                                               nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); 
+                                                       sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+                                       
+                                                       __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
+                                                               classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
+                                                       __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
+                                                               classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+
+                                                       _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )));
+                                                       stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+                       
+                                               }
+                       
+                                               for( ; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+
+                                                       double t = node->threshold*variance_norm_factor;
+                                                       double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                                                       sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                                                       stage_sum += classifier->alpha[sum >= t];
+                                               }
+                                       }
+                                       else
+                                       {
+                                               for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
+                                               {
+                                                       float  CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
+
+                                                       classifiers[0] = cascade->stage_classifier[i].classifier + j;
+                                                       nodes[0] = classifiers[0]->node;
+                                                       classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
+                                                       nodes[1] = classifiers[1]->node;
+                                                       classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
+                                                       nodes[2]= classifiers[2]->node;
+                                                       classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
+                                                       nodes[3] = classifiers[3]->node;
+                                                       classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
+                                                       nodes[4] = classifiers[4]->node;
+                                                       classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
+                                                       nodes[5] = classifiers[5]->node;
+                                                       classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
+                                                       nodes[6] = classifiers[6]->node;
+                                                       classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
+                                                       nodes[7] = classifiers[7]->node;
+
+                                                       __m256 t = _mm256_set1_ps(variance_norm_factor);
+                                                       t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+                                                       
+                                                       __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+                                                               calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+                                                               p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); 
+                                                       __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, 
+                                                               nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); 
+                                                       __m256 sum = _mm256_mul_ps(offset, weight);
+
+                                                       offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+                                                               calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+                                                               calc_sum(nodes[0]->feature.rect[1],p_offset)); 
+                                                       weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, 
+                                                               nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); 
+                               
+                                                       sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+
+                                                       if( nodes[0]->feature.rect[2].p0 )
+                                                               tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
+                                                       if( nodes[1]->feature.rect[2].p0 )
+                                                               tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
+                                                       if( nodes[2]->feature.rect[2].p0 )
+                                                               tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
+                                                       if( nodes[3]->feature.rect[2].p0 )
+                                                               tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
+                                                       if( nodes[4]->feature.rect[2].p0 )
+                                                               tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
+                                                       if( nodes[5]->feature.rect[2].p0 )
+                                                               tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
+                                                       if( nodes[6]->feature.rect[2].p0 )
+                                                               tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
+                                                       if( nodes[7]->feature.rect[2].p0 )
+                                                               tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
+                               
+                                                       sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
+
+                                                       __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
+                                                               classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
+                                                       __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
+                                                               classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+
+                                                       __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
+                                                       outBuf = _mm256_hadd_ps(outBuf, outBuf);
+                                                       outBuf = _mm256_hadd_ps(outBuf, outBuf);
+                                                       _mm256_store_ps(buf, outBuf);
+                                                       stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); 
+                                               }
+                               
+                                               for( ; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+
+                                                       double t = node->threshold*variance_norm_factor;
+                                                       double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                                                       sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                                                       if( node->feature.rect[2].p0 )
+                                                               sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+                                                       stage_sum += classifier->alpha[sum >= t];
+                                               }
+                                       }
+                                       if( stage_sum < cascade->stage_classifier[i].threshold )
+                                               return -i;
+                               }
+                       }
+                       else
+       #endif
+       #ifdef  CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
+                       if(haveSSE2) 
+                       {
+                               for( i = start_stage; i < cascade->count; i++ )
+                               {
+                                       __m128d stage_sum = _mm_setzero_pd();
+                                       if( cascade->stage_classifier[i].two_rects )
+                                       {
+                                               for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+
+                                                       // ayasin - NHM perf optimization: choose between alpha[0] and alpha[1] with a compare + blend instead of a costly, hard-to-predict conditional jump (jcc)
+                                                       __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
+                                                       __m128d a = _mm_set_sd(classifier->alpha[0]);
+                                                       __m128d b = _mm_set_sd(classifier->alpha[1]);
+                                                       __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
+                                                                                                               calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
+                                                       t = _mm_cmpgt_sd(t, sum);
+                                                       stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
+                                               }
+                                       }
+                                       else
+                                       {
+                                               for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+                                                       // ayasin - NHM perf optimization: choose between alpha[0] and alpha[1] with a compare + blend instead of a costly, hard-to-predict conditional jump (jcc)
+                                                       __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
+                                                       __m128d a = _mm_set_sd(classifier->alpha[0]);
+                                                       __m128d b = _mm_set_sd(classifier->alpha[1]);
+                                                       double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                                                       _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                                                       if( node->feature.rect[2].p0 )
+                                                               _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+                                                       __m128d sum = _mm_set_sd(_sum);
+
+                                                       t = _mm_cmpgt_sd(t, sum);
+                                                       stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
+                                               }
+                                       }
+                                       __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
+                                       if( _mm_comilt_sd(stage_sum, i_threshold) )
+                                               return -i;
+                               }
+                       }
+                       else 
+       #endif
+                       {
+                               for( i = start_stage; i < cascade->count; i++ )
+                               {
+                                       stage_sum = 0.0;
+                                       if( cascade->stage_classifier[i].two_rects )
+                                       {
+                                               for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+                                                       double t = node->threshold*variance_norm_factor;
+                                                       double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                                                       sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                                                       stage_sum += classifier->alpha[sum >= t];
+                                               }
+                                       }
+                                       else
+                                       {
+                                               for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+                                               {
+                                                       CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+                                                       CvHidHaarTreeNode* node = classifier->node;
+                                                       double t = node->threshold*variance_norm_factor;
+                                                       double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+                                                       sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+                                                       if( node->feature.rect[2].p0 )
+                                                               sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+                                                       stage_sum += classifier->alpha[sum >= t];
+                                               }
+                                       }
+                                       if( stage_sum < cascade->stage_classifier[i].threshold )
+                                               return -i;
+                               }
+                       }
+       }
+    
+       else
     {
         for( i = start_stage; i < cascade->count; i++ )
         {
             stage_sum = 0.0;
-
-            for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-            {
-                stage_sum += icvEvalHidHaarClassifier(
-                    cascade->stage_classifier[i].classifier + j,
-                    variance_norm_factor, p_offset );
-            }
-
+                       int j = 0;
+                       #ifdef CV_HAAR_USE_AVX 
+                       if(haveAVX)
+                       {
+                               for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
+                               {
+                                       stage_sum += icvEvalHidHaarClassifierAVX(
+                                               cascade->stage_classifier[i].classifier+j, 
+                                               variance_norm_factor, p_offset );
+                               }
+                       }
+                       #endif
+                               for(; j < cascade->stage_classifier[i].count; j++ )
+                               {
+                               
+                                       stage_sum += icvEvalHidHaarClassifier(
+                                               cascade->stage_classifier[i].classifier + j,
+                                               variance_norm_factor, p_offset );
+                               }
+                       
             if( stage_sum < cascade->stage_classifier[i].threshold )
                 return -i;
         }
     }
+       //_mm256_zeroupper();
     return 1;
 }
 
+
 CV_IMPL int
 cvRunHaarClassifierCascade( const CvHaarClassifierCascade* _cascade,
                             CvPoint pt, int start_stage )