From 959674618f29cbf1ed26d6d43d4ecf0fd4d66eb8 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 4 Sep 2012 14:59:38 +0400 Subject: [PATCH] restored SSE2 and added AVX optimization of the old haar face detector --- modules/core/src/system.cpp | 2 +- modules/objdetect/src/haar.cpp | 528 +++++++++++++++++++++++++++++++++-------- 2 files changed, 427 insertions(+), 103 deletions(-) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 253d840..f85f78a 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -178,7 +178,7 @@ struct HWFeatures f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; - f.have[CV_CPU_AVX] = (cpuid_data[2] & (1<<28)) != 0; + f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX } return f; diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 983fdcf..3dd032f 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -43,26 +43,23 @@ #include "precomp.hpp" #include -/* -#if CV_SSE2 + + +#if CV_SSE2 || CV_SSE3 # if !CV_SSE4_1 && !CV_SSE4_2 # define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) # define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) # endif #endif -#if defined CV_ICC -# if defined CV_AVX +# if CV_AVX # define CV_HAAR_USE_AVX 1 # else -# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2 +# if CV_SSE2 || CV_SSE3 # define CV_HAAR_USE_SSE 1 -# else -# define CV_HAAR_NO_SIMD 1 # endif # endif -#endif -*/ + /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 #define CV_ADJUST_WEIGHTS 0 @@ -636,34 +633,163 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, } +//AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! +#ifdef CV_HAAR_USE_AVX +CV_INLINE +double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, + double variance_norm_factor, size_t p_offset ) +{ + int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; + char flags[8] = {0,0,0,0,0,0,0,0}; + CvHidHaarTreeNode* nodes[8]; + double res = 0; + char exitConditionFlag = 0; + for(;;) + { + float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; + nodes[0] = classifier ->node + idxV[0]; + nodes[1] = (classifier+1)->node + idxV[1]; + nodes[2] = (classifier+2)->node + idxV[2]; + nodes[3] = (classifier+3)->node + idxV[3]; + nodes[4] = (classifier+4)->node + idxV[4]; + nodes[5] = (classifier+5)->node + idxV[5]; + nodes[6] = (classifier+6)->node + idxV[6]; + nodes[7] = (classifier+7)->node + idxV[7]; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + if( nodes[0]->feature.rect[2].p0 ) + tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; + if( nodes[1]->feature.rect[2].p0 ) + tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; + if( nodes[2]->feature.rect[2].p0 ) + tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; + if( nodes[3]->feature.rect[2].p0 ) + tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; + if( nodes[4]->feature.rect[2].p0 ) + tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; + if( nodes[5]->feature.rect[2].p0 ) + tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; + if( nodes[6]->feature.rect[2].p0 ) + tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; + if( nodes[7]->feature.rect[2].p0 ) + tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; + + sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); + + __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); + __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); + + _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); + + for(int i = 0; i < 8; i++) + { + if(idxV[i]<=0) + { + if(!flags[i]) + { + exitConditionFlag++; + flags[i]=1; + res+=((classifier+i)->alpha[-idxV[i]]); + } + idxV[i]=0; + } + } + if(exitConditionFlag==8) + return res; + } +} +#endif + CV_INLINE double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset ) { int idx = 0; - do + /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX + if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow + { + double CV_DECL_ALIGNED(16) temp[2]; + __m128d zero = _mm_setzero_pd(); + do + { + CvHidHaarTreeNode* node = classifier->node + idx; + __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor); + __m128d left = _mm_set1_pd(node->left); + __m128d right = _mm_set1_pd(node->right); + + double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + + __m128d sum = _mm_set1_pd(_sum); + t = _mm_cmplt_sd(sum, t); + sum = _mm_blendv_pd(right, left, t); + + _mm_store_pd(temp, sum); + idx = (int)temp[0]; + } + while(idx > 0 ); + + } + else + #endif*/ { - CvHidHaarTreeNode* node = classifier->node + idx; - double t = node->threshold * variance_norm_factor; + do + { + CvHidHaarTreeNode* node = classifier->node + idx; + double t = node->threshold * variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - idx = sum < t ? node->left : node->right; + idx = sum < t ? node->left : node->right; + } + while( idx > 0 ); } - while( idx > 0 ); return classifier->alpha[-idx]; } + static int cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, CvPoint pt, double& stage_sum, int start_stage ) { + #ifdef CV_HAAR_USE_AVX + bool haveAVX = false; + if(cv::checkHardwareSupport(CV_CPU_AVX)) + if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers + { + haveAVX = true; + } + #else + #ifdef CV_HAAR_USE_SSE + bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); + #endif + #endif + int p_offset, pq_offset; int i, j; double mean, variance_norm_factor; @@ -702,10 +828,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, { stage_sum = 0.0; + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) + { + stage_sum += icvEvalHidHaarClassifierAVX( + cascade->stage_classifier[i].classifier+j, + variance_norm_factor, p_offset ); + } + } + #endif for( j = 0; j < ptr->count; j++ ) { - stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, - variance_norm_factor, p_offset ); + stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset ); } if( stage_sum >= ptr->threshold ) @@ -723,99 +859,287 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, } else if( cascade->isStumpBased ) { - for( i = start_stage; i < cascade->count; i++ ) - { -#ifndef CV_HAAR_USE_SSE - stage_sum = 0.0; -#else - __m128d stage_sum = _mm_setzero_pd(); -#endif - - if( cascade->stage_classifier[i].two_rects ) - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - -#ifndef CV_HAAR_USE_SSE - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - stage_sum += classifier->alpha[sum >= t]; -#else - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + - calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); -#endif - - } - } - else - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; -#ifndef CV_HAAR_USE_SSE - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - - stage_sum += classifier->alpha[sum >= t]; -#else - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - __m128d sum = _mm_set_sd(_sum); - - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); -#endif - } - } - -#ifndef CV_HAAR_USE_SSE - if( stage_sum < cascade->stage_classifier[i].threshold ) -#else - __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold); - if( _mm_comilt_sd(stage_sum, i_threshold) ) -#endif - return -i; - } - } - else + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + CvHidHaarClassifier* classifiers[8]; + CvHidHaarTreeNode* nodes[8]; + for( i = start_stage; i < cascade->count; i++ ) + { + stage_sum = 0.0; + int j = 0; + float CV_DECL_ALIGNED(32) buf[8]; + if( cascade->stage_classifier[i].two_rects ) + { + for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) + { + //__m256 stage_sumPart = _mm256_setzero_ps(); + classifiers[0] = cascade->stage_classifier[i].classifier + j; + nodes[0] = classifiers[0]->node; + classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; + nodes[1] = classifiers[1]->node; + classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; + nodes[2]= classifiers[2]->node; + classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; + nodes[3] = classifiers[3]->node; + classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; + nodes[4] = classifiers[4]->node; + classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; + nodes[5] = classifiers[5]->node; + classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; + nodes[6] = classifiers[6]->node; + classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; + nodes[7] = classifiers[7]->node; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], + classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], + classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); + + _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); + stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + + } + + for( ; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + else + { + for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) + { + float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; + + classifiers[0] = cascade->stage_classifier[i].classifier + j; + nodes[0] = classifiers[0]->node; + classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; + nodes[1] = classifiers[1]->node; + classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; + nodes[2]= classifiers[2]->node; + classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; + nodes[3] = classifiers[3]->node; + classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; + nodes[4] = classifiers[4]->node; + classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; + nodes[5] = classifiers[5]->node; + classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; + nodes[6] = classifiers[6]->node; + classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; + nodes[7] = classifiers[7]->node; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + if( nodes[0]->feature.rect[2].p0 ) + tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; + if( nodes[1]->feature.rect[2].p0 ) + tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; + if( nodes[2]->feature.rect[2].p0 ) + tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; + if( nodes[3]->feature.rect[2].p0 ) + tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; + if( nodes[4]->feature.rect[2].p0 ) + tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; + if( nodes[5]->feature.rect[2].p0 ) + tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; + if( nodes[6]->feature.rect[2].p0 ) + tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; + if( nodes[7]->feature.rect[2].p0 ) + tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; + + sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); + + __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], + classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], + classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); + + __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + _mm256_store_ps(buf, outBuf); + stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + } + + for( ; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + if( stage_sum < cascade->stage_classifier[i].threshold ) + return -i; + } + } + else + #endif + #ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization + if(haveSSE2) + { + for( i = start_stage; i < cascade->count; i++ ) + { + __m128d stage_sum = _mm_setzero_pd(); + if( cascade->stage_classifier[i].two_rects ) + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + + calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } + } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + __m128d sum = _mm_set_sd(_sum); + + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } + } + __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold); + if( _mm_comilt_sd(stage_sum, i_threshold) ) + return -i; + } + } + else + #endif + { + for( i = start_stage; i < cascade->count; i++ ) + { + stage_sum = 0.0; + if( cascade->stage_classifier[i].two_rects ) + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + if( stage_sum < cascade->stage_classifier[i].threshold ) + return -i; + } + } + } + + else { for( i = start_stage; i < cascade->count; i++ ) { stage_sum = 0.0; - - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - stage_sum += icvEvalHidHaarClassifier( - cascade->stage_classifier[i].classifier + j, - variance_norm_factor, p_offset ); - } - + int j = 0; + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) + { + stage_sum += icvEvalHidHaarClassifierAVX( + cascade->stage_classifier[i].classifier+j, + variance_norm_factor, p_offset ); + } + } + #endif + for(; j < cascade->stage_classifier[i].count; j++ ) + { + + stage_sum += icvEvalHidHaarClassifier( + cascade->stage_classifier[i].classifier + j, + variance_norm_factor, p_offset ); + } + if( stage_sum < cascade->stage_classifier[i].threshold ) return -i; } } + //_mm256_zeroupper(); return 1; } + CV_IMPL int cvRunHaarClassifierCascade( const CvHaarClassifierCascade* _cascade, CvPoint pt, int start_stage ) -- 2.7.4