/* Fallback definitions of _mm_blendv_pd / _mm_blendv_ps for builds that have
   SSE2 or SSE3 but neither SSE4.1 nor SSE4.2.
   The expression a ^ ((b ^ a) & m) is a bitwise select: every result bit comes
   from b where the matching bit of m is 1 and from a where it is 0.
   NOTE(review): this matches a per-element blend only when each mask element is
   all-ones or all-zeros (as produced by the compare intrinsics) - confirm that
   every caller passes such a mask. */
#if CV_SSE2 || CV_SSE3
# if !CV_SSE4_1 && !CV_SSE4_2
-# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
-# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
+# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
+# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# endif
#endif
/* Compile-time choice of the SIMD code path: CV_HAAR_USE_AVX is defined when
   CV_AVX is set; otherwise CV_HAAR_USE_SSE is defined when CV_SSE2 or CV_SSE3
   is set. At most one of the two macros is defined by this section. */
-# if CV_AVX
-# define CV_HAAR_USE_AVX 1
-# else
-# if CV_SSE2 || CV_SSE3
-# define CV_HAAR_USE_SSE 1
-# endif
-# endif
+# if CV_AVX
+# define CV_HAAR_USE_AVX 1
+# else
+# if CV_SSE2 || CV_SSE3
+# define CV_HAAR_USE_SSE 1
+# endif
+# endif
/* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1
// AVX variant of icvEvalHidHaarClassifier: evaluates the 8 consecutive
// classifiers classifier[0] .. classifier[7] in one call and returns the sum
// of the alpha values they select.
//
// Parameters:
//   classifier           - first of 8 adjacent CvHidHaarClassifier entries. All 8
//                          are read unconditionally, so the caller must
//                          guarantee that they exist.
//   variance_norm_factor - scales every node threshold. It is passed to
//                          _mm256_set1_ps, so it is used at float precision here.
//   p_offset             - forwarded unchanged to calc_sum (defined elsewhere).
//
// Each iteration of the loop does the following for the 8 lanes at once:
//   1. Lane i takes the node (classifier+i)->node + idxV[i]; every lane starts
//      at index 0.
//   2. sum = calc_sum(rect[0]) * weight + calc_sum(rect[1]) * weight, plus the
//      rect[2] term only when rect[2].p0 is non-zero (collected through tmp[]).
//   3. The next index is node->left where sum < threshold * variance_norm_factor
//      and node->right otherwise. The indices are converted to float by
//      _mm256_set_ps and truncated back to int by _mm256_cvttps_epi32.
//   4. An index <= 0 marks a finished lane: the first time this happens,
//      alpha[-index] is added to the result and the lane is flagged. The index
//      is then reset to 0, so a finished lane keeps being evaluated from its
//      first node but contributes nothing further.
// The function returns as soon as all 8 lanes are flagged.
//
// idxV and tmp are declared with CV_DECL_ALIGNED(32) and are accessed with the
// aligned _mm256_store_si256 / _mm256_load_ps.
//
// The caller must check for AVX support before invoking this function.
-#ifdef CV_HAAR_USE_AVX
+#ifdef CV_HAAR_USE_AVX
CV_INLINE
double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
- double variance_norm_factor, size_t p_offset )
+ double variance_norm_factor, size_t p_offset )
{
- int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
- char flags[8] = {0,0,0,0,0,0,0,0};
- CvHidHaarTreeNode* nodes[8];
- double res = 0;
- char exitConditionFlag = 0;
- for(;;)
- {
- float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
- nodes[0] = classifier ->node + idxV[0];
- nodes[1] = (classifier+1)->node + idxV[1];
- nodes[2] = (classifier+2)->node + idxV[2];
- nodes[3] = (classifier+3)->node + idxV[3];
- nodes[4] = (classifier+4)->node + idxV[4];
- nodes[5] = (classifier+5)->node + idxV[5];
- nodes[6] = (classifier+6)->node + idxV[6];
- nodes[7] = (classifier+7)->node + idxV[7];
-
- __m256 t = _mm256_set1_ps(variance_norm_factor);
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
- calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
- p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
- calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
- calc_sum(nodes[0]->feature.rect[1],p_offset));
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
-
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
-
- if( nodes[0]->feature.rect[2].p0 )
- tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
- if( nodes[1]->feature.rect[2].p0 )
+ int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
+ char flags[8] = {0,0,0,0,0,0,0,0};
+ CvHidHaarTreeNode* nodes[8];
+ double res = 0;
+ char exitConditionFlag = 0;
+ for(;;)
+ {
+ float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
+ nodes[0] = classifier ->node + idxV[0];
+ nodes[1] = (classifier+1)->node + idxV[1];
+ nodes[2] = (classifier+2)->node + idxV[2];
+ nodes[3] = (classifier+3)->node + idxV[3];
+ nodes[4] = (classifier+4)->node + idxV[4];
+ nodes[5] = (classifier+5)->node + idxV[5];
+ nodes[6] = (classifier+6)->node + idxV[6];
+ nodes[7] = (classifier+7)->node + idxV[7];
+
+ __m256 t = _mm256_set1_ps(variance_norm_factor);
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+ calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+ p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
+ __m256 sum = _mm256_mul_ps(offset, weight);
+
+ offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+ calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+ calc_sum(nodes[0]->feature.rect[1],p_offset));
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
+
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+
+ if( nodes[0]->feature.rect[2].p0 )
+ tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
+ if( nodes[1]->feature.rect[2].p0 )
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
- if( nodes[2]->feature.rect[2].p0 )
+ if( nodes[2]->feature.rect[2].p0 )
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
- if( nodes[3]->feature.rect[2].p0 )
+ if( nodes[3]->feature.rect[2].p0 )
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
- if( nodes[4]->feature.rect[2].p0 )
- tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
- if( nodes[5]->feature.rect[2].p0 )
+ if( nodes[4]->feature.rect[2].p0 )
+ tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
+ if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
- if( nodes[6]->feature.rect[2].p0 )
+ if( nodes[6]->feature.rect[2].p0 )
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
- if( nodes[7]->feature.rect[2].p0 )
+ if( nodes[7]->feature.rect[2].p0 )
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
-
- sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
-
- __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left);
- __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);
-
- _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ ))));
-
- for(int i = 0; i < 8; i++)
- {
- if(idxV[i]<=0)
- {
- if(!flags[i])
- {
- exitConditionFlag++;
- flags[i]=1;
- res+=((classifier+i)->alpha[-idxV[i]]);
- }
- idxV[i]=0;
- }
- }
- if(exitConditionFlag==8)
- return res;
- }
+
+ sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
+
+ __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left);
+ __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);
+
+ _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ ))));
+
+ for(int i = 0; i < 8; i++)
+ {
+ if(idxV[i]<=0)
+ {
+ if(!flags[i])
+ {
+ exitConditionFlag++;
+ flags[i]=1;
+ res+=((classifier+i)->alpha[-idxV[i]]);
+ }
+ idxV[i]=0;
+ }
+ }
+ if(exitConditionFlag==8)
+ return res;
+ }
}
#endif
size_t p_offset )
{
int idx = 0;
- /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX
- if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
- {
- double CV_DECL_ALIGNED(16) temp[2];
- __m128d zero = _mm_setzero_pd();
- do
- {
- CvHidHaarTreeNode* node = classifier->node + idx;
- __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor);
- __m128d left = _mm_set1_pd(node->left);
- __m128d right = _mm_set1_pd(node->right);
-
- double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- if( node->feature.rect[2].p0 )
- _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
-
- __m128d sum = _mm_set1_pd(_sum);
- t = _mm_cmplt_sd(sum, t);
- sum = _mm_blendv_pd(right, left, t);
-
- _mm_store_pd(temp, sum);
- idx = (int)temp[0];
- }
- while(idx > 0 );
-
- }
- else
- #endif*/
+ /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX
+ if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
+ {
+ double CV_DECL_ALIGNED(16) temp[2];
+ __m128d zero = _mm_setzero_pd();
+ do
+ {
+ CvHidHaarTreeNode* node = classifier->node + idx;
+ __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor);
+ __m128d left = _mm_set1_pd(node->left);
+ __m128d right = _mm_set1_pd(node->right);
+
+ double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ if( node->feature.rect[2].p0 )
+ _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+
+ __m128d sum = _mm_set1_pd(_sum);
+ t = _mm_cmplt_sd(sum, t);
+ sum = _mm_blendv_pd(right, left, t);
+
+ _mm_store_pd(temp, sum);
+ idx = (int)temp[0];
+ }
+ while(idx > 0 );
+
+ }
+ else
+ #endif*/
{
- do
- {
+ do
+ {
CvHidHaarTreeNode* node = classifier->node + idx;
double t = node->threshold * variance_norm_factor;
- double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- if( node->feature.rect[2].p0 )
- sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+ if( node->feature.rect[2].p0 )
+ sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
- idx = sum < t ? node->left : node->right;
- }
- while( idx > 0 );
+ idx = sum < t ? node->left : node->right;
+ }
+ while( idx > 0 );
}
return classifier->alpha[-idx];
}
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
CvPoint pt, double& stage_sum, int start_stage )
{
- #ifdef CV_HAAR_USE_AVX
- bool haveAVX = false;
- if(cv::checkHardwareSupport(CV_CPU_AVX))
- if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
- {
- haveAVX = true;
- }
- #else
- #ifdef CV_HAAR_USE_SSE
- bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
- #endif
- #endif
+ #ifdef CV_HAAR_USE_AVX
+ bool haveAVX = false;
+ if(cv::checkHardwareSupport(CV_CPU_AVX))
+ if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
+ {
+ haveAVX = true;
+ }
+ #else
+ #ifdef CV_HAAR_USE_SSE
+ bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
+ #endif
+ #endif
int p_offset, pq_offset;
int i, j;
{
stage_sum = 0.0;
- #ifdef CV_HAAR_USE_AVX
- if(haveAVX)
- {
- for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
- {
- stage_sum += icvEvalHidHaarClassifierAVX(
- cascade->stage_classifier[i].classifier+j,
- variance_norm_factor, p_offset );
- }
- }
- #endif
+ #ifdef CV_HAAR_USE_AVX
+ if(haveAVX)
+ {
+ for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
+ {
+ stage_sum += icvEvalHidHaarClassifierAVX(
+ cascade->stage_classifier[i].classifier+j,
+ variance_norm_factor, p_offset );
+ }
+ }
+ #endif
for( j = 0; j < ptr->count; j++ )
{
stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset );
}
else if( cascade->isStumpBased )
{
- #ifdef CV_HAAR_USE_AVX
- if(haveAVX)
- {
- CvHidHaarClassifier* classifiers[8];
- CvHidHaarTreeNode* nodes[8];
- for( i = start_stage; i < cascade->count; i++ )
- {
- stage_sum = 0.0;
- int j = 0;
- float CV_DECL_ALIGNED(32) buf[8];
- if( cascade->stage_classifier[i].two_rects )
- {
- for( ; j <= cascade->stage_classifier[i].count-8; j+=8 )
- {
- //__m256 stage_sumPart = _mm256_setzero_ps();
- classifiers[0] = cascade->stage_classifier[i].classifier + j;
- nodes[0] = classifiers[0]->node;
- classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
- nodes[1] = classifiers[1]->node;
- classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
- nodes[2]= classifiers[2]->node;
- classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
- nodes[3] = classifiers[3]->node;
- classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
- nodes[4] = classifiers[4]->node;
- classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
- nodes[5] = classifiers[5]->node;
- classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
- nodes[6] = classifiers[6]->node;
- classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
- nodes[7] = classifiers[7]->node;
-
- __m256 t = _mm256_set1_ps(variance_norm_factor);
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
- calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
- p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
- calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
- calc_sum(nodes[0]->feature.rect[1],p_offset));
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
-
- __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
- classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
- __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
- classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
-
- _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )));
- stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
-
- }
-
- for( ; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
-
- double t = node->threshold*variance_norm_factor;
- double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- stage_sum += classifier->alpha[sum >= t];
- }
- }
- else
- {
- for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
- {
- float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
-
- classifiers[0] = cascade->stage_classifier[i].classifier + j;
- nodes[0] = classifiers[0]->node;
- classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
- nodes[1] = classifiers[1]->node;
- classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
- nodes[2]= classifiers[2]->node;
- classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
- nodes[3] = classifiers[3]->node;
- classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
- nodes[4] = classifiers[4]->node;
- classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
- nodes[5] = classifiers[5]->node;
- classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
- nodes[6] = classifiers[6]->node;
- classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
- nodes[7] = classifiers[7]->node;
-
- __m256 t = _mm256_set1_ps(variance_norm_factor);
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
- calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
- p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
- calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
- calc_sum(nodes[0]->feature.rect[1],p_offset));
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
-
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
-
- if( nodes[0]->feature.rect[2].p0 )
- tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
- if( nodes[1]->feature.rect[2].p0 )
- tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
- if( nodes[2]->feature.rect[2].p0 )
- tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
- if( nodes[3]->feature.rect[2].p0 )
- tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
- if( nodes[4]->feature.rect[2].p0 )
- tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
- if( nodes[5]->feature.rect[2].p0 )
- tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
- if( nodes[6]->feature.rect[2].p0 )
- tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
- if( nodes[7]->feature.rect[2].p0 )
- tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
-
- sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
-
- __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
- classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
- __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
- classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
-
- __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
- outBuf = _mm256_hadd_ps(outBuf, outBuf);
- outBuf = _mm256_hadd_ps(outBuf, outBuf);
- _mm256_store_ps(buf, outBuf);
- stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
- }
-
- for( ; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
-
- double t = node->threshold*variance_norm_factor;
- double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- if( node->feature.rect[2].p0 )
- sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
- stage_sum += classifier->alpha[sum >= t];
- }
- }
- if( stage_sum < cascade->stage_classifier[i].threshold )
- return -i;
- }
- }
- else
- #endif
- #ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
- if(haveSSE2)
- {
- for( i = start_stage; i < cascade->count; i++ )
- {
- __m128d stage_sum = _mm_setzero_pd();
- if( cascade->stage_classifier[i].two_rects )
- {
- for( j = 0; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
-
- // ayasin - NHM perf optim. Avoid use of costly flaky jcc
- __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
- __m128d a = _mm_set_sd(classifier->alpha[0]);
- __m128d b = _mm_set_sd(classifier->alpha[1]);
- __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
- calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
- t = _mm_cmpgt_sd(t, sum);
- stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
- }
- }
- else
- {
- for( j = 0; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
- // ayasin - NHM perf optim. Avoid use of costly flaky jcc
- __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
- __m128d a = _mm_set_sd(classifier->alpha[0]);
- __m128d b = _mm_set_sd(classifier->alpha[1]);
- double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- if( node->feature.rect[2].p0 )
- _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
- __m128d sum = _mm_set_sd(_sum);
-
- t = _mm_cmpgt_sd(t, sum);
- stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
- }
- }
- __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
- if( _mm_comilt_sd(stage_sum, i_threshold) )
- return -i;
- }
- }
- else
- #endif
- {
- for( i = start_stage; i < cascade->count; i++ )
- {
- stage_sum = 0.0;
- if( cascade->stage_classifier[i].two_rects )
- {
- for( j = 0; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
- double t = node->threshold*variance_norm_factor;
- double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- stage_sum += classifier->alpha[sum >= t];
- }
- }
- else
- {
- for( j = 0; j < cascade->stage_classifier[i].count; j++ )
- {
- CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
- CvHidHaarTreeNode* node = classifier->node;
- double t = node->threshold*variance_norm_factor;
- double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
- sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
- if( node->feature.rect[2].p0 )
- sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
- stage_sum += classifier->alpha[sum >= t];
- }
- }
- if( stage_sum < cascade->stage_classifier[i].threshold )
- return -i;
- }
- }
- }
-
- else
+ #ifdef CV_HAAR_USE_AVX
+ if(haveAVX)
+ {
+ CvHidHaarClassifier* classifiers[8];
+ CvHidHaarTreeNode* nodes[8];
+ for( i = start_stage; i < cascade->count; i++ )
+ {
+ stage_sum = 0.0;
+ int j = 0;
+ float CV_DECL_ALIGNED(32) buf[8];
+ if( cascade->stage_classifier[i].two_rects )
+ {
+ for( ; j <= cascade->stage_classifier[i].count-8; j+=8 )
+ {
+ //__m256 stage_sumPart = _mm256_setzero_ps();
+ classifiers[0] = cascade->stage_classifier[i].classifier + j;
+ nodes[0] = classifiers[0]->node;
+ classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
+ nodes[1] = classifiers[1]->node;
+ classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
+ nodes[2]= classifiers[2]->node;
+ classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
+ nodes[3] = classifiers[3]->node;
+ classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
+ nodes[4] = classifiers[4]->node;
+ classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
+ nodes[5] = classifiers[5]->node;
+ classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
+ nodes[6] = classifiers[6]->node;
+ classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
+ nodes[7] = classifiers[7]->node;
+
+ __m256 t = _mm256_set1_ps(variance_norm_factor);
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+ calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+ p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
+ __m256 sum = _mm256_mul_ps(offset, weight);
+
+ offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+ calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+ calc_sum(nodes[0]->feature.rect[1],p_offset));
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+
+ __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
+ classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
+ __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
+ classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+
+ _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )));
+ stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+
+ }
+
+ for( ; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+
+ double t = node->threshold*variance_norm_factor;
+ double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ stage_sum += classifier->alpha[sum >= t];
+ }
+ }
+ else
+ {
+ for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
+ {
+ float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
+
+ classifiers[0] = cascade->stage_classifier[i].classifier + j;
+ nodes[0] = classifiers[0]->node;
+ classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
+ nodes[1] = classifiers[1]->node;
+ classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
+ nodes[2]= classifiers[2]->node;
+ classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
+ nodes[3] = classifiers[3]->node;
+ classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
+ nodes[4] = classifiers[4]->node;
+ classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
+ nodes[5] = classifiers[5]->node;
+ classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
+ nodes[6] = classifiers[6]->node;
+ classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
+ nodes[7] = classifiers[7]->node;
+
+ __m256 t = _mm256_set1_ps(variance_norm_factor);
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
+ calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
+ p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
+ __m256 sum = _mm256_mul_ps(offset, weight);
+
+ offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
+ calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
+ calc_sum(nodes[0]->feature.rect[1],p_offset));
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
+
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
+
+ if( nodes[0]->feature.rect[2].p0 )
+ tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
+ if( nodes[1]->feature.rect[2].p0 )
+ tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
+ if( nodes[2]->feature.rect[2].p0 )
+ tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
+ if( nodes[3]->feature.rect[2].p0 )
+ tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
+ if( nodes[4]->feature.rect[2].p0 )
+ tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
+ if( nodes[5]->feature.rect[2].p0 )
+ tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
+ if( nodes[6]->feature.rect[2].p0 )
+ tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
+ if( nodes[7]->feature.rect[2].p0 )
+ tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
+
+ sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
+
+ __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
+ classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
+ __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
+ classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
+
+ __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
+ outBuf = _mm256_hadd_ps(outBuf, outBuf);
+ outBuf = _mm256_hadd_ps(outBuf, outBuf);
+ _mm256_store_ps(buf, outBuf);
+ stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+ }
+
+ for( ; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+
+ double t = node->threshold*variance_norm_factor;
+ double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ if( node->feature.rect[2].p0 )
+ sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+ stage_sum += classifier->alpha[sum >= t];
+ }
+ }
+ if( stage_sum < cascade->stage_classifier[i].threshold )
+ return -i;
+ }
+ }
+ else
+ #endif
+ #if defined CV_HAAR_USE_SSE && CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
+ if(haveSSE2)
+ {
+ for( i = start_stage; i < cascade->count; i++ )
+ {
+ __m128d stage_sum = _mm_setzero_pd();
+ if( cascade->stage_classifier[i].two_rects )
+ {
+ for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+
+ // ayasin - NHM perf optim. Avoid use of costly flaky jcc
+ __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
+ __m128d a = _mm_set_sd(classifier->alpha[0]);
+ __m128d b = _mm_set_sd(classifier->alpha[1]);
+ __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
+ calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
+ t = _mm_cmpgt_sd(t, sum);
+ stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
+ }
+ }
+ else
+ {
+ for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+ // ayasin - NHM perf optim. Avoid use of costly flaky jcc
+ __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
+ __m128d a = _mm_set_sd(classifier->alpha[0]);
+ __m128d b = _mm_set_sd(classifier->alpha[1]);
+ double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ if( node->feature.rect[2].p0 )
+ _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+ __m128d sum = _mm_set_sd(_sum);
+
+ t = _mm_cmpgt_sd(t, sum);
+ stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
+ }
+ }
+ __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
+ if( _mm_comilt_sd(stage_sum, i_threshold) )
+ return -i;
+ }
+ }
+ else
+ #endif
+ {
+ for( i = start_stage; i < cascade->count; i++ )
+ {
+ stage_sum = 0.0;
+ if( cascade->stage_classifier[i].two_rects )
+ {
+ for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+ double t = node->threshold*variance_norm_factor;
+ double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ stage_sum += classifier->alpha[sum >= t];
+ }
+ }
+ else
+ {
+ for( j = 0; j < cascade->stage_classifier[i].count; j++ )
+ {
+ CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
+ CvHidHaarTreeNode* node = classifier->node;
+ double t = node->threshold*variance_norm_factor;
+ double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
+ sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
+ if( node->feature.rect[2].p0 )
+ sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
+ stage_sum += classifier->alpha[sum >= t];
+ }
+ }
+ if( stage_sum < cascade->stage_classifier[i].threshold )
+ return -i;
+ }
+ }
+ }
+
+ else
{
for( i = start_stage; i < cascade->count; i++ )
{
stage_sum = 0.0;
- int j = 0;
- #ifdef CV_HAAR_USE_AVX
- if(haveAVX)
- {
- for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
- {
- stage_sum += icvEvalHidHaarClassifierAVX(
- cascade->stage_classifier[i].classifier+j,
- variance_norm_factor, p_offset );
- }
- }
- #endif
- for(; j < cascade->stage_classifier[i].count; j++ )
- {
-
- stage_sum += icvEvalHidHaarClassifier(
- cascade->stage_classifier[i].classifier + j,
- variance_norm_factor, p_offset );
- }
-
+ int k = 0;
+ #ifdef CV_HAAR_USE_AVX
+ if(haveAVX)
+ {
+ for( ; k < cascade->stage_classifier[i].count-8; k+=8 )
+ {
+ stage_sum += icvEvalHidHaarClassifierAVX(
+ cascade->stage_classifier[i].classifier+k,
+ variance_norm_factor, p_offset );
+ }
+ }
+ #endif
+ for(; k < cascade->stage_classifier[i].count; k++ )
+ {
+
+ stage_sum += icvEvalHidHaarClassifier(
+ cascade->stage_classifier[i].classifier + k,
+ variance_norm_factor, p_offset );
+ }
+
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
}
}
- //_mm256_zeroupper();
+ //_mm256_zeroupper();
return 1;
}