From 07d92d9e5a4f3250231c3cddbd51911ebd00c666 Mon Sep 17 00:00:00 2001 From: Andrey Kamaev Date: Tue, 4 Sep 2012 17:44:23 +0400 Subject: [PATCH] Fix android build warnings --- modules/contrib/src/bowmsctrainer.cpp | 22 +- modules/contrib/src/openfabmap.cpp | 28 +- modules/objdetect/src/haar.cpp | 820 +++++++++++++++++----------------- samples/cpp/pca.cpp | 64 +-- 4 files changed, 467 insertions(+), 467 deletions(-) diff --git a/modules/contrib/src/bowmsctrainer.cpp b/modules/contrib/src/bowmsctrainer.cpp index c952282..448505c 100644 --- a/modules/contrib/src/bowmsctrainer.cpp +++ b/modules/contrib/src/bowmsctrainer.cpp @@ -81,46 +81,46 @@ Mat BOWMSCTrainer::cluster() const { return cluster(mergedDescriptors); } -Mat BOWMSCTrainer::cluster(const Mat& descriptors) const { +Mat BOWMSCTrainer::cluster(const Mat& _descriptors) const { - CV_Assert(!descriptors.empty()); + CV_Assert(!_descriptors.empty()); // TODO: sort the descriptors before clustering. - Mat icovar = Mat::eye(descriptors.cols,descriptors.cols,descriptors.type()); + Mat icovar = Mat::eye(_descriptors.cols,_descriptors.cols,_descriptors.type()); vector initialCentres; - initialCentres.push_back(descriptors.row(0)); - for (int i = 1; i < descriptors.rows; i++) { + initialCentres.push_back(_descriptors.row(0)); + for (int i = 1; i < _descriptors.rows; i++) { double minDist = DBL_MAX; for (size_t j = 0; j < initialCentres.size(); j++) { minDist = std::min(minDist, - cv::Mahalanobis(descriptors.row(i),initialCentres[j], + cv::Mahalanobis(_descriptors.row(i),initialCentres[j], icovar)); } if (minDist > clusterSize) - initialCentres.push_back(descriptors.row(i)); + initialCentres.push_back(_descriptors.row(i)); } std::vector > clusters; clusters.resize(initialCentres.size()); - for (int i = 0; i < descriptors.rows; i++) { + for (int i = 0; i < _descriptors.rows; i++) { int index = 0; double dist = 0, minDist = DBL_MAX; for (size_t j = 0; j < initialCentres.size(); j++) { - dist = cv::Mahalanobis(descriptors.row(i),initialCentres[j],icovar); + dist = cv::Mahalanobis(_descriptors.row(i),initialCentres[j],icovar); if (dist < minDist) { minDist = dist; index = (int)j; } } - clusters[index].push_back(descriptors.row(i)); + clusters[index].push_back(_descriptors.row(i)); } // TODO: throw away small clusters. Mat vocabulary; - Mat centre = Mat::zeros(1,descriptors.cols,descriptors.type()); + Mat centre = Mat::zeros(1,_descriptors.cols,_descriptors.type()); for (size_t i = 0; i < clusters.size(); i++) { centre.setTo(0); for (std::list::iterator Ci = clusters[i].begin(); Ci != clusters[i].end(); Ci++) { diff --git a/modules/contrib/src/openfabmap.cpp b/modules/contrib/src/openfabmap.cpp index 99795c9..a6b4ac1 100644 --- a/modules/contrib/src/openfabmap.cpp +++ b/modules/contrib/src/openfabmap.cpp @@ -63,7 +63,7 @@ namespace of2 { static double logsumexp(double a, double b) { return a > b ? log(1 + exp(b - a)) + a : log(1 + exp(a - b)) + b; } - + FabMap::FabMap(const Mat& _clTree, double _PzGe, double _PzGNe, int _flags, int _numSamples) : clTree(_clTree), PzGe(_PzGe), PzGNe(_PzGNe), flags( @@ -445,16 +445,16 @@ FabMap1::~FabMap1() { } void FabMap1::getLikelihoods(const Mat& queryImgDescriptor, - const vector& testImgDescriptors, vector& matches) { + const vector& testImageDescriptors, vector& matches) { - for (size_t i = 0; i < testImgDescriptors.size(); i++) { + for (size_t i = 0; i < testImageDescriptors.size(); i++) { bool zq, zpq, Lzq; double logP = 0; for (int q = 0; q < clTree.cols; q++) { zq = queryImgDescriptor.at(0,q) > 0; zpq = queryImgDescriptor.at(0,pq(q)) > 0; - Lzq = testImgDescriptors[i].at(0,q) > 0; + Lzq = testImageDescriptors[i].at(0,q) > 0; logP += log((this->*PzGL)(q, zq, zpq, Lzq)); @@ -490,16 +490,16 @@ FabMapLUT::~FabMapLUT() { } void FabMapLUT::getLikelihoods(const Mat& queryImgDescriptor, - const vector& testImgDescriptors, vector& matches) { + const vector& testImageDescriptors, vector& matches) { double precFactor = (double)pow(10.0, -precision); - for (size_t i = 0; i < testImgDescriptors.size(); i++) { + for (size_t i = 0; i < testImageDescriptors.size(); i++) { unsigned long long int logP = 0; for (int q = 0; q < clTree.cols; q++) { logP += table[q][(queryImgDescriptor.at(0,pq(q)) > 0) + ((queryImgDescriptor.at(0, q) > 0) << 1) + - ((testImgDescriptors[i].at(0,q) > 0) << 2)]; + ((testImageDescriptors[i].at(0,q) > 0) << 2)]; } matches.push_back(IMatch(0,(int)i,-precFactor*(double)logP,0)); } @@ -518,7 +518,7 @@ FabMapFBO::~FabMapFBO() { } void FabMapFBO::getLikelihoods(const Mat& queryImgDescriptor, - const vector& testImgDescriptors, vector& matches) { + const vector& testImageDescriptors, vector& matches) { std::multiset wordData; setWordStatistics(queryImgDescriptor, wordData); @@ -526,7 +526,7 @@ void FabMapFBO::getLikelihoods(const Mat& queryImgDescriptor, vector matchIndices; vector queryMatches; - for (size_t i = 0; i < testImgDescriptors.size(); i++) { + for (size_t i = 0; i < testImageDescriptors.size(); i++) { queryMatches.push_back(IMatch(0,(int)i,0,0)); matchIndices.push_back((int)i); } @@ -543,7 +543,7 @@ void FabMapFBO::getLikelihoods(const Mat& queryImgDescriptor, for (size_t i = 0; i < matchIndices.size(); i++) { bool Lzq = - testImgDescriptors[matchIndices[i]].at(0,wordIter->q) > 0; + testImageDescriptors[matchIndices[i]].at(0,wordIter->q) > 0; queryMatches[matchIndices[i]].likelihood += log((this->*PzGL)(wordIter->q,zq,zpq,Lzq)); currBest = @@ -689,17 +689,17 @@ void FabMap2::add(const vector& queryImgDescriptors) { } void FabMap2::getLikelihoods(const Mat& queryImgDescriptor, - const vector& testImgDescriptors, vector& matches) { + const vector& testImageDescriptors, vector& matches) { - if (&testImgDescriptors== &(this->testImgDescriptors)) { + if (&testImageDescriptors == &testImgDescriptors) { getIndexLikelihoods(queryImgDescriptor, testDefaults, testInvertedMap, matches); } else { CV_Assert(!(flags & MOTION_MODEL)); vector defaults; std::map > invertedMap; - for (size_t i = 0; i < testImgDescriptors.size(); i++) { - addToIndex(testImgDescriptors[i],defaults,invertedMap); + for (size_t i = 0; i < testImageDescriptors.size(); i++) { + addToIndex(testImageDescriptors[i],defaults,invertedMap); } getIndexLikelihoods(queryImgDescriptor, defaults, invertedMap, matches); } diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 3dd032f..4ba4e7f 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -47,18 +47,18 @@ #if CV_SSE2 || CV_SSE3 # if !CV_SSE4_1 && !CV_SSE4_2 -# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) -# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) +# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) +# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) # endif #endif -# if CV_AVX -# define CV_HAAR_USE_AVX 1 -# else -# if CV_SSE2 || CV_SSE3 -# define CV_HAAR_USE_SSE 1 -# endif -# endif +# if CV_AVX +# define CV_HAAR_USE_AVX 1 +# else +# if CV_SSE2 || CV_SSE3 +# define CV_HAAR_USE_SSE 1 +# endif +# endif /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 @@ -634,86 +634,86 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, //AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! -#ifdef CV_HAAR_USE_AVX +#ifdef CV_HAAR_USE_AVX CV_INLINE double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, - double variance_norm_factor, size_t p_offset ) + double variance_norm_factor, size_t p_offset ) { - int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; - char flags[8] = {0,0,0,0,0,0,0,0}; - CvHidHaarTreeNode* nodes[8]; - double res = 0; - char exitConditionFlag = 0; - for(;;) - { - float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; - nodes[0] = classifier ->node + idxV[0]; - nodes[1] = (classifier+1)->node + idxV[1]; - nodes[2] = (classifier+2)->node + idxV[2]; - nodes[3] = (classifier+3)->node + idxV[3]; - nodes[4] = (classifier+4)->node + idxV[4]; - nodes[5] = (classifier+5)->node + idxV[5]; - nodes[6] = (classifier+6)->node + idxV[6]; - nodes[7] = (classifier+7)->node + idxV[7]; - - __m256 t = _mm256_set1_ps(variance_norm_factor); - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), - calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], - p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), - calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), - calc_sum(nodes[0]->feature.rect[1],p_offset)); - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); - - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); - - if( nodes[0]->feature.rect[2].p0 ) - tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; - if( nodes[1]->feature.rect[2].p0 ) + int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; + char flags[8] = {0,0,0,0,0,0,0,0}; + CvHidHaarTreeNode* nodes[8]; + double res = 0; + char exitConditionFlag = 0; + for(;;) + { + float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; + nodes[0] = classifier ->node + idxV[0]; + nodes[1] = (classifier+1)->node + idxV[1]; + nodes[2] = (classifier+2)->node + idxV[2]; + nodes[3] = (classifier+3)->node + idxV[3]; + nodes[4] = (classifier+4)->node + idxV[4]; + nodes[5] = (classifier+5)->node + idxV[5]; + nodes[6] = (classifier+6)->node + idxV[6]; + nodes[7] = (classifier+7)->node + idxV[7]; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + if( nodes[0]->feature.rect[2].p0 ) + tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; + if( nodes[1]->feature.rect[2].p0 ) tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; - if( nodes[2]->feature.rect[2].p0 ) + if( nodes[2]->feature.rect[2].p0 ) tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; - if( nodes[3]->feature.rect[2].p0 ) + if( nodes[3]->feature.rect[2].p0 ) tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; - if( nodes[4]->feature.rect[2].p0 ) - tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; - if( nodes[5]->feature.rect[2].p0 ) + if( nodes[4]->feature.rect[2].p0 ) + tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; + if( nodes[5]->feature.rect[2].p0 ) tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; - if( nodes[6]->feature.rect[2].p0 ) + if( nodes[6]->feature.rect[2].p0 ) tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; - if( nodes[7]->feature.rect[2].p0 ) + if( nodes[7]->feature.rect[2].p0 ) tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; - - sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); - - __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); - __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); - - _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); - - for(int i = 0; i < 8; i++) - { - if(idxV[i]<=0) - { - if(!flags[i]) - { - exitConditionFlag++; - flags[i]=1; - res+=((classifier+i)->alpha[-idxV[i]]); - } - idxV[i]=0; - } - } - if(exitConditionFlag==8) - return res; - } + + sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); + + __m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); + __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); + + _mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); + + for(int i = 0; i < 8; i++) + { + if(idxV[i]<=0) + { + if(!flags[i]) + { + exitConditionFlag++; + flags[i]=1; + res+=((classifier+i)->alpha[-idxV[i]]); + } + idxV[i]=0; + } + } + if(exitConditionFlag==8) + return res; + } } #endif @@ -723,50 +723,50 @@ double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, size_t p_offset ) { int idx = 0; - /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX - if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow - { - double CV_DECL_ALIGNED(16) temp[2]; - __m128d zero = _mm_setzero_pd(); - do - { - CvHidHaarTreeNode* node = classifier->node + idx; - __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor); - __m128d left = _mm_set1_pd(node->left); - __m128d right = _mm_set1_pd(node->right); - - double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - - __m128d sum = _mm_set1_pd(_sum); - t = _mm_cmplt_sd(sum, t); - sum = _mm_blendv_pd(right, left, t); - - _mm_store_pd(temp, sum); - idx = (int)temp[0]; - } - while(idx > 0 ); - - } - else - #endif*/ + /*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX + if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow + { + double CV_DECL_ALIGNED(16) temp[2]; + __m128d zero = _mm_setzero_pd(); + do + { + CvHidHaarTreeNode* node = classifier->node + idx; + __m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor); + __m128d left = _mm_set1_pd(node->left); + __m128d right = _mm_set1_pd(node->right); + + double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + + __m128d sum = _mm_set1_pd(_sum); + t = _mm_cmplt_sd(sum, t); + sum = _mm_blendv_pd(right, left, t); + + _mm_store_pd(temp, sum); + idx = (int)temp[0]; + } + while(idx > 0 ); + + } + else + #endif*/ { - do - { + do + { CvHidHaarTreeNode* node = classifier->node + idx; double t = node->threshold * variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - idx = sum < t ? node->left : node->right; - } - while( idx > 0 ); + idx = sum < t ? node->left : node->right; + } + while( idx > 0 ); } return classifier->alpha[-idx]; } @@ -777,18 +777,18 @@ static int cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, CvPoint pt, double& stage_sum, int start_stage ) { - #ifdef CV_HAAR_USE_AVX - bool haveAVX = false; - if(cv::checkHardwareSupport(CV_CPU_AVX)) - if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers - { - haveAVX = true; - } - #else - #ifdef CV_HAAR_USE_SSE - bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); - #endif - #endif + #ifdef CV_HAAR_USE_AVX + bool haveAVX = false; + if(cv::checkHardwareSupport(CV_CPU_AVX)) + if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers + { + haveAVX = true; + } + #else + #ifdef CV_HAAR_USE_SSE + bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); + #endif + #endif int p_offset, pq_offset; int i, j; @@ -828,17 +828,17 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, { stage_sum = 0.0; - #ifdef CV_HAAR_USE_AVX - if(haveAVX) - { - for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) - { - stage_sum += icvEvalHidHaarClassifierAVX( - cascade->stage_classifier[i].classifier+j, - variance_norm_factor, p_offset ); - } - } - #endif + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) + { + stage_sum += icvEvalHidHaarClassifierAVX( + cascade->stage_classifier[i].classifier+j, + variance_norm_factor, p_offset ); + } + } + #endif for( j = 0; j < ptr->count; j++ ) { stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset ); @@ -859,283 +859,283 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, } else if( cascade->isStumpBased ) { - #ifdef CV_HAAR_USE_AVX - if(haveAVX) - { - CvHidHaarClassifier* classifiers[8]; - CvHidHaarTreeNode* nodes[8]; - for( i = start_stage; i < cascade->count; i++ ) - { - stage_sum = 0.0; - int j = 0; - float CV_DECL_ALIGNED(32) buf[8]; - if( cascade->stage_classifier[i].two_rects ) - { - for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) - { - //__m256 stage_sumPart = _mm256_setzero_ps(); - classifiers[0] = cascade->stage_classifier[i].classifier + j; - nodes[0] = classifiers[0]->node; - classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; - nodes[1] = classifiers[1]->node; - classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; - nodes[2]= classifiers[2]->node; - classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; - nodes[3] = classifiers[3]->node; - classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; - nodes[4] = classifiers[4]->node; - classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; - nodes[5] = classifiers[5]->node; - classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; - nodes[6] = classifiers[6]->node; - classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; - nodes[7] = classifiers[7]->node; - - __m256 t = _mm256_set1_ps(variance_norm_factor); - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), - calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], - p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), - calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), - calc_sum(nodes[0]->feature.rect[1],p_offset)); - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); - - __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], - classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); - __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], - classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); - - _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); - stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); - - } - - for( ; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - stage_sum += classifier->alpha[sum >= t]; - } - } - else - { - for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) - { - float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; - - classifiers[0] = cascade->stage_classifier[i].classifier + j; - nodes[0] = classifiers[0]->node; - classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; - nodes[1] = classifiers[1]->node; - classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; - nodes[2]= classifiers[2]->node; - classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; - nodes[3] = classifiers[3]->node; - classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; - nodes[4] = classifiers[4]->node; - classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; - nodes[5] = classifiers[5]->node; - classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; - nodes[6] = classifiers[6]->node; - classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; - nodes[7] = classifiers[7]->node; - - __m256 t = _mm256_set1_ps(variance_norm_factor); - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), - calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], - p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), - calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), - calc_sum(nodes[0]->feature.rect[1],p_offset)); - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); - - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); - - if( nodes[0]->feature.rect[2].p0 ) - tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; - if( nodes[1]->feature.rect[2].p0 ) - tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; - if( nodes[2]->feature.rect[2].p0 ) - tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; - if( nodes[3]->feature.rect[2].p0 ) - tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; - if( nodes[4]->feature.rect[2].p0 ) - tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; - if( nodes[5]->feature.rect[2].p0 ) - tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; - if( nodes[6]->feature.rect[2].p0 ) - tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; - if( nodes[7]->feature.rect[2].p0 ) - tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; - - sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); - - __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], - classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); - __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], - classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); - - __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); - outBuf = _mm256_hadd_ps(outBuf, outBuf); - outBuf = _mm256_hadd_ps(outBuf, outBuf); - _mm256_store_ps(buf, outBuf); - stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); - } - - for( ; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - stage_sum += classifier->alpha[sum >= t]; - } - } - if( stage_sum < cascade->stage_classifier[i].threshold ) - return -i; - } - } - else - #endif - #ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization - if(haveSSE2) - { - for( i = start_stage; i < cascade->count; i++ ) - { - __m128d stage_sum = _mm_setzero_pd(); - if( cascade->stage_classifier[i].two_rects ) - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + - calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); - } - } - else - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - __m128d sum = _mm_set_sd(_sum); - - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); - } - } - __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold); - if( _mm_comilt_sd(stage_sum, i_threshold) ) - return -i; - } - } - else - #endif - { - for( i = start_stage; i < cascade->count; i++ ) - { - stage_sum = 0.0; - if( cascade->stage_classifier[i].two_rects ) - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - stage_sum += classifier->alpha[sum >= t]; - } - } - else - { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) - { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - stage_sum += classifier->alpha[sum >= t]; - } - } - if( stage_sum < cascade->stage_classifier[i].threshold ) - return -i; - } - } - } - - else + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + CvHidHaarClassifier* classifiers[8]; + CvHidHaarTreeNode* nodes[8]; + for( i = start_stage; i < cascade->count; i++ ) + { + stage_sum = 0.0; + int j = 0; + float CV_DECL_ALIGNED(32) buf[8]; + if( cascade->stage_classifier[i].two_rects ) + { + for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) + { + //__m256 stage_sumPart = _mm256_setzero_ps(); + classifiers[0] = cascade->stage_classifier[i].classifier + j; + nodes[0] = classifiers[0]->node; + classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; + nodes[1] = classifiers[1]->node; + classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; + nodes[2]= classifiers[2]->node; + classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; + nodes[3] = classifiers[3]->node; + classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; + nodes[4] = classifiers[4]->node; + classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; + nodes[5] = classifiers[5]->node; + classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; + nodes[6] = classifiers[6]->node; + classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; + nodes[7] = classifiers[7]->node; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], + classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], + classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); + + _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); + stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + + } + + for( ; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + else + { + for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) + { + float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; + + classifiers[0] = cascade->stage_classifier[i].classifier + j; + nodes[0] = classifiers[0]->node; + classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; + nodes[1] = classifiers[1]->node; + classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; + nodes[2]= classifiers[2]->node; + classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; + nodes[3] = classifiers[3]->node; + classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; + nodes[4] = classifiers[4]->node; + classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; + nodes[5] = classifiers[5]->node; + classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; + nodes[6] = classifiers[6]->node; + classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; + nodes[7] = classifiers[7]->node; + + __m256 t = _mm256_set1_ps(variance_norm_factor); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), + calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], + p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), + calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), + calc_sum(nodes[0]->feature.rect[1],p_offset)); + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); + + if( nodes[0]->feature.rect[2].p0 ) + tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; + if( nodes[1]->feature.rect[2].p0 ) + tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; + if( nodes[2]->feature.rect[2].p0 ) + tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; + if( nodes[3]->feature.rect[2].p0 ) + tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; + if( nodes[4]->feature.rect[2].p0 ) + tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; + if( nodes[5]->feature.rect[2].p0 ) + tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; + if( nodes[6]->feature.rect[2].p0 ) + tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; + if( nodes[7]->feature.rect[2].p0 ) + tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; + + sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); + + __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], + classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], + classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); + + __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + _mm256_store_ps(buf, outBuf); + stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + } + + for( ; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + if( stage_sum < cascade->stage_classifier[i].threshold ) + return -i; + } + } + else + #endif + #if defined CV_HAAR_USE_SSE && CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization + if(haveSSE2) + { + for( i = start_stage; i < cascade->count; i++ ) + { + __m128d stage_sum = _mm_setzero_pd(); + if( cascade->stage_classifier[i].two_rects ) + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + + calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } + } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + __m128d sum = _mm_set_sd(_sum); + + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } + } + __m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold); + if( _mm_comilt_sd(stage_sum, i_threshold) ) + return -i; + } + } + else + #endif + { + for( i = start_stage; i < cascade->count; i++ ) + { + stage_sum = 0.0; + if( cascade->stage_classifier[i].two_rects ) + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + stage_sum += classifier->alpha[sum >= t]; + } + } + if( stage_sum < cascade->stage_classifier[i].threshold ) + return -i; + } + } + } + + else { for( i = start_stage; i < cascade->count; i++ ) { stage_sum = 0.0; - int j = 0; - #ifdef CV_HAAR_USE_AVX - if(haveAVX) - { - for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) - { - stage_sum += icvEvalHidHaarClassifierAVX( - cascade->stage_classifier[i].classifier+j, - variance_norm_factor, p_offset ); - } - } - #endif - for(; j < cascade->stage_classifier[i].count; j++ ) - { - - stage_sum += icvEvalHidHaarClassifier( - cascade->stage_classifier[i].classifier + j, - variance_norm_factor, p_offset ); - } - + int k = 0; + #ifdef CV_HAAR_USE_AVX + if(haveAVX) + { + for( ; k < cascade->stage_classifier[i].count-8; k+=8 ) + { + stage_sum += icvEvalHidHaarClassifierAVX( + cascade->stage_classifier[i].classifier+k, + variance_norm_factor, p_offset ); + } + } + #endif + for(; k < cascade->stage_classifier[i].count; k++ ) + { + + stage_sum += icvEvalHidHaarClassifier( + cascade->stage_classifier[i].classifier + k, + variance_norm_factor, p_offset ); + } + if( stage_sum < cascade->stage_classifier[i].threshold ) return -i; } } - //_mm256_zeroupper(); + //_mm256_zeroupper(); return 1; } diff --git a/samples/cpp/pca.cpp b/samples/cpp/pca.cpp index b489a5e..7ab86ea 100644 --- a/samples/cpp/pca.cpp +++ b/samples/cpp/pca.cpp @@ -1,13 +1,13 @@ /* * pca.cpp * -* Author: +* Author: * Kevin Hughes * * Special Thanks to: * Philipp Wagner * -* This program demonstrates how to use OpenCV PCA with a +* This program demonstrates how to use OpenCV PCA with a * specified amount of variance to retain. The effect * is illustrated further by using a trackbar to * change the value for retained varaince. @@ -17,9 +17,9 @@ * on this list of images. The author recommends using * the first 15 faces of the AT&T face data set: * http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html -* +* * so for example your input text file would look like this: -* +* * /orl_faces/s1/1.pgm * /orl_faces/s2/1.pgm * /orl_faces/s3/1.pgm @@ -50,7 +50,7 @@ using namespace std; /////////////////////// // Functions -void read_imgList(const string& filename, vector& images) { +static void read_imgList(const string& filename, vector& images) { std::ifstream file(filename.c_str(), ifstream::in); if (!file) { string error_message = "No valid input file was given, please check the given filename."; @@ -62,19 +62,19 @@ void read_imgList(const string& filename, vector& images) { } } -Mat formatImagesForPCA(const vector &data) +static Mat formatImagesForPCA(const vector &data) { Mat dst(data.size(), data[0].rows*data[0].cols, CV_32F); for(unsigned int i = 0; i < data.size(); i++) { Mat image_row = data[i].clone().reshape(1,1); Mat row_i = dst.row(i); - image_row.convertTo(row_i,CV_32F); + image_row.convertTo(row_i,CV_32F); } return dst; } -Mat toGrayscale(InputArray _src) { +static Mat toGrayscale(InputArray _src) { Mat src = _src.getMat(); // only allow one channel if(src.channels() != 1) { @@ -95,22 +95,22 @@ struct params string winName; }; -void onTrackbar(int pos, void* ptr) -{ +static void onTrackbar(int pos, void* ptr) +{ cout << "Retained Variance = " << pos << "% "; cout << "re-calculating PCA..." << std::flush; - + double var = pos / 100.0; - + struct params *p = (struct params *)ptr; - + p->pca = PCA(p->data, cv::Mat(), CV_PCA_DATA_AS_ROW, var); - + Mat point = p->pca.project(p->data.row(0)); Mat reconstruction = p->pca.backProject(point); reconstruction = reconstruction.reshape(p->ch, p->rows); reconstruction = toGrayscale(reconstruction); - + imshow(p->winName, reconstruction); cout << "done! # of principal components: " << p->pca.eigenvectors.rows << endl; } @@ -118,19 +118,19 @@ void onTrackbar(int pos, void* ptr) /////////////////////// // Main -int main(int argc, char** argv) +int main(int argc, char** argv) { if (argc != 2) { cout << "usage: " << argv[0] << " " << endl; exit(1); } - + // Get the path to your CSV. string imgList = string(argv[1]); - + // vector to hold the images vector images; - + // Read in the data. This can fail if not valid try { read_imgList(imgList, images); @@ -138,29 +138,29 @@ int main(int argc, char** argv) cerr << "Error opening file \"" << imgList << "\". Reason: " << e.msg << endl; exit(1); } - + // Quit if there are not enough images for this demo. if(images.size() <= 1) { string error_message = "This demo needs at least 2 images to work. Please add more images to your data set!"; CV_Error(CV_StsError, error_message); } - + // Reshape and stack images into a rowMatrix Mat data = formatImagesForPCA(images); - + // perform PCA PCA pca(data, cv::Mat(), CV_PCA_DATA_AS_ROW, 0.95); // trackbar is initially set here, also this is a common value for retainedVariance - - // Demonstration of the effect of retainedVariance on the first image + + // Demonstration of the effect of retainedVariance on the first image Mat point = pca.project(data.row(0)); // project into the eigenspace, thus the image becomes a "point" Mat reconstruction = pca.backProject(point); // re-create the image from the "point" reconstruction = reconstruction.reshape(images[0].channels(), images[0].rows); // reshape from a row vector into image shape reconstruction = toGrayscale(reconstruction); // re-scale for displaying purposes - + // init highgui window string winName = "Reconstruction | press 'q' to quit"; namedWindow(winName, CV_WINDOW_NORMAL); - + // params struct to pass to the trackbar handler params p; p.data = data; @@ -168,17 +168,17 @@ int main(int argc, char** argv) p.rows = images[0].rows; p.pca = pca; p.winName = winName; - + // create the tracbar int pos = 95; - createTrackbar("Retained Variance (%)", winName, &pos, 100, onTrackbar, (void*)&p); - + createTrackbar("Retained Variance (%)", winName, &pos, 100, onTrackbar, (void*)&p); + // display until user presses q imshow(winName, reconstruction); - + char key = 0; while(key != 'q') key = waitKey(); - - return 0; + + return 0; } -- 2.7.4