From 1dbe5ccc5f416debb5762c883c98ad4c95a89fc7 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 22 Sep 2011 10:40:48 +0000 Subject: [PATCH] improved phaseCorrelate() performance (thanks to Will Lucas for the patch) --- modules/imgproc/src/phasecorr.cpp | 364 +++++++++++++++++++++++++++++++------- 1 file changed, 300 insertions(+), 64 deletions(-) diff --git a/modules/imgproc/src/phasecorr.cpp b/modules/imgproc/src/phasecorr.cpp index fec4d56..f5f6e18 100644 --- a/modules/imgproc/src/phasecorr.cpp +++ b/modules/imgproc/src/phasecorr.cpp @@ -38,66 +38,317 @@ namespace cv { -static void divComplex(InputArray _src1, InputArray _src2, OutputArray _dst) +static void magSpectrums( InputArray _src, OutputArray _dst) { - Mat src1 = _src1.getMat(); - Mat src2 = _src2.getMat(); - - CV_Assert( src1.type() == src2.type() && src1.size() == src2.size()); - CV_Assert( src1.type() == CV_32FC2 || src1.type() == CV_64FC2 ); - - _dst.create(src1.size(), src1.type()); - Mat dst = _dst.getMat(); - - int length = src1.rows*src1.cols; - - if(src1.depth() == CV_32F) + Mat src = _src.getMat(); + int depth = src.depth(), cn = src.channels(), type = src.type(); + int rows = src.rows, cols = src.cols; + int j, k; + + CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 ); + + if(src.depth() == CV_32F) + _dst.create( src.rows, src.cols, CV_32FC1 ); + else + _dst.create( src.rows, src.cols, CV_64FC1 ); + + Mat dst = _dst.getMat(); + + bool is_1d = (rows == 1 || (cols == 1 && src.isContinuous() && dst.isContinuous())); + + if( is_1d ) + cols = cols + rows - 1, rows = 1; + + int ncols = cols*cn; + int j0 = cn == 1; + int j1 = ncols - (cols % 2 == 0 && cn == 1); + + if( depth == CV_32F ) { - const float* dataA = (const float*)src1.data; - const float* dataB = (const float*)src2.data; + const float* dataSrc = (const float*)src.data; + float* dataDst = (float*)dst.data; + + size_t stepSrc = src.step/sizeof(dataSrc[0]); + size_t stepDst = dst.step/sizeof(dataDst[0]); + + if( !is_1d && cn == 1 ) + { + for( k = 0; k < (cols % 2 ? 1 : 2); k++ ) + { + if( k == 1 ) + dataSrc += cols - 1, dataDst += cols - 1; + dataDst[0] = dataSrc[0]*dataSrc[0]; + if( rows % 2 == 0 ) + dataDst[(rows-1)*stepDst] = dataSrc[(rows-1)*stepSrc]*dataSrc[(rows-1)*stepSrc]; + + for( j = 1; j <= rows - 2; j += 2 ) + { + dataDst[j*stepDst] = (double)dataSrc[j*stepSrc]*dataSrc[j*stepSrc] + + (double)dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]; + } + + if( k == 1 ) + dataSrc -= cols - 1, dataDst -= cols - 1; + } + } + + for( ; rows--; dataSrc += stepSrc, dataDst += stepDst ) + { + if( is_1d && cn == 1 ) + { + dataDst[0] = dataSrc[0]*dataSrc[0]; + if( cols % 2 == 0 ) + dataDst[j1] = dataSrc[j1]*dataSrc[j1]; + } + + for( j = j0; j < j1; j += 2 ) + { + dataDst[j] = (double)dataSrc[j]*dataSrc[j] + (double)dataSrc[j+1]*dataSrc[j+1]; + } + } + } + else + { + const double* dataSrc = (const double*)src.data; + double* dataDst = (double*)dst.data; + + size_t stepSrc = src.step/sizeof(dataSrc[0]); + size_t stepDst = dst.step/sizeof(dataDst[0]); + + if( !is_1d && cn == 1 ) + { + for( k = 0; k < (cols % 2 ? 1 : 2); k++ ) + { + if( k == 1 ) + dataSrc += cols - 1, dataDst += cols - 1; + dataDst[0] = dataSrc[0]*dataSrc[0]; + if( rows % 2 == 0 ) + dataDst[(rows-1)*stepDst] = dataSrc[(rows-1)*stepSrc]*dataSrc[(rows-1)*stepSrc]; + + for( j = 1; j <= rows - 2; j += 2 ) + { + dataDst[j*stepDst] = dataSrc[j*stepSrc]*dataSrc[j*stepSrc] + + dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]; + } + + if( k == 1 ) + dataSrc -= cols - 1, dataDst -= cols - 1; + } + } + + for( ; rows--; dataSrc += stepSrc, dataDst += stepDst ) + { + if( is_1d && cn == 1 ) + { + dataDst[0] = dataSrc[0]*dataSrc[0]; + if( cols % 2 == 0 ) + dataDst[j1] = dataSrc[j1]*dataSrc[j1]; + } + + for( j = j0; j < j1; j += 2 ) + { + dataDst[j] = dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1]; + } + } + } + + // do batch sqrt to use SSE optimizations... + cv::sqrt(dst, dst); +} + +static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB) +{ + Mat srcA = _srcA.getMat(), srcB = _srcB.getMat(); + int depth = srcA.depth(), cn = srcA.channels(), type = srcA.type(); + int rows = srcA.rows, cols = srcA.cols; + int j, k; + + CV_Assert( type == srcB.type() && srcA.size() == srcB.size() ); + CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 ); + + _dst.create( srcA.rows, srcA.cols, type ); + Mat dst = _dst.getMat(); + + bool is_1d = (flags & DFT_ROWS) || (rows == 1 || (cols == 1 && + srcA.isContinuous() && srcB.isContinuous() && dst.isContinuous())); + + if( is_1d && !(flags & DFT_ROWS) ) + cols = cols + rows - 1, rows = 1; + + int ncols = cols*cn; + int j0 = cn == 1; + int j1 = ncols - (cols % 2 == 0 && cn == 1); + + if( depth == CV_32F ) + { + const float* dataA = (const float*)srcA.data; + const float* dataB = (const float*)srcB.data; float* dataC = (float*)dst.data; float eps = FLT_EPSILON; // prevent div0 problems - - for(int j = 0; j < length - 1; j += 2) + + size_t stepA = srcA.step/sizeof(dataA[0]); + size_t stepB = srcB.step/sizeof(dataB[0]); + size_t stepC = dst.step/sizeof(dataC[0]); + + if( !is_1d && cn == 1 ) { - double denom = (double)(dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps); - double re = (double)(dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1]); - double im = (double)(dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1]); - dataC[j] = (float)(re / denom); - dataC[j+1] = (float)(im / denom); + for( k = 0; k < (cols % 2 ? 1 : 2); k++ ) + { + if( k == 1 ) + dataA += cols - 1, dataB += cols - 1, dataC += cols - 1; + dataC[0] = dataA[0] / dataB[0]; + if( rows % 2 == 0 ) + dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB]; + if( !conjB ) + for( j = 1; j <= rows - 2; j += 2 ) + { + double denom = (double)dataB[j*stepB]*dataB[j*stepB] + + (double)dataB[(j+1)*stepB]*dataB[(j+1)*stepB] + (double)eps; + + double re = (double)dataA[j*stepA]*dataB[j*stepB] + + (double)dataA[(j+1)*stepA]*dataB[(j+1)*stepB]; + + double im = (double)dataA[(j+1)*stepA]*dataB[j*stepB] - + (double)dataA[j*stepA]*dataB[(j+1)*stepB]; + + dataC[j*stepC] = (float)(re / denom); + dataC[(j+1)*stepC] = (float)(im / denom); + } + else + for( j = 1; j <= rows - 2; j += 2 ) + { + + double denom = (double)dataB[j*stepB]*dataB[j*stepB] + + (double)dataB[(j+1)*stepB]*dataB[(j+1)*stepB] + (double)eps; + + double re = (double)dataA[j*stepA]*dataB[j*stepB] - + (double)dataA[(j+1)*stepA]*dataB[(j+1)*stepB]; + + double im = (double)dataA[(j+1)*stepA]*dataB[j*stepB] + + (double)dataA[j*stepA]*dataB[(j+1)*stepB]; + + dataC[j*stepC] = (float)(re / denom); + dataC[(j+1)*stepC] = (float)(im / denom); + } + if( k == 1 ) + dataA -= cols - 1, dataB -= cols - 1, dataC -= cols - 1; + } + } + + for( ; rows--; dataA += stepA, dataB += stepB, dataC += stepC ) + { + if( is_1d && cn == 1 ) + { + dataC[0] = dataA[0] / dataB[0]; + if( cols % 2 == 0 ) + dataC[j1] = dataA[j1] / dataB[j1]; + } + + if( !conjB ) + for( j = j0; j < j1; j += 2 ) + { + double denom = (double)(dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps); + double re = (double)(dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1]); + double im = (double)(dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1]); + dataC[j] = (float)(re / denom); + dataC[j+1] = (float)(im / denom); + } + else + for( j = j0; j < j1; j += 2 ) + { + double denom = (double)(dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps); + double re = (double)(dataA[j]*dataB[j] - dataA[j+1]*dataB[j+1]); + double im = (double)(dataA[j+1]*dataB[j] + dataA[j]*dataB[j+1]); + dataC[j] = (float)(re / denom); + dataC[j+1] = (float)(im / denom); + } } } else { - const double* dataA = (const double*)src1.data; - const double* dataB = (const double*)src2.data; + const double* dataA = (const double*)srcA.data; + const double* dataB = (const double*)srcB.data; double* dataC = (double*)dst.data; double eps = DBL_EPSILON; // prevent div0 problems - - for(int j = 0; j < length - 1; j += 2) + + size_t stepA = srcA.step/sizeof(dataA[0]); + size_t stepB = srcB.step/sizeof(dataB[0]); + size_t stepC = dst.step/sizeof(dataC[0]); + + if( !is_1d && cn == 1 ) + { + for( k = 0; k < (cols % 2 ? 1 : 2); k++ ) + { + if( k == 1 ) + dataA += cols - 1, dataB += cols - 1, dataC += cols - 1; + dataC[0] = dataA[0] / dataB[0]; + if( rows % 2 == 0 ) + dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB]; + if( !conjB ) + for( j = 1; j <= rows - 2; j += 2 ) + { + double denom = dataB[j*stepB]*dataB[j*stepB] + + dataB[(j+1)*stepB]*dataB[(j+1)*stepB] + eps; + + double re = dataA[j*stepA]*dataB[j*stepB] + + dataA[(j+1)*stepA]*dataB[(j+1)*stepB]; + + double im = dataA[(j+1)*stepA]*dataB[j*stepB] - + dataA[j*stepA]*dataB[(j+1)*stepB]; + + dataC[j*stepC] = re / denom; + dataC[(j+1)*stepC] = im / denom; + } + else + for( j = 1; j <= rows - 2; j += 2 ) + { + double denom = dataB[j*stepB]*dataB[j*stepB] + + dataB[(j+1)*stepB]*dataB[(j+1)*stepB] + eps; + + double re = dataA[j*stepA]*dataB[j*stepB] - + dataA[(j+1)*stepA]*dataB[(j+1)*stepB]; + + double im = dataA[(j+1)*stepA]*dataB[j*stepB] + + dataA[j*stepA]*dataB[(j+1)*stepB]; + + dataC[j*stepC] = re / denom; + dataC[(j+1)*stepC] = im / denom; + } + if( k == 1 ) + dataA -= cols - 1, dataB -= cols - 1, dataC -= cols - 1; + } + } + + for( ; rows--; dataA += stepA, dataB += stepB, dataC += stepC ) { - double denom = dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps; - double re = dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1]; - double im = dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1]; - dataC[j] = re / denom; - dataC[j+1] = im / denom; + if( is_1d && cn == 1 ) + { + dataC[0] = dataA[0] / dataB[0]; + if( cols % 2 == 0 ) + dataC[j1] = dataA[j1] / dataB[j1]; + } + + if( !conjB ) + for( j = j0; j < j1; j += 2 ) + { + double denom = dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps; + double re = dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1]; + double im = dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1]; + dataC[j] = re / denom; + dataC[j+1] = im / denom; + } + else + for( j = j0; j < j1; j += 2 ) + { + double denom = dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps; + double re = dataA[j]*dataB[j] - dataA[j+1]*dataB[j+1]; + double im = dataA[j+1]*dataB[j] + dataA[j]*dataB[j+1]; + dataC[j] = re / denom; + dataC[j+1] = im / denom; + } } } -} -static void absComplex(InputArray _src, OutputArray _dst) -{ - Mat src = _src.getMat(); - - CV_Assert( src.type() == CV_32FC2 || src.type() == CV_64FC2 ); - - vector planes; - split(src, planes); - - magnitude(planes[0], planes[1], planes[0]); - planes[1] = Mat::zeros(planes[0].size(), planes[0].type()); - - merge(planes, _dst); } static void fftShift(InputOutputArray _out) @@ -258,33 +509,18 @@ cv::Point2d cv::phaseCorrelate(InputArray _src1, InputArray _src2, InputArray _w multiply(paddedWin, padded2, padded2); } - // TODO should be able to improve speed by switching to CCS packed matrices - vector cplx1, cplx2; - cplx1.push_back(padded1); - cplx1.push_back(Mat::zeros(padded1.size(), padded1.type())); - merge(cplx1, FFT1); - - cplx2.push_back(padded2); - cplx2.push_back(Mat::zeros(padded2.size(), padded2.type())); - merge(cplx2, FFT2); - // execute phase correlation equation // Reference: http://en.wikipedia.org/wiki/Phase_correlation - dft(FFT1, FFT1); - dft(FFT2, FFT2); + dft(padded1, FFT1, DFT_REAL_OUTPUT); + dft(padded2, FFT2, DFT_REAL_OUTPUT); mulSpectrums(FFT1, FFT2, P, 0, true); - // TODO these two functions need to be changed to work with CCS packed matrices... - absComplex(P, Pm); - divComplex(P, Pm, C); // FF* / |FF*| (phase correlation equation completed here...) + magSpectrums(P, Pm); + divSpectrums(P, Pm, C, 0, false); // FF* / |FF*| (phase correlation equation completed here...) idft(C, C); // gives us the nice peak shift location... - vector Cplanes; - split(C, Cplanes); - C = Cplanes[0]; // use only the real plane since that's all that's left... - fftShift(C); // shift the energy to the center of the frame. // locate the highest peak -- 2.7.4