From ec5bdc7de868c99d0203278e00701c1b9372c525 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 5 Mar 2012 10:36:57 +0000 Subject: [PATCH] added patch error calculation to gpu::PyrLKOpticalFlow --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 + modules/gpu/src/cuda/pyrlk.cu | 157 ++++++++++++++++++++++++++------ modules/gpu/src/pyrlk.cpp | 8 +- modules/gpu/test/test_video.cpp | 4 +- samples/gpu/performance/tests.cpp | 10 +- 5 files changed, 143 insertions(+), 38 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index b389d6f..cc6d607 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1817,6 +1817,7 @@ public: derivLambda = 0.5; useInitialFlow = false; minEigThreshold = 1e-4f; + getMinEigenVals = false; } void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, @@ -1830,6 +1831,7 @@ public: double derivLambda; bool useInitialFlow; float minEigThreshold; + bool getMinEigenVals; void releaseMemory() { diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu index 32dabb1..e41e4ca 100644 --- a/modules/gpu/src/cuda/pyrlk.cu +++ b/modules/gpu/src/cuda/pyrlk.cu @@ -274,9 +274,39 @@ namespace cv { namespace gpu { namespace device } } + __device__ void reduce(float& val1, float* smem1, int tid) + { + smem1[tid] = val1; + __syncthreads(); + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + } + __syncthreads(); + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + } + __syncthreads(); + + if (tid < 32) + { + volatile float* vmem1 = smem1; + + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 1]; + } + } + #define SCALE (1.0f / (1 << 20)) - template + template __global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep dIdx, const PtrStep dIdy, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) { @@ -349,7 +379,7 @@ namespace cv { namespace gpu { namespace device float D = A11 * A22 - A12 * A12; float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y); - if (calcErr && tid == 0) + if (calcErr && GET_MIN_EIGENVALS && tid == 0) err[blockIdx.x] = minEig; if (minEig < c_minEigThreshold || D < numeric_limits::epsilon()) @@ -377,7 +407,7 @@ namespace cv { namespace gpu { namespace device bool status_ = true; for (int k = 0; k < c_iters; ++k) - { + { if (nextPt.x < -c_winSize_x || nextPt.x >= cols || nextPt.y < -c_winSize_y || nextPt.y >= rows) { status_ = false; @@ -415,38 +445,76 @@ namespace cv { namespace gpu { namespace device nextPt.y += delta.y; if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) + { + nextPt.x -= delta.x * 0.5f; + nextPt.y -= delta.y * 0.5f; break; + } } - if (tid == 0) + if (nextPt.x < -c_winSize_x || nextPt.x >= cols || nextPt.y < -c_winSize_y || nextPt.y >= rows) + status_ = false; + + // TODO : Why do we compute patch error in shifted window? + nextPt.x += c_halfWin_x; + nextPt.y += c_halfWin_y; + + float errval = 0.f; + if (calcErr && !GET_MIN_EIGENVALS && status_) { - nextPt.x += c_halfWin_x; - nextPt.y += c_halfWin_y; + for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) + { + for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j) + { + int diff = linearFilter(J, nextPt, x, y) - I_patch[i][j]; + errval += ::fabsf((float)diff); + } + } - nextPts[blockIdx.x] = nextPt; + reduce(errval, smem1, tid); + + errval /= 32 * c_winSize_x_cn * c_winSize_y; + } + + if (tid == 0) + { status[blockIdx.x] = status_; + nextPts[blockIdx.x] = nextPt; + + if (calcErr && !GET_MIN_EIGENVALS) + err[blockIdx.x] = errval; } } template void lkSparse_caller(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, cudaStream_t stream) { dim3 grid(ptcount); - if (err) + if (level == 0 && err) { - cudaSafeCall( cudaFuncSetCacheConfig(lkSparse, cudaFuncCachePreferL1) ); + if (GET_MIN_EIGENVALS) + { + cudaSafeCall( cudaFuncSetCacheConfig(lkSparse, cudaFuncCachePreferL1) ); + + lkSparse<<>>(I, J, dIdx, dIdy, + prevPts, nextPts, status, err, level, I.rows, I.cols); + } + else + { + cudaSafeCall( cudaFuncSetCacheConfig(lkSparse, cudaFuncCachePreferL1) ); - lkSparse<<>>(I, J, dIdx, dIdy, - prevPts, nextPts, status, err, level, I.rows, I.cols); + lkSparse<<>>(I, J, dIdx, dIdy, + prevPts, nextPts, status, err, level, I.rows, I.cols); + } } else { - cudaSafeCall( cudaFuncSetCacheConfig(lkSparse, cudaFuncCachePreferL1) ); + cudaSafeCall( cudaFuncSetCacheConfig(lkSparse, cudaFuncCachePreferL1) ); - lkSparse<<>>(I, J, dIdx, dIdy, + lkSparse<<>>(I, J, dIdx, dIdy, prevPts, nextPts, status, err, level, I.rows, I.cols); } @@ -457,11 +525,11 @@ namespace cv { namespace gpu { namespace device } void lkSparse_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, dim3 patch, cudaStream_t stream) { typedef void (*func_t)(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, cudaStream_t stream); static const func_t funcs[5][5] = @@ -474,11 +542,11 @@ namespace cv { namespace gpu { namespace device }; funcs[patch.y - 1][patch.x - 1](I, J, dIdx, dIdy, - prevPts, nextPts, status, err, ptcount, + prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount, level, block, stream); } - template + template __global__ void lkDense(const PtrStepb I, const PtrStepb J, const PtrStep dIdx, const PtrStep dIdy, PtrStepf u, PtrStepf v, PtrStepf err, const int rows, const int cols) { @@ -515,7 +583,7 @@ namespace cv { namespace gpu { namespace device float D = A11 * A22 - A12 * A12; float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y); - if (calcErr) + if (calcErr && GET_MIN_EIGENVALS) err(y, x) = minEig; if (minEig < c_minEigThreshold || D < numeric_limits::epsilon()) @@ -565,30 +633,63 @@ namespace cv { namespace gpu { namespace device if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) break; - } + } + + // TODO : Why do we compute patch error in shifted window? + nextPt.x += c_halfWin_x; + nextPt.y += c_halfWin_y; - u(y, x) = nextPt.x - x + c_halfWin_x; - v(y, x) = nextPt.y - y + c_halfWin_y; + u(y, x) = nextPt.x - x; + v(y, x) = nextPt.y - y; + + if (calcErr && !GET_MIN_EIGENVALS) + { + float errval = 0.0f; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j); + int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5); + errval += ::fabsf((float)diff); + } + } + + errval /= 32 * c_winSize_x_cn * c_winSize_y; + + err(y, x) = errval; + } } void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - DevMem2Df u, DevMem2Df v, DevMem2Df* err, cudaStream_t stream) + DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream) { dim3 block(32, 8); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); if (err) { - cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); + if (GET_MIN_EIGENVALS) + { + cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); - lkDense<<>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols); - cudaSafeCall( cudaGetLastError() ); + lkDense<<>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols); + cudaSafeCall( cudaGetLastError() ); + } + else + { + cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); + + lkDense<<>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols); + cudaSafeCall( cudaGetLastError() ); + } } else { - cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); + cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); - lkDense<<>>(I, J, dIdx, dIdy, u, v, PtrStepf(), I.rows, I.cols); + lkDense<<>>(I, J, dIdx, dIdy, u, v, PtrStepf(), I.rows, I.cols); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/pyrlk.cpp b/modules/gpu/src/pyrlk.cpp index 37427c5..83fc1f7 100644 --- a/modules/gpu/src/pyrlk.cpp +++ b/modules/gpu/src/pyrlk.cpp @@ -63,11 +63,11 @@ namespace cv { namespace gpu { namespace device cudaStream_t stream = 0); void lkSparse_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, dim3 patch, cudaStream_t stream = 0); void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - DevMem2Df u, DevMem2Df v, DevMem2Df* err, cudaStream_t stream = 0); + DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream = 0); } }}} @@ -205,7 +205,7 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next calcSharrDeriv(prevPyr_[level], dIdx, dIdy); lkSparse_gpu(prevPyr_[level], nextPyr_[level], dIdx, dIdy, - prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? err->ptr() : 0, prevPts.cols, + prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? err->ptr() : 0, getMinEigenVals, prevPts.cols, level, block, patch); } } @@ -272,7 +272,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI calcSharrDeriv(prevPyr_[level], dIdx, dIdy); lkDense_gpu(prevPyr_[level], nextPyr_[level], dIdx, dIdy, uPyr_[level], vPyr_[level], - level == 0 && err ? &derr : 0); + level == 0 && err ? &derr : 0, getMinEigenVals); if (level == 0) { diff --git a/modules/gpu/test/test_video.cpp b/modules/gpu/test/test_video.cpp index a63c4fd..fa98d1e 100644 --- a/modules/gpu/test/test_video.cpp +++ b/modules/gpu/test/test_video.cpp @@ -358,7 +358,7 @@ PARAM_TEST_CASE(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, bool) cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0); cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, err_gold, cv::Size(21, 21), 3, - cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01), 0.5, CV_LKFLOW_GET_MIN_EIGENVALS); + cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01), 0.5); } }; @@ -410,7 +410,7 @@ TEST_P(PyrLKOpticalFlowSparse, Accuracy) bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1; float errdiff = std::abs(err[i] - err_gold[i]); - if (!eq || errdiff > 1e-4) + if (!eq || errdiff > 1e-1) ++mistmatch; } } diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index cb51abd..d89b7d1 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -1157,10 +1157,12 @@ TEST(PyrLKOpticalFlow) vector nextPts; vector status; - calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, noArray()); + vector err; + + calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err); CPU_ON; - calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, noArray()); + calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err); CPU_OFF; gpu::PyrLKOpticalFlow d_pyrLK; @@ -1176,10 +1178,10 @@ TEST(PyrLKOpticalFlow) gpu::GpuMat d_status; gpu::GpuMat d_err; - d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status); + d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err); GPU_ON; - d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status); + d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err); GPU_OFF; } } -- 2.7.4