From 5c19c6cb6725eb88c6b505ca4a65611d63347c56 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 27 Jun 2012 09:58:33 +0000 Subject: [PATCH] Merged revision(s) 8664 from trunk: new implementation of gpu::PyrLKOpticalFlow::dense (1.5 - 2x faster) ........ --- modules/gpu/perf/perf_video.cpp | 27 ++- modules/gpu/src/cuda/pyrlk.cu | 355 ++++++++++++++++++++----------------- modules/gpu/src/pyrlk.cpp | 94 ++++------ samples/gpu/pyrlk_optical_flow.cpp | 3 - 4 files changed, 247 insertions(+), 232 deletions(-) diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index ff80aab..9098247 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -145,6 +145,8 @@ INSTANTIATE_TEST_CASE_P(Video, GoodFeaturesToTrack, testing::Combine(ALL_DEVICES ////////////////////////////////////////////////////// // PyrLKOpticalFlowSparse +IMPLEMENT_PARAM_CLASS(WinSize, int) + GPU_PERF_TEST(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, bool, int, int) { cv::gpu::DeviceInfo devInfo = GET_PARAM(0); @@ -196,12 +198,19 @@ INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, testing::Combine ////////////////////////////////////////////////////// // PyrLKOpticalFlowDense -GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo) +IMPLEMENT_PARAM_CLASS(Levels, int) +IMPLEMENT_PARAM_CLASS(Iters, int) + +GPU_PERF_TEST(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo, WinSize, Levels, Iters) { - cv::gpu::DeviceInfo devInfo = GetParam(); + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); cv::gpu::setDevice(devInfo.deviceID()); + int winSize = GET_PARAM(1); + int levels = GET_PARAM(2); + int iters = GET_PARAM(3); + cv::Mat frame0_host = readImage("gpu/opticalflow/frame0.png", cv::IMREAD_GRAYSCALE); cv::Mat frame1_host = readImage("gpu/opticalflow/frame1.png", cv::IMREAD_GRAYSCALE); @@ -215,7 +224,13 @@ GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo) cv::gpu::PyrLKOpticalFlow pyrLK; - declare.time(10); + pyrLK.winSize = cv::Size(winSize, winSize); + pyrLK.maxLevel = levels - 1; + pyrLK.iters = iters; + + pyrLK.dense(frame0, frame1, u, v); + + declare.time(30); TEST_CYCLE() { @@ -223,7 +238,11 @@ GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo) } } -INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowDense, ALL_DEVICES); +INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowDense, testing::Combine( + ALL_DEVICES, + testing::Values(WinSize(3), WinSize(5), WinSize(7), WinSize(9), WinSize(13), WinSize(17), WinSize(21)), + testing::Values(Levels(1), Levels(2), Levels(3)), + testing::Values(Iters(1), Iters(10)))); ////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu index 22fbc44..12dfab6 100644 --- a/modules/gpu/src/cuda/pyrlk.cu +++ b/modules/gpu/src/cuda/pyrlk.cu @@ -40,7 +40,7 @@ // // Copyright (c) 2010, Paul Furgale, Chi Hay Tong // -// The original code was written by Paul Furgale and Chi Hay Tong +// The original code was written by Paul Furgale and Chi Hay Tong // and later optimized and prepared for integration into OpenCV by Itseez. // //M*/ @@ -50,9 +50,9 @@ #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/limits.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - namespace pyrlk + namespace pyrlk { __constant__ int c_cn; __constant__ float c_minEigThreshold; @@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device void loadConstants(int cn, float minEigThreshold, int2 winSize, int iters) { - int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); + int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); cudaSafeCall( cudaMemcpyToSymbol(c_cn, &cn, sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(c_minEigThreshold, &minEigThreshold, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) ); @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device const uchar src_val0 = src(y > 0 ? y - 1 : 1, x); const uchar src_val1 = src(y, x); const uchar src_val2 = src(y < rows - 1 ? y + 1 : rows - 2, x); - + dx_buf(y, x) = (src_val0 + src_val2) * 3 + src_val1 * 10; dy_buf(y, x) = src_val2 - src_val0; } @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device } } - void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_ dx_buf, DevMem2D_ dy_buf, DevMem2D_ dIdx, DevMem2D_ dIdy, int cn, + void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_ dx_buf, DevMem2D_ dy_buf, DevMem2D_ dIdx, DevMem2D_ dIdy, int cn, cudaStream_t stream) { dim3 block(32, 8); @@ -182,21 +182,21 @@ namespace cv { namespace gpu { namespace device __syncthreads(); #if __CUDA_ARCH__ > 110 - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - smem3[tid] = val3 += smem3[tid + 128]; - } + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } __syncthreads(); #endif - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; smem3[tid] = val3 += smem3[tid + 64]; - } + } __syncthreads(); if (tid < 32) @@ -205,28 +205,28 @@ namespace cv { namespace gpu { namespace device volatile float* vmem2 = smem2; volatile float* vmem3 = smem3; - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem2[tid] = val2 += vmem2[tid + 32]; + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem2[tid] = val2 += vmem2[tid + 32]; vmem3[tid] = val3 += vmem3[tid + 32]; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem2[tid] = val2 += vmem2[tid + 16]; vmem3[tid] = val3 += vmem3[tid + 16]; - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; vmem3[tid] = val3 += vmem3[tid + 8]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; vmem3[tid] = val3 += vmem3[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; vmem3[tid] = val3 += vmem3[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; vmem3[tid] = val3 += vmem3[tid + 1]; } } @@ -238,19 +238,19 @@ namespace cv { namespace gpu { namespace device __syncthreads(); #if __CUDA_ARCH__ > 110 - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - } + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + } __syncthreads(); #endif - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - } + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + } __syncthreads(); if (tid < 32) @@ -258,23 +258,23 @@ namespace cv { namespace gpu { namespace device volatile float* vmem1 = smem1; volatile float* vmem2 = smem2; - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem2[tid] = val2 += vmem2[tid + 32]; + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem2[tid] = val2 += vmem2[tid + 32]; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem2[tid] = val2 += vmem2[tid + 16]; - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; } } @@ -284,29 +284,29 @@ namespace cv { namespace gpu { namespace device __syncthreads(); #if __CUDA_ARCH__ > 110 - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - } + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + } __syncthreads(); #endif - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - } + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + } __syncthreads(); if (tid < 32) { volatile float* vmem1 = smem1; - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem1[tid] = val1 += vmem1[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem1[tid] = val1 += vmem1[tid + 8]; vmem1[tid] = val1 += vmem1[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 1]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 1]; } } @@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device { status[blockIdx.x] = 0; - if (calcErr) + if (calcErr) err[blockIdx.x] = 0; } @@ -349,7 +349,7 @@ namespace cv { namespace gpu { namespace device } // extract the patch from the first image, compute covariation matrix of derivatives - + float A11 = 0; float A12 = 0; float A22 = 0; @@ -359,7 +359,7 @@ namespace cv { namespace gpu { namespace device int dIdy_patch[PATCH_Y][PATCH_X]; for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) - { + { for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j) { I_patch[i][j] = linearFilter(I, prevPt, x, y); @@ -369,7 +369,7 @@ namespace cv { namespace gpu { namespace device dIdx_patch[i][j] = ixval; dIdy_patch[i][j] = iyval; - + A11 += ixval * ixval; A12 += ixval * iyval; A22 += iyval * iyval; @@ -382,7 +382,7 @@ namespace cv { namespace gpu { namespace device A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; - + A11 *= SCALE; A12 *= SCALE; A22 *= SCALE; @@ -390,8 +390,8 @@ namespace cv { namespace gpu { namespace device { float D = A11 * A22 - A12 * A12; float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y); - - if (calcErr && GET_MIN_EIGENVALS && tid == 0) + + if (calcErr && GET_MIN_EIGENVALS && tid == 0) err[blockIdx.x] = minEig; if (minEig < c_minEigThreshold || D < numeric_limits::epsilon()) @@ -403,7 +403,7 @@ namespace cv { namespace gpu { namespace device } D = 1.f / D; - + A11 *= D; A12 *= D; A22 *= D; @@ -411,8 +411,8 @@ namespace cv { namespace gpu { namespace device float2 nextPt = nextPts[blockIdx.x]; nextPt.x *= 2.f; - nextPt.y *= 2.f; - + nextPt.y *= 2.f; + nextPt.x -= c_halfWin_x; nextPt.y -= c_halfWin_y; @@ -428,7 +428,7 @@ namespace cv { namespace gpu { namespace device float b1 = 0; float b2 = 0; - + for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) { for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j) @@ -439,7 +439,7 @@ namespace cv { namespace gpu { namespace device b2 += diff * dIdy_patch[i][j]; } } - + reduce(b1, b2, smem1, smem2, tid); __syncthreads(); @@ -448,11 +448,11 @@ namespace cv { namespace gpu { namespace device b1 *= SCALE; b2 *= SCALE; - + float2 delta; delta.x = A12 * b2 - A22 * b1; delta.y = A12 * b1 - A11 * b2; - + nextPt.x += delta.x; nextPt.y += delta.y; @@ -495,7 +495,7 @@ namespace cv { namespace gpu { namespace device template void lkSparse_caller(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, cudaStream_t stream) { dim3 grid(ptcount); @@ -532,109 +532,147 @@ namespace cv { namespace gpu { namespace device } void lkSparse_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, dim3 patch, cudaStream_t stream) { typedef void (*func_t)(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, + const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, cudaStream_t stream); - static const func_t funcs[5][5] = + static const func_t funcs[5][5] = { {lkSparse_caller<1, 1>, lkSparse_caller<2, 1>, lkSparse_caller<3, 1>, lkSparse_caller<4, 1>, lkSparse_caller<5, 1>}, {lkSparse_caller<1, 2>, lkSparse_caller<2, 2>, lkSparse_caller<3, 2>, lkSparse_caller<4, 2>, lkSparse_caller<5, 2>}, {lkSparse_caller<1, 3>, lkSparse_caller<2, 3>, lkSparse_caller<3, 3>, lkSparse_caller<4, 3>, lkSparse_caller<5, 3>}, {lkSparse_caller<1, 4>, lkSparse_caller<2, 4>, lkSparse_caller<3, 4>, lkSparse_caller<4, 4>, lkSparse_caller<5, 4>}, {lkSparse_caller<1, 5>, lkSparse_caller<2, 5>, lkSparse_caller<3, 5>, lkSparse_caller<4, 5>, lkSparse_caller<5, 5>} - }; + }; funcs[patch.y - 1][patch.x - 1](I, J, dIdx, dIdy, - prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount, + prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount, level, block, stream); } - template - __global__ void lkDense(const PtrStepb I, const PtrStepb J, const PtrStep dIdx, const PtrStep dIdy, - PtrStepf u, PtrStepf v, PtrStepf err, const int rows, const int cols) + texture tex_I(false, cudaFilterModePoint, cudaAddressModeClamp); + texture tex_J(false, cudaFilterModeLinear, cudaAddressModeClamp); + + template + __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + extern __shared__ int smem[]; + + const int patchWidth = blockDim.x + 2 * c_halfWin_x; + const int patchHeight = blockDim.y + 2 * c_halfWin_y; + + int* I_patch = smem; + int* dIdx_patch = I_patch + patchWidth * patchHeight; + int* dIdy_patch = dIdx_patch + patchWidth * patchHeight; + + const int xBase = blockIdx.x * blockDim.x; + const int yBase = blockIdx.y * blockDim.y; + + for (int i = threadIdx.y; i < patchHeight; i += blockDim.y) + { + for (int j = threadIdx.x; j < patchWidth; j += blockDim.x) + { + float x = xBase - c_halfWin_x + j + 0.5f; + float y = yBase - c_halfWin_y + i + 0.5f; + + I_patch[i * patchWidth + j] = tex2D(tex_I, x, y); + + // Sharr Deriv + + dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x+1, y-1) + 10 * tex2D(tex_I, x+1, y) + 3 * tex2D(tex_I, x+1, y+1) - + (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x-1, y) + 3 * tex2D(tex_I, x-1, y+1)); + + dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x-1, y+1) + 10 * tex2D(tex_I, x, y+1) + 3 * tex2D(tex_I, x+1, y+1) - + (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x, y-1) + 3 * tex2D(tex_I, x+1, y-1)); + } + } + + __syncthreads(); + + const int x = xBase + threadIdx.x; + const int y = yBase + threadIdx.y; if (x >= cols || y >= rows) return; - // extract the patch from the first image, compute covariation matrix of derivatives - - float A11 = 0; - float A12 = 0; - float A22 = 0; + int A11i = 0; + int A12i = 0; + int A22i = 0; for (int i = 0; i < c_winSize_y; ++i) - { + { for (int j = 0; j < c_winSize_x; ++j) { - int ixval = dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j); - int iyval = dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j); + int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; - A11 += ixval * ixval; - A12 += ixval * iyval; - A22 += iyval * iyval; + A11i += dIdx * dIdx; + A12i += dIdx * dIdy; + A22i += dIdy * dIdy; } } - - A11 *= SCALE; - A12 *= SCALE; - A22 *= SCALE; - { - float D = A11 * A22 - A12 * A12; - float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y); + float A11 = A11i; + float A12 = A12i; + float A22 = A22i; - if (calcErr && GET_MIN_EIGENVALS) - err(y, x) = minEig; - - if (minEig < c_minEigThreshold || D < numeric_limits::epsilon()) - return; + float D = A11 * A22 - A12 * A12; - D = 1.f / D; - - A11 *= D; - A12 *= D; - A22 *= D; + if (D < numeric_limits::epsilon()) + { + if (calcErr) + err(y, x) = numeric_limits::max(); + + return; } + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + float2 nextPt; - nextPt.x = x - c_halfWin_x + u(y, x); - nextPt.y = y - c_halfWin_y + v(y, x); + nextPt.x = x + prevU(y/2, x/2) * 2.0f; + nextPt.y = y + prevV(y/2, x/2) * 2.0f; for (int k = 0; k < c_iters; ++k) { - if (nextPt.x < -c_winSize_x || nextPt.x >= cols || nextPt.y < -c_winSize_y || nextPt.y >= rows) + if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) + { + if (calcErr) + err(y, x) = numeric_limits::max(); + return; + } + + int b1 = 0; + int b2 = 0; - float b1 = 0; - float b2 = 0; - for (int i = 0; i < c_winSize_y; ++i) { for (int j = 0; j < c_winSize_x; ++j) { - int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j); + int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; + int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); - int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5); - - b1 += diff * dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j); - b2 += diff * dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j); + int diff = (J - I) * 32; + + int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + + b1 += diff * dIdx; + b2 += diff * dIdy; } } - b1 *= SCALE; - b2 *= SCALE; - float2 delta; delta.x = A12 * b2 - A22 * b1; delta.y = A12 * b1 - A11 * b2; - + nextPt.x += delta.x; nextPt.y += delta.y; @@ -642,57 +680,50 @@ namespace cv { namespace gpu { namespace device break; } - u(y, x) = nextPt.x - x + c_halfWin_x; - v(y, x) = nextPt.y - y + c_halfWin_y; + u(y, x) = nextPt.x - x; + v(y, x) = nextPt.y - y; - if (calcErr && !GET_MIN_EIGENVALS) + if (calcErr) { - float errval = 0.0f; + int errval = 0; for (int i = 0; i < c_winSize_y; ++i) { for (int j = 0; j < c_winSize_x; ++j) { - int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j); - int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5); - errval += ::fabsf((float)diff); + int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; + int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); + + errval += ::abs(J - I); } } - errval /= 32 * c_winSize_x_cn * c_winSize_y; - - err(y, x) = errval; + err(y, x) = static_cast(errval) / (c_winSize_x * c_winSize_y); } } - void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream) + void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV, + DevMem2Df err, int2 winSize, cudaStream_t stream) { - dim3 block(32, 8); + dim3 block(16, 16); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); - if (err) - { - if (GET_MIN_EIGENVALS) - { - cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); + bindTexture(&tex_I, I); + bindTexture(&tex_J, J); - lkDense<<>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols); - cudaSafeCall( cudaGetLastError() ); - } - else - { - cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); + int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); + const int patchWidth = block.x + 2 * halfWin.x; + const int patchHeight = block.y + 2 * halfWin.y; + size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); - lkDense<<>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols); - cudaSafeCall( cudaGetLastError() ); - } + if (err.data) + { + lkDense<<>>(u, v, prevU, prevV, err, I.rows, I.cols); + cudaSafeCall( cudaGetLastError() ); } else { - cudaSafeCall( cudaFuncSetCacheConfig(lkDense, cudaFuncCachePreferL1) ); - - lkDense<<>>(I, J, dIdx, dIdy, u, v, PtrStepf(), I.rows, I.cols); + lkDense<<>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/pyrlk.cpp b/modules/gpu/src/pyrlk.cpp index d85076b..adb630c 100644 --- a/modules/gpu/src/pyrlk.cpp +++ b/modules/gpu/src/pyrlk.cpp @@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace device const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, int level, dim3 block, dim3 patch, cudaStream_t stream = 0); - void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_ dIdx, DevMem2D_ dIdy, - DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream = 0); + void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV, + DevMem2Df err, int2 winSize, cudaStream_t stream = 0); } }}} @@ -160,16 +160,11 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next return; } - derivLambda = std::min(std::max(derivLambda, 0.0), 1.0); - - iters = std::min(std::max(iters, 0), 100); - const int cn = prevImg.channels(); dim3 block, patch; - calcPatchSize(winSize, cn, block, patch, isDeviceArch11_); + calcPatchSize(winSize, cn, block, patch, isDeviceArch11_); - CV_Assert(derivLambda >= 0); CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2); CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); @@ -227,80 +222,53 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI { using namespace cv::gpu::device::pyrlk; - derivLambda = std::min(std::max(derivLambda, 0.0), 1.0); - - iters = std::min(std::max(iters, 0), 100); - CV_Assert(prevImg.type() == CV_8UC1); CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); - CV_Assert(derivLambda >= 0); - CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2); - - if (useInitialFlow) - { - CV_Assert(u.size() == prevImg.size() && u.type() == CV_32FC1); - CV_Assert(v.size() == prevImg.size() && v.type() == CV_32FC1); - } - else - { - u.create(prevImg.size(), CV_32FC1); - v.create(prevImg.size(), CV_32FC1); - - u.setTo(Scalar::all(0)); - v.setTo(Scalar::all(0)); - } + CV_Assert(maxLevel >= 0); + CV_Assert(winSize.width > 2 && winSize.height > 2); if (err) err->create(prevImg.size(), CV_32FC1); // build the image pyramids. - // we pad each level with +/-winSize.{width|height} - // pixels to simplify the further patch extraction. - buildImagePyramid(prevImg, prevPyr_, true); - buildImagePyramid(nextImg, nextPyr_, true); - buildImagePyramid(u, uPyr_, false); - buildImagePyramid(v, vPyr_, false); + buildImagePyramid(prevImg, prevPyr_, false); - // dI/dx ~ Ix, dI/dy ~ Iy + nextPyr_.resize(maxLevel + 1); + nextImg.convertTo(nextPyr_[0], CV_32F); + for (int level = 1; level <= maxLevel; ++level) + pyrDown(nextPyr_[level - 1], nextPyr_[level]); + + uPyr_.resize(2); + vPyr_.resize(2); - ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dx_buf_); - ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dy_buf_); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); + uPyr_[1].setTo(Scalar::all(0)); + vPyr_[1].setTo(Scalar::all(0)); - loadConstants(1, minEigThreshold, make_int2(winSize.width, winSize.height), iters); + int2 winSize2i = make_int2(winSize.width, winSize.height); + loadConstants(1, minEigThreshold, winSize2i, iters); DevMem2Df derr = err ? *err : DevMem2Df(); + int idx = 0; + for (int level = maxLevel; level >= 0; level--) { - Size imgSize = prevPyr_[level].size(); - - GpuMat dxWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dx_buf_.type(), dx_buf_.data, dx_buf_.step); - GpuMat dyWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dy_buf_.type(), dy_buf_.data, dy_buf_.step); - dxWhole.setTo(Scalar::all(0)); - dyWhole.setTo(Scalar::all(0)); - GpuMat dIdx = dxWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height)); - GpuMat dIdy = dyWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height)); + int idx2 = (idx + 1) & 1; - calcSharrDeriv(prevPyr_[level], dIdx, dIdy); + lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2], + level == 0 ? derr : DevMem2Df(), winSize2i); - lkDense_gpu(prevPyr_[level], nextPyr_[level], dIdx, dIdy, uPyr_[level], vPyr_[level], - level == 0 && err ? &derr : 0, getMinEigenVals); - - if (level == 0) - { - uPyr_[0].copyTo(u); - vPyr_[0].copyTo(v); - } - else - { - resize(uPyr_[level], uPyr_[level - 1], uPyr_[level - 1].size()); - resize(vPyr_[level], vPyr_[level - 1], vPyr_[level - 1].size()); - - multiply(uPyr_[level - 1], Scalar::all(2), uPyr_[level - 1]); - multiply(vPyr_[level - 1], Scalar::all(2), vPyr_[level - 1]); - } + if (level > 0) + idx = idx2; } + + uPyr_[idx].copyTo(u); + vPyr_[idx].copyTo(v); } #endif /* !defined (HAVE_CUDA) */ diff --git a/samples/gpu/pyrlk_optical_flow.cpp b/samples/gpu/pyrlk_optical_flow.cpp index 56e3ce0..5bff8f9 100644 --- a/samples/gpu/pyrlk_optical_flow.cpp +++ b/samples/gpu/pyrlk_optical_flow.cpp @@ -159,7 +159,6 @@ int main(int argc, const char* argv[]) "{ win_size | win_size | 21 | specify windows size [PyrLK] }" "{ max_level | max_level | 3 | specify max level [PyrLK] }" "{ iters | iters | 30 | specify iterations count [PyrLK] }" - "{ deriv_lambda | deriv_lambda | 0.5 | specify deriv lambda [PyrLK] }" "{ points | points | 4000 | specify points count [GoodFeatureToTrack] }" "{ min_dist | min_dist | 0 | specify minimal distance between points [GoodFeatureToTrack] }"; @@ -186,7 +185,6 @@ int main(int argc, const char* argv[]) int winSize = cmd.get("win_size"); int maxLevel = cmd.get("max_level"); int iters = cmd.get("iters"); - double derivLambda = cmd.get("deriv_lambda"); int points = cmd.get("points"); double minDist = cmd.get("min_dist"); @@ -235,7 +233,6 @@ int main(int argc, const char* argv[]) d_pyrLK.winSize.height = winSize; d_pyrLK.maxLevel = maxLevel; d_pyrLK.iters = iters; - d_pyrLK.derivLambda = derivLambda; GpuMat d_frame0(frame0); GpuMat d_frame1(frame1); -- 2.7.4