//\r
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong\r
//\r
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.\r
//\r
//M*/\r
#include "opencv2/gpu/device/functional.hpp"\r
#include "opencv2/gpu/device/limits.hpp"\r
\r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
{\r
- namespace pyrlk \r
+ namespace pyrlk\r
{\r
        // Number of image channels used by the pyrLK kernels; written from the
        // host in loadConstants() via cudaMemcpyToSymbol.
        __constant__ int c_cn;
        // Minimum-eigenvalue threshold used to reject weakly-conditioned points;
        // written from the host in loadConstants() via cudaMemcpyToSymbol.
        __constant__ float c_minEigThreshold;
\r
void loadConstants(int cn, float minEigThreshold, int2 winSize, int iters)\r
{\r
- int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); \r
+ int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);\r
cudaSafeCall( cudaMemcpyToSymbol(c_cn, &cn, sizeof(int)) );\r
cudaSafeCall( cudaMemcpyToSymbol(c_minEigThreshold, &minEigThreshold, sizeof(float)) );\r
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );\r
const uchar src_val0 = src(y > 0 ? y - 1 : 1, x);\r
const uchar src_val1 = src(y, x);\r
const uchar src_val2 = src(y < rows - 1 ? y + 1 : rows - 2, x);\r
- \r
+\r
dx_buf(y, x) = (src_val0 + src_val2) * 3 + src_val1 * 10;\r
dy_buf(y, x) = src_val2 - src_val0;\r
}\r
}\r
}\r
\r
- void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_<short> dx_buf, DevMem2D_<short> dy_buf, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, int cn, \r
+ void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_<short> dx_buf, DevMem2D_<short> dy_buf, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, int cn,\r
cudaStream_t stream)\r
{\r
dim3 block(32, 8);\r
__syncthreads();\r
\r
#if __CUDA_ARCH__ > 110\r
- if (tid < 128) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 128]; \r
- smem2[tid] = val2 += smem2[tid + 128]; \r
- smem3[tid] = val3 += smem3[tid + 128]; \r
- } \r
+ if (tid < 128)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 128];\r
+ smem2[tid] = val2 += smem2[tid + 128];\r
+ smem3[tid] = val3 += smem3[tid + 128];\r
+ }\r
__syncthreads();\r
#endif\r
\r
- if (tid < 64) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 64]; \r
- smem2[tid] = val2 += smem2[tid + 64]; \r
+ if (tid < 64)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 64];\r
+ smem2[tid] = val2 += smem2[tid + 64];\r
smem3[tid] = val3 += smem3[tid + 64];\r
- } \r
+ }\r
__syncthreads();\r
\r
if (tid < 32)\r
volatile float* vmem2 = smem2;\r
volatile float* vmem3 = smem3;\r
\r
- vmem1[tid] = val1 += vmem1[tid + 32]; \r
- vmem2[tid] = val2 += vmem2[tid + 32]; \r
+ vmem1[tid] = val1 += vmem1[tid + 32];\r
+ vmem2[tid] = val2 += vmem2[tid + 32];\r
vmem3[tid] = val3 += vmem3[tid + 32];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 16]; \r
- vmem2[tid] = val2 += vmem2[tid + 16]; \r
+ vmem1[tid] = val1 += vmem1[tid + 16];\r
+ vmem2[tid] = val2 += vmem2[tid + 16];\r
vmem3[tid] = val3 += vmem3[tid + 16];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 8]; \r
- vmem2[tid] = val2 += vmem2[tid + 8]; \r
+ vmem1[tid] = val1 += vmem1[tid + 8];\r
+ vmem2[tid] = val2 += vmem2[tid + 8];\r
vmem3[tid] = val3 += vmem3[tid + 8];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 4]; \r
- vmem2[tid] = val2 += vmem2[tid + 4]; \r
+ vmem1[tid] = val1 += vmem1[tid + 4];\r
+ vmem2[tid] = val2 += vmem2[tid + 4];\r
vmem3[tid] = val3 += vmem3[tid + 4];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 2]; \r
- vmem2[tid] = val2 += vmem2[tid + 2]; \r
+ vmem1[tid] = val1 += vmem1[tid + 2];\r
+ vmem2[tid] = val2 += vmem2[tid + 2];\r
vmem3[tid] = val3 += vmem3[tid + 2];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 1]; \r
- vmem2[tid] = val2 += vmem2[tid + 1]; \r
+ vmem1[tid] = val1 += vmem1[tid + 1];\r
+ vmem2[tid] = val2 += vmem2[tid + 1];\r
vmem3[tid] = val3 += vmem3[tid + 1];\r
}\r
}\r
__syncthreads();\r
\r
#if __CUDA_ARCH__ > 110\r
- if (tid < 128) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 128]; \r
- smem2[tid] = val2 += smem2[tid + 128]; \r
- } \r
+ if (tid < 128)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 128];\r
+ smem2[tid] = val2 += smem2[tid + 128];\r
+ }\r
__syncthreads();\r
#endif\r
\r
- if (tid < 64) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 64]; \r
- smem2[tid] = val2 += smem2[tid + 64]; \r
- } \r
+ if (tid < 64)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 64];\r
+ smem2[tid] = val2 += smem2[tid + 64];\r
+ }\r
__syncthreads();\r
\r
if (tid < 32)\r
volatile float* vmem1 = smem1;\r
volatile float* vmem2 = smem2;\r
\r
- vmem1[tid] = val1 += vmem1[tid + 32]; \r
- vmem2[tid] = val2 += vmem2[tid + 32]; \r
+ vmem1[tid] = val1 += vmem1[tid + 32];\r
+ vmem2[tid] = val2 += vmem2[tid + 32];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 16]; \r
- vmem2[tid] = val2 += vmem2[tid + 16]; \r
+ vmem1[tid] = val1 += vmem1[tid + 16];\r
+ vmem2[tid] = val2 += vmem2[tid + 16];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 8]; \r
- vmem2[tid] = val2 += vmem2[tid + 8]; \r
+ vmem1[tid] = val1 += vmem1[tid + 8];\r
+ vmem2[tid] = val2 += vmem2[tid + 8];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 4]; \r
- vmem2[tid] = val2 += vmem2[tid + 4]; \r
+ vmem1[tid] = val1 += vmem1[tid + 4];\r
+ vmem2[tid] = val2 += vmem2[tid + 4];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 2]; \r
- vmem2[tid] = val2 += vmem2[tid + 2]; \r
+ vmem1[tid] = val1 += vmem1[tid + 2];\r
+ vmem2[tid] = val2 += vmem2[tid + 2];\r
\r
- vmem1[tid] = val1 += vmem1[tid + 1]; \r
- vmem2[tid] = val2 += vmem2[tid + 1]; \r
+ vmem1[tid] = val1 += vmem1[tid + 1];\r
+ vmem2[tid] = val2 += vmem2[tid + 1];\r
}\r
}\r
\r
__syncthreads();\r
\r
#if __CUDA_ARCH__ > 110\r
- if (tid < 128) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 128]; \r
- } \r
+ if (tid < 128)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 128];\r
+ }\r
__syncthreads();\r
#endif\r
\r
- if (tid < 64) \r
- { \r
- smem1[tid] = val1 += smem1[tid + 64]; \r
- } \r
+ if (tid < 64)\r
+ {\r
+ smem1[tid] = val1 += smem1[tid + 64];\r
+ }\r
__syncthreads();\r
\r
if (tid < 32)\r
{\r
volatile float* vmem1 = smem1;\r
\r
- vmem1[tid] = val1 += vmem1[tid + 32]; \r
- vmem1[tid] = val1 += vmem1[tid + 16]; \r
- vmem1[tid] = val1 += vmem1[tid + 8]; \r
+ vmem1[tid] = val1 += vmem1[tid + 32];\r
+ vmem1[tid] = val1 += vmem1[tid + 16];\r
+ vmem1[tid] = val1 += vmem1[tid + 8];\r
vmem1[tid] = val1 += vmem1[tid + 4];\r
- vmem1[tid] = val1 += vmem1[tid + 2]; \r
- vmem1[tid] = val1 += vmem1[tid + 1]; \r
+ vmem1[tid] = val1 += vmem1[tid + 2];\r
+ vmem1[tid] = val1 += vmem1[tid + 1];\r
}\r
}\r
\r
{\r
status[blockIdx.x] = 0;\r
\r
- if (calcErr) \r
+ if (calcErr)\r
err[blockIdx.x] = 0;\r
}\r
\r
}\r
\r
// extract the patch from the first image, compute covariation matrix of derivatives\r
- \r
+\r
float A11 = 0;\r
float A12 = 0;\r
float A22 = 0;\r
int dIdy_patch[PATCH_Y][PATCH_X];\r
\r
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)\r
- { \r
+ {\r
for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j)\r
{\r
I_patch[i][j] = linearFilter(I, prevPt, x, y);\r
\r
dIdx_patch[i][j] = ixval;\r
dIdy_patch[i][j] = iyval;\r
- \r
+\r
A11 += ixval * ixval;\r
A12 += ixval * iyval;\r
A22 += iyval * iyval;\r
A11 = smem1[0];\r
A12 = smem2[0];\r
A22 = smem3[0];\r
- \r
+\r
A11 *= SCALE;\r
A12 *= SCALE;\r
A22 *= SCALE;\r
{\r
float D = A11 * A22 - A12 * A12;\r
float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y);\r
- \r
- if (calcErr && GET_MIN_EIGENVALS && tid == 0) \r
+\r
+ if (calcErr && GET_MIN_EIGENVALS && tid == 0)\r
err[blockIdx.x] = minEig;\r
\r
if (minEig < c_minEigThreshold || D < numeric_limits<float>::epsilon())\r
}\r
\r
D = 1.f / D;\r
- \r
+\r
A11 *= D;\r
A12 *= D;\r
A22 *= D;\r
\r
float2 nextPt = nextPts[blockIdx.x];\r
nextPt.x *= 2.f;\r
- nextPt.y *= 2.f; \r
- \r
+ nextPt.y *= 2.f;\r
+\r
nextPt.x -= c_halfWin_x;\r
nextPt.y -= c_halfWin_y;\r
\r
\r
float b1 = 0;\r
float b2 = 0;\r
- \r
+\r
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)\r
{\r
for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j)\r
b2 += diff * dIdy_patch[i][j];\r
}\r
}\r
- \r
+\r
reduce(b1, b2, smem1, smem2, tid);\r
__syncthreads();\r
\r
\r
b1 *= SCALE;\r
b2 *= SCALE;\r
- \r
+\r
float2 delta;\r
delta.x = A12 * b2 - A22 * b1;\r
delta.y = A12 * b1 - A11 * b2;\r
- \r
+\r
nextPt.x += delta.x;\r
nextPt.y += delta.y;\r
\r
\r
template <int PATCH_X, int PATCH_Y>\r
void lkSparse_caller(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
- const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+ const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
int level, dim3 block, cudaStream_t stream)\r
{\r
dim3 grid(ptcount);\r
}\r
\r
void lkSparse_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
- const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+ const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
int level, dim3 block, dim3 patch, cudaStream_t stream)\r
{\r
typedef void (*func_t)(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
- const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+ const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
int level, dim3 block, cudaStream_t stream);\r
\r
- static const func_t funcs[5][5] = \r
+ static const func_t funcs[5][5] =\r
{\r
{lkSparse_caller<1, 1>, lkSparse_caller<2, 1>, lkSparse_caller<3, 1>, lkSparse_caller<4, 1>, lkSparse_caller<5, 1>},\r
{lkSparse_caller<1, 2>, lkSparse_caller<2, 2>, lkSparse_caller<3, 2>, lkSparse_caller<4, 2>, lkSparse_caller<5, 2>},\r
{lkSparse_caller<1, 3>, lkSparse_caller<2, 3>, lkSparse_caller<3, 3>, lkSparse_caller<4, 3>, lkSparse_caller<5, 3>},\r
{lkSparse_caller<1, 4>, lkSparse_caller<2, 4>, lkSparse_caller<3, 4>, lkSparse_caller<4, 4>, lkSparse_caller<5, 4>},\r
{lkSparse_caller<1, 5>, lkSparse_caller<2, 5>, lkSparse_caller<3, 5>, lkSparse_caller<4, 5>, lkSparse_caller<5, 5>}\r
- }; \r
+ };\r
\r
funcs[patch.y - 1][patch.x - 1](I, J, dIdx, dIdy,\r
- prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount, \r
+ prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount,\r
level, block, stream);\r
}\r
\r
- template <bool calcErr, bool GET_MIN_EIGENVALS>\r
- __global__ void lkDense(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,\r
- PtrStepf u, PtrStepf v, PtrStepf err, const int rows, const int cols)\r
+ texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_I(false, cudaFilterModePoint, cudaAddressModeClamp);\r
+ texture<float, cudaTextureType2D, cudaReadModeElementType> tex_J(false, cudaFilterModeLinear, cudaAddressModeClamp);\r
+\r
+ template <bool calcErr>\r
+ __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)\r
{\r
- const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ extern __shared__ int smem[];\r
+\r
+ const int patchWidth = blockDim.x + 2 * c_halfWin_x;\r
+ const int patchHeight = blockDim.y + 2 * c_halfWin_y;\r
+\r
+ int* I_patch = smem;\r
+ int* dIdx_patch = I_patch + patchWidth * patchHeight;\r
+ int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;\r
+\r
+ const int xBase = blockIdx.x * blockDim.x;\r
+ const int yBase = blockIdx.y * blockDim.y;\r
+\r
+ for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)\r
+ {\r
+ for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)\r
+ {\r
+ float x = xBase - c_halfWin_x + j + 0.5f;\r
+ float y = yBase - c_halfWin_y + i + 0.5f;\r
+\r
+ I_patch[i * patchWidth + j] = tex2D(tex_I, x, y);\r
+\r
+ // Sharr Deriv\r
+\r
+ dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x+1, y-1) + 10 * tex2D(tex_I, x+1, y) + 3 * tex2D(tex_I, x+1, y+1) -\r
+ (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x-1, y) + 3 * tex2D(tex_I, x-1, y+1));\r
+\r
+ dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x-1, y+1) + 10 * tex2D(tex_I, x, y+1) + 3 * tex2D(tex_I, x+1, y+1) -\r
+ (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x, y-1) + 3 * tex2D(tex_I, x+1, y-1));\r
+ }\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ const int x = xBase + threadIdx.x;\r
+ const int y = yBase + threadIdx.y;\r
\r
if (x >= cols || y >= rows)\r
return;\r
\r
- // extract the patch from the first image, compute covariation matrix of derivatives\r
- \r
- float A11 = 0;\r
- float A12 = 0;\r
- float A22 = 0;\r
+ int A11i = 0;\r
+ int A12i = 0;\r
+ int A22i = 0;\r
\r
for (int i = 0; i < c_winSize_y; ++i)\r
- { \r
+ {\r
for (int j = 0; j < c_winSize_x; ++j)\r
{\r
- int ixval = dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
- int iyval = dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+ int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+ int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
\r
- A11 += ixval * ixval;\r
- A12 += ixval * iyval;\r
- A22 += iyval * iyval;\r
+ A11i += dIdx * dIdx;\r
+ A12i += dIdx * dIdy;\r
+ A22i += dIdy * dIdy;\r
}\r
}\r
- \r
- A11 *= SCALE;\r
- A12 *= SCALE;\r
- A22 *= SCALE;\r
\r
- {\r
- float D = A11 * A22 - A12 * A12;\r
- float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y);\r
+ float A11 = A11i;\r
+ float A12 = A12i;\r
+ float A22 = A22i;\r
\r
- if (calcErr && GET_MIN_EIGENVALS)\r
- err(y, x) = minEig;\r
- \r
- if (minEig < c_minEigThreshold || D < numeric_limits<float>::epsilon())\r
- return;\r
+ float D = A11 * A22 - A12 * A12;\r
\r
- D = 1.f / D;\r
- \r
- A11 *= D;\r
- A12 *= D;\r
- A22 *= D;\r
+ if (D < numeric_limits<float>::epsilon())\r
+ {\r
+ if (calcErr)\r
+ err(y, x) = numeric_limits<float>::max();\r
+\r
+ return;\r
}\r
\r
+ D = 1.f / D;\r
+\r
+ A11 *= D;\r
+ A12 *= D;\r
+ A22 *= D;\r
+\r
float2 nextPt;\r
- nextPt.x = x - c_halfWin_x + u(y, x);\r
- nextPt.y = y - c_halfWin_y + v(y, x);\r
+ nextPt.x = x + prevU(y/2, x/2) * 2.0f;\r
+ nextPt.y = y + prevV(y/2, x/2) * 2.0f;\r
\r
for (int k = 0; k < c_iters; ++k)\r
{\r
- if (nextPt.x < -c_winSize_x || nextPt.x >= cols || nextPt.y < -c_winSize_y || nextPt.y >= rows)\r
+ if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)\r
+ {\r
+ if (calcErr)\r
+ err(y, x) = numeric_limits<float>::max();\r
+\r
return;\r
+ }\r
+\r
+ int b1 = 0;\r
+ int b2 = 0;\r
\r
- float b1 = 0;\r
- float b2 = 0;\r
- \r
for (int i = 0; i < c_winSize_y; ++i)\r
{\r
for (int j = 0; j < c_winSize_x; ++j)\r
{\r
- int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+ int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];\r
+ int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);\r
\r
- int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5);\r
- \r
- b1 += diff * dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
- b2 += diff * dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+ int diff = (J - I) * 32;\r
+\r
+ int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+ int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+\r
+ b1 += diff * dIdx;\r
+ b2 += diff * dIdy;\r
}\r
}\r
\r
- b1 *= SCALE;\r
- b2 *= SCALE;\r
-\r
float2 delta;\r
delta.x = A12 * b2 - A22 * b1;\r
delta.y = A12 * b1 - A11 * b2;\r
- \r
+\r
nextPt.x += delta.x;\r
nextPt.y += delta.y;\r
\r
break;\r
}\r
\r
- u(y, x) = nextPt.x - x + c_halfWin_x;\r
- v(y, x) = nextPt.y - y + c_halfWin_y; \r
+ u(y, x) = nextPt.x - x;\r
+ v(y, x) = nextPt.y - y;\r
\r
- if (calcErr && !GET_MIN_EIGENVALS)\r
+ if (calcErr)\r
{\r
- float errval = 0.0f;\r
+ int errval = 0;\r
\r
for (int i = 0; i < c_winSize_y; ++i)\r
{\r
for (int j = 0; j < c_winSize_x; ++j)\r
{\r
- int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
- int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5);\r
- errval += ::fabsf((float)diff);\r
+ int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];\r
+ int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);\r
+\r
+ errval += ::abs(J - I);\r
}\r
}\r
\r
- errval /= 32 * c_winSize_x_cn * c_winSize_y;\r
-\r
- err(y, x) = errval;\r
+ err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);\r
}\r
}\r
\r
- void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, \r
- DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream)\r
+ void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,\r
+ DevMem2Df err, int2 winSize, cudaStream_t stream)\r
{\r
- dim3 block(32, 8);\r
+ dim3 block(16, 16);\r
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));\r
\r
- if (err)\r
- {\r
- if (GET_MIN_EIGENVALS)\r
- {\r
- cudaSafeCall( cudaFuncSetCacheConfig(lkDense<true, true>, cudaFuncCachePreferL1) );\r
+ bindTexture(&tex_I, I);\r
+ bindTexture(&tex_J, J);\r
\r
- lkDense<true, true><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
- else\r
- {\r
- cudaSafeCall( cudaFuncSetCacheConfig(lkDense<true, false>, cudaFuncCachePreferL1) );\r
+ int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);\r
+ const int patchWidth = block.x + 2 * halfWin.x;\r
+ const int patchHeight = block.y + 2 * halfWin.y;\r
+ size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);\r
\r
- lkDense<true, false><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols);\r
- cudaSafeCall( cudaGetLastError() );\r
- }\r
+ if (err.data)\r
+ {\r
+ lkDense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);\r
+ cudaSafeCall( cudaGetLastError() );\r
}\r
else\r
{\r
- cudaSafeCall( cudaFuncSetCacheConfig(lkDense<false, false>, cudaFuncCachePreferL1) );\r
-\r
- lkDense<false, false><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, PtrStepf(), I.rows, I.cols);\r
+ lkDense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);\r
cudaSafeCall( cudaGetLastError() );\r
}\r
\r
const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);\r
\r
- void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
- DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream = 0);\r
+ void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,\r
+ DevMem2Df err, int2 winSize, cudaStream_t stream = 0);\r
}\r
}}}\r
\r
return;\r
}\r
\r
- derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);\r
-\r
- iters = std::min(std::max(iters, 0), 100);\r
-\r
const int cn = prevImg.channels();\r
\r
dim3 block, patch;\r
- calcPatchSize(winSize, cn, block, patch, isDeviceArch11_); \r
+ calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);\r
\r
- CV_Assert(derivLambda >= 0);\r
CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);\r
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());\r
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);\r
{\r
using namespace cv::gpu::device::pyrlk;\r
\r
- derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);\r
-\r
- iters = std::min(std::max(iters, 0), 100);\r
-\r
CV_Assert(prevImg.type() == CV_8UC1);\r
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());\r
- CV_Assert(derivLambda >= 0);\r
- CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);\r
-\r
- if (useInitialFlow)\r
- {\r
- CV_Assert(u.size() == prevImg.size() && u.type() == CV_32FC1);\r
- CV_Assert(v.size() == prevImg.size() && v.type() == CV_32FC1);\r
- }\r
- else\r
- {\r
- u.create(prevImg.size(), CV_32FC1);\r
- v.create(prevImg.size(), CV_32FC1);\r
-\r
- u.setTo(Scalar::all(0));\r
- v.setTo(Scalar::all(0));\r
- }\r
+ CV_Assert(maxLevel >= 0);\r
+ CV_Assert(winSize.width > 2 && winSize.height > 2);\r
\r
if (err)\r
err->create(prevImg.size(), CV_32FC1);\r
\r
// build the image pyramids.\r
- // we pad each level with +/-winSize.{width|height}\r
- // pixels to simplify the further patch extraction.\r
\r
- buildImagePyramid(prevImg, prevPyr_, true);\r
- buildImagePyramid(nextImg, nextPyr_, true);\r
- buildImagePyramid(u, uPyr_, false);\r
- buildImagePyramid(v, vPyr_, false);\r
+ buildImagePyramid(prevImg, prevPyr_, false);\r
\r
- // dI/dx ~ Ix, dI/dy ~ Iy\r
+ nextPyr_.resize(maxLevel + 1);\r
+ nextImg.convertTo(nextPyr_[0], CV_32F);\r
+ for (int level = 1; level <= maxLevel; ++level)\r
+ pyrDown(nextPyr_[level - 1], nextPyr_[level]);\r
+\r
+ uPyr_.resize(2);\r
+ vPyr_.resize(2);\r
\r
- ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dx_buf_);\r
- ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dy_buf_);\r
+ ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);\r
+ ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);\r
+ ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);\r
+ ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);\r
+ uPyr_[1].setTo(Scalar::all(0));\r
+ vPyr_[1].setTo(Scalar::all(0));\r
\r
- loadConstants(1, minEigThreshold, make_int2(winSize.width, winSize.height), iters);\r
+ int2 winSize2i = make_int2(winSize.width, winSize.height);\r
+ loadConstants(1, minEigThreshold, winSize2i, iters);\r
\r
DevMem2Df derr = err ? *err : DevMem2Df();\r
\r
+ int idx = 0;\r
+\r
for (int level = maxLevel; level >= 0; level--)\r
{\r
- Size imgSize = prevPyr_[level].size();\r
-\r
- GpuMat dxWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dx_buf_.type(), dx_buf_.data, dx_buf_.step);\r
- GpuMat dyWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dy_buf_.type(), dy_buf_.data, dy_buf_.step);\r
- dxWhole.setTo(Scalar::all(0));\r
- dyWhole.setTo(Scalar::all(0));\r
- GpuMat dIdx = dxWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));\r
- GpuMat dIdy = dyWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));\r
+ int idx2 = (idx + 1) & 1;\r
\r
- calcSharrDeriv(prevPyr_[level], dIdx, dIdy);\r
+ lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],\r
+ level == 0 ? derr : DevMem2Df(), winSize2i);\r
\r
- lkDense_gpu(prevPyr_[level], nextPyr_[level], dIdx, dIdy, uPyr_[level], vPyr_[level],\r
- level == 0 && err ? &derr : 0, getMinEigenVals);\r
-\r
- if (level == 0)\r
- {\r
- uPyr_[0].copyTo(u);\r
- vPyr_[0].copyTo(v);\r
- }\r
- else\r
- {\r
- resize(uPyr_[level], uPyr_[level - 1], uPyr_[level - 1].size());\r
- resize(vPyr_[level], vPyr_[level - 1], vPyr_[level - 1].size());\r
-\r
- multiply(uPyr_[level - 1], Scalar::all(2), uPyr_[level - 1]);\r
- multiply(vPyr_[level - 1], Scalar::all(2), vPyr_[level - 1]);\r
- }\r
+ if (level > 0)\r
+ idx = idx2;\r
}\r
+\r
+ uPyr_[idx].copyTo(u);\r
+ vPyr_[idx].copyTo(v);\r
}\r
\r
#endif /* !defined (HAVE_CUDA) */\r