Merged revision(s) 8664 from trunk:
author: Vladislav Vinogradov <no@email>
Wed, 27 Jun 2012 09:58:33 +0000 (09:58 +0000)
committer: Vladislav Vinogradov <no@email>
Wed, 27 Jun 2012 09:58:33 +0000 (09:58 +0000)
new implementation of gpu::PyrLKOpticalFlow::dense (1.5 - 2x faster)
........

modules/gpu/perf/perf_video.cpp
modules/gpu/src/cuda/pyrlk.cu
modules/gpu/src/pyrlk.cpp
samples/gpu/pyrlk_optical_flow.cpp

index ff80aab..9098247 100644 (file)
@@ -145,6 +145,8 @@ INSTANTIATE_TEST_CASE_P(Video, GoodFeaturesToTrack, testing::Combine(ALL_DEVICES
 //////////////////////////////////////////////////////\r
 // PyrLKOpticalFlowSparse\r
 \r
+IMPLEMENT_PARAM_CLASS(WinSize, int)\r
+\r
 GPU_PERF_TEST(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, bool, int, int)\r
 {\r
     cv::gpu::DeviceInfo devInfo = GET_PARAM(0);\r
@@ -196,12 +198,19 @@ INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, testing::Combine
 //////////////////////////////////////////////////////\r
 // PyrLKOpticalFlowDense\r
 \r
-GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo)\r
+IMPLEMENT_PARAM_CLASS(Levels, int)\r
+IMPLEMENT_PARAM_CLASS(Iters, int)\r
+\r
+GPU_PERF_TEST(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo, WinSize, Levels, Iters)\r
 {\r
-    cv::gpu::DeviceInfo devInfo = GetParam();\r
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);\r
 \r
     cv::gpu::setDevice(devInfo.deviceID());\r
 \r
+    int winSize = GET_PARAM(1);\r
+    int levels = GET_PARAM(2);\r
+    int iters = GET_PARAM(3);\r
+\r
     cv::Mat frame0_host = readImage("gpu/opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);\r
     cv::Mat frame1_host = readImage("gpu/opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);\r
 \r
@@ -215,7 +224,13 @@ GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo)
 \r
     cv::gpu::PyrLKOpticalFlow pyrLK;\r
 \r
-    declare.time(10);\r
+    pyrLK.winSize = cv::Size(winSize, winSize);\r
+    pyrLK.maxLevel = levels - 1;\r
+    pyrLK.iters = iters;\r
+\r
+    pyrLK.dense(frame0, frame1, u, v);\r
+\r
+    declare.time(30);\r
 \r
     TEST_CYCLE()\r
     {\r
@@ -223,7 +238,11 @@ GPU_PERF_TEST_1(PyrLKOpticalFlowDense, cv::gpu::DeviceInfo)
     }\r
 }\r
 \r
-INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowDense, ALL_DEVICES);\r
+INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowDense, testing::Combine(\r
+    ALL_DEVICES,\r
+    testing::Values(WinSize(3), WinSize(5), WinSize(7), WinSize(9), WinSize(13), WinSize(17), WinSize(21)),\r
+    testing::Values(Levels(1), Levels(2), Levels(3)),\r
+    testing::Values(Iters(1), Iters(10))));\r
 \r
 \r
 //////////////////////////////////////////////////////\r
index 22fbc44..12dfab6 100644 (file)
@@ -40,7 +40,7 @@
 //\r
 // Copyright (c) 2010, Paul Furgale, Chi Hay Tong\r
 //\r
-// The original code was written by Paul Furgale and Chi Hay Tong \r
+// The original code was written by Paul Furgale and Chi Hay Tong\r
 // and later optimized and prepared for integration into OpenCV by Itseez.\r
 //\r
 //M*/\r
@@ -50,9 +50,9 @@
 #include "opencv2/gpu/device/functional.hpp"\r
 #include "opencv2/gpu/device/limits.hpp"\r
 \r
-namespace cv { namespace gpu { namespace device \r
+namespace cv { namespace gpu { namespace device\r
 {\r
-    namespace pyrlk \r
+    namespace pyrlk\r
     {\r
         __constant__ int c_cn;\r
         __constant__ float c_minEigThreshold;\r
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
 \r
         void loadConstants(int cn, float minEigThreshold, int2 winSize, int iters)\r
         {\r
-            int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);            \r
+            int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);\r
             cudaSafeCall( cudaMemcpyToSymbol(c_cn, &cn, sizeof(int)) );\r
             cudaSafeCall( cudaMemcpyToSymbol(c_minEigThreshold, &minEigThreshold, sizeof(float)) );\r
             cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );\r
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
                 const uchar src_val0 = src(y > 0 ? y - 1 : 1, x);\r
                 const uchar src_val1 = src(y, x);\r
                 const uchar src_val2 = src(y < rows - 1 ? y + 1 : rows - 2, x);\r
-                \r
+\r
                 dx_buf(y, x) = (src_val0 + src_val2) * 3 + src_val1 * 10;\r
                 dy_buf(y, x) = src_val2 - src_val0;\r
             }\r
@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
             }\r
         }\r
 \r
-        void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_<short> dx_buf, DevMem2D_<short> dy_buf, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, int cn, \r
+        void calcSharrDeriv_gpu(DevMem2Db src, DevMem2D_<short> dx_buf, DevMem2D_<short> dy_buf, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, int cn,\r
             cudaStream_t stream)\r
         {\r
             dim3 block(32, 8);\r
@@ -182,21 +182,21 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();\r
 \r
 #if __CUDA_ARCH__ > 110\r
-            if (tid < 128) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 128]; \r
-                smem2[tid] = val2 += smem2[tid + 128]; \r
-                smem3[tid] = val3 += smem3[tid + 128]; \r
-            } \r
+            if (tid < 128)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 128];\r
+                smem2[tid] = val2 += smem2[tid + 128];\r
+                smem3[tid] = val3 += smem3[tid + 128];\r
+            }\r
             __syncthreads();\r
 #endif\r
 \r
-            if (tid < 64) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 64]; \r
-                smem2[tid] = val2 += smem2[tid + 64]; \r
+            if (tid < 64)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 64];\r
+                smem2[tid] = val2 += smem2[tid + 64];\r
                 smem3[tid] = val3 += smem3[tid + 64];\r
-            } \r
+            }\r
             __syncthreads();\r
 \r
             if (tid < 32)\r
@@ -205,28 +205,28 @@ namespace cv { namespace gpu { namespace device
                 volatile float* vmem2 = smem2;\r
                 volatile float* vmem3 = smem3;\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 32]; \r
-                vmem2[tid] = val2 += vmem2[tid + 32]; \r
+                vmem1[tid] = val1 += vmem1[tid + 32];\r
+                vmem2[tid] = val2 += vmem2[tid + 32];\r
                 vmem3[tid] = val3 += vmem3[tid + 32];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 16]; \r
-                vmem2[tid] = val2 += vmem2[tid + 16]; \r
+                vmem1[tid] = val1 += vmem1[tid + 16];\r
+                vmem2[tid] = val2 += vmem2[tid + 16];\r
                 vmem3[tid] = val3 += vmem3[tid + 16];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 8]; \r
-                vmem2[tid] = val2 += vmem2[tid + 8]; \r
+                vmem1[tid] = val1 += vmem1[tid + 8];\r
+                vmem2[tid] = val2 += vmem2[tid + 8];\r
                 vmem3[tid] = val3 += vmem3[tid + 8];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 4]; \r
-                vmem2[tid] = val2 += vmem2[tid + 4]; \r
+                vmem1[tid] = val1 += vmem1[tid + 4];\r
+                vmem2[tid] = val2 += vmem2[tid + 4];\r
                 vmem3[tid] = val3 += vmem3[tid + 4];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 2]; \r
-                vmem2[tid] = val2 += vmem2[tid + 2]; \r
+                vmem1[tid] = val1 += vmem1[tid + 2];\r
+                vmem2[tid] = val2 += vmem2[tid + 2];\r
                 vmem3[tid] = val3 += vmem3[tid + 2];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 1]; \r
-                vmem2[tid] = val2 += vmem2[tid + 1]; \r
+                vmem1[tid] = val1 += vmem1[tid + 1];\r
+                vmem2[tid] = val2 += vmem2[tid + 1];\r
                 vmem3[tid] = val3 += vmem3[tid + 1];\r
             }\r
         }\r
@@ -238,19 +238,19 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();\r
 \r
 #if __CUDA_ARCH__ > 110\r
-            if (tid < 128) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 128]; \r
-                smem2[tid] = val2 += smem2[tid + 128];  \r
-            } \r
+            if (tid < 128)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 128];\r
+                smem2[tid] = val2 += smem2[tid + 128];\r
+            }\r
             __syncthreads();\r
 #endif\r
 \r
-            if (tid < 64) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 64]; \r
-                smem2[tid] = val2 += smem2[tid + 64]; \r
-            } \r
+            if (tid < 64)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 64];\r
+                smem2[tid] = val2 += smem2[tid + 64];\r
+            }\r
             __syncthreads();\r
 \r
             if (tid < 32)\r
@@ -258,23 +258,23 @@ namespace cv { namespace gpu { namespace device
                 volatile float* vmem1 = smem1;\r
                 volatile float* vmem2 = smem2;\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 32]; \r
-                vmem2[tid] = val2 += vmem2[tid + 32]; \r
+                vmem1[tid] = val1 += vmem1[tid + 32];\r
+                vmem2[tid] = val2 += vmem2[tid + 32];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 16]; \r
-                vmem2[tid] = val2 += vmem2[tid + 16]; \r
+                vmem1[tid] = val1 += vmem1[tid + 16];\r
+                vmem2[tid] = val2 += vmem2[tid + 16];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 8]; \r
-                vmem2[tid] = val2 += vmem2[tid + 8]; \r
+                vmem1[tid] = val1 += vmem1[tid + 8];\r
+                vmem2[tid] = val2 += vmem2[tid + 8];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 4]; \r
-                vmem2[tid] = val2 += vmem2[tid + 4]; \r
+                vmem1[tid] = val1 += vmem1[tid + 4];\r
+                vmem2[tid] = val2 += vmem2[tid + 4];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 2]; \r
-                vmem2[tid] = val2 += vmem2[tid + 2]; \r
+                vmem1[tid] = val1 += vmem1[tid + 2];\r
+                vmem2[tid] = val2 += vmem2[tid + 2];\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 1]; \r
-                vmem2[tid] = val2 += vmem2[tid + 1]; \r
+                vmem1[tid] = val1 += vmem1[tid + 1];\r
+                vmem2[tid] = val2 += vmem2[tid + 1];\r
             }\r
         }\r
 \r
@@ -284,29 +284,29 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();\r
 \r
 #if __CUDA_ARCH__ > 110\r
-            if (tid < 128) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 128]; \r
-            } \r
+            if (tid < 128)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 128];\r
+            }\r
             __syncthreads();\r
 #endif\r
 \r
-            if (tid < 64) \r
-            { \r
-                smem1[tid] = val1 += smem1[tid + 64]; \r
-            } \r
+            if (tid < 64)\r
+            {\r
+                smem1[tid] = val1 += smem1[tid + 64];\r
+            }\r
             __syncthreads();\r
 \r
             if (tid < 32)\r
             {\r
                 volatile float* vmem1 = smem1;\r
 \r
-                vmem1[tid] = val1 += vmem1[tid + 32]; \r
-                vmem1[tid] = val1 += vmem1[tid + 16]; \r
-                vmem1[tid] = val1 += vmem1[tid + 8]; \r
+                vmem1[tid] = val1 += vmem1[tid + 32];\r
+                vmem1[tid] = val1 += vmem1[tid + 16];\r
+                vmem1[tid] = val1 += vmem1[tid + 8];\r
                 vmem1[tid] = val1 += vmem1[tid + 4];\r
-                vmem1[tid] = val1 += vmem1[tid + 2]; \r
-                vmem1[tid] = val1 += vmem1[tid + 1]; \r
+                vmem1[tid] = val1 += vmem1[tid + 2];\r
+                vmem1[tid] = val1 += vmem1[tid + 1];\r
             }\r
         }\r
 \r
@@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device
                 {\r
                     status[blockIdx.x] = 0;\r
 \r
-                    if (calcErr) \r
+                    if (calcErr)\r
                         err[blockIdx.x] = 0;\r
                 }\r
 \r
@@ -349,7 +349,7 @@ namespace cv { namespace gpu { namespace device
             }\r
 \r
             // extract the patch from the first image, compute covariation matrix of derivatives\r
-            \r
+\r
             float A11 = 0;\r
             float A12 = 0;\r
             float A22 = 0;\r
@@ -359,7 +359,7 @@ namespace cv { namespace gpu { namespace device
             int dIdy_patch[PATCH_Y][PATCH_X];\r
 \r
             for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)\r
-            {                \r
+            {\r
                 for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j)\r
                 {\r
                     I_patch[i][j] = linearFilter(I, prevPt, x, y);\r
@@ -369,7 +369,7 @@ namespace cv { namespace gpu { namespace device
 \r
                     dIdx_patch[i][j] = ixval;\r
                     dIdy_patch[i][j] = iyval;\r
-                    \r
+\r
                     A11 += ixval * ixval;\r
                     A12 += ixval * iyval;\r
                     A22 += iyval * iyval;\r
@@ -382,7 +382,7 @@ namespace cv { namespace gpu { namespace device
             A11 = smem1[0];\r
             A12 = smem2[0];\r
             A22 = smem3[0];\r
-            \r
+\r
             A11 *= SCALE;\r
             A12 *= SCALE;\r
             A22 *= SCALE;\r
@@ -390,8 +390,8 @@ namespace cv { namespace gpu { namespace device
             {\r
                 float D = A11 * A22 - A12 * A12;\r
                 float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y);\r
-            \r
-                if (calcErr && GET_MIN_EIGENVALS && tid == 0) \r
+\r
+                if (calcErr && GET_MIN_EIGENVALS && tid == 0)\r
                     err[blockIdx.x] = minEig;\r
 \r
                 if (minEig < c_minEigThreshold || D < numeric_limits<float>::epsilon())\r
@@ -403,7 +403,7 @@ namespace cv { namespace gpu { namespace device
                 }\r
 \r
                 D = 1.f / D;\r
-            \r
+\r
                 A11 *= D;\r
                 A12 *= D;\r
                 A22 *= D;\r
@@ -411,8 +411,8 @@ namespace cv { namespace gpu { namespace device
 \r
             float2 nextPt = nextPts[blockIdx.x];\r
             nextPt.x *= 2.f;\r
-            nextPt.y *= 2.f; \r
-            \r
+            nextPt.y *= 2.f;\r
+\r
             nextPt.x -= c_halfWin_x;\r
             nextPt.y -= c_halfWin_y;\r
 \r
@@ -428,7 +428,7 @@ namespace cv { namespace gpu { namespace device
 \r
                 float b1 = 0;\r
                 float b2 = 0;\r
-                \r
+\r
                 for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)\r
                 {\r
                     for (int x = threadIdx.x, j = 0; x < c_winSize_x_cn; x += blockDim.x, ++j)\r
@@ -439,7 +439,7 @@ namespace cv { namespace gpu { namespace device
                         b2 += diff * dIdy_patch[i][j];\r
                     }\r
                 }\r
-                \r
+\r
                 reduce(b1, b2, smem1, smem2, tid);\r
                 __syncthreads();\r
 \r
@@ -448,11 +448,11 @@ namespace cv { namespace gpu { namespace device
 \r
                 b1 *= SCALE;\r
                 b2 *= SCALE;\r
-                    \r
+\r
                 float2 delta;\r
                 delta.x = A12 * b2 - A22 * b1;\r
                 delta.y = A12 * b1 - A11 * b2;\r
-                    \r
+\r
                 nextPt.x += delta.x;\r
                 nextPt.y += delta.y;\r
 \r
@@ -495,7 +495,7 @@ namespace cv { namespace gpu { namespace device
 \r
         template <int PATCH_X, int PATCH_Y>\r
         void lkSparse_caller(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
-            const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+            const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
             int level, dim3 block, cudaStream_t stream)\r
         {\r
             dim3 grid(ptcount);\r
@@ -532,109 +532,147 @@ namespace cv { namespace gpu { namespace device
         }\r
 \r
         void lkSparse_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
-            const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+            const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
             int level, dim3 block, dim3 patch, cudaStream_t stream)\r
         {\r
             typedef void (*func_t)(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
-                const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount, \r
+                const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
                 int level, dim3 block, cudaStream_t stream);\r
 \r
-            static const func_t funcs[5][5] = \r
+            static const func_t funcs[5][5] =\r
             {\r
                 {lkSparse_caller<1, 1>, lkSparse_caller<2, 1>, lkSparse_caller<3, 1>, lkSparse_caller<4, 1>, lkSparse_caller<5, 1>},\r
                 {lkSparse_caller<1, 2>, lkSparse_caller<2, 2>, lkSparse_caller<3, 2>, lkSparse_caller<4, 2>, lkSparse_caller<5, 2>},\r
                 {lkSparse_caller<1, 3>, lkSparse_caller<2, 3>, lkSparse_caller<3, 3>, lkSparse_caller<4, 3>, lkSparse_caller<5, 3>},\r
                 {lkSparse_caller<1, 4>, lkSparse_caller<2, 4>, lkSparse_caller<3, 4>, lkSparse_caller<4, 4>, lkSparse_caller<5, 4>},\r
                 {lkSparse_caller<1, 5>, lkSparse_caller<2, 5>, lkSparse_caller<3, 5>, lkSparse_caller<4, 5>, lkSparse_caller<5, 5>}\r
-            };            \r
+            };\r
 \r
             funcs[patch.y - 1][patch.x - 1](I, J, dIdx, dIdy,\r
-                prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount, \r
+                prevPts, nextPts, status, err, GET_MIN_EIGENVALS, ptcount,\r
                 level, block, stream);\r
         }\r
 \r
-        template <bool calcErr, bool GET_MIN_EIGENVALS>\r
-        __global__ void lkDense(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,\r
-            PtrStepf u, PtrStepf v, PtrStepf err, const int rows, const int cols)\r
+        texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_I(false, cudaFilterModePoint, cudaAddressModeClamp);\r
+        texture<float, cudaTextureType2D, cudaReadModeElementType> tex_J(false, cudaFilterModeLinear, cudaAddressModeClamp);\r
+\r
+        template <bool calcErr>\r
+        __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)\r
         {\r
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+            extern __shared__ int smem[];\r
+\r
+            const int patchWidth  = blockDim.x + 2 * c_halfWin_x;\r
+            const int patchHeight = blockDim.y + 2 * c_halfWin_y;\r
+\r
+            int* I_patch = smem;\r
+            int* dIdx_patch = I_patch + patchWidth * patchHeight;\r
+            int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;\r
+\r
+            const int xBase = blockIdx.x * blockDim.x;\r
+            const int yBase = blockIdx.y * blockDim.y;\r
+\r
+            for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)\r
+            {\r
+                for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)\r
+                {\r
+                    float x = xBase - c_halfWin_x + j + 0.5f;\r
+                    float y = yBase - c_halfWin_y + i + 0.5f;\r
+\r
+                    I_patch[i * patchWidth + j] = tex2D(tex_I, x, y);\r
+\r
+                    // Sharr Deriv\r
+\r
+                    dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x+1, y-1) + 10 * tex2D(tex_I, x+1, y) + 3 * tex2D(tex_I, x+1, y+1) -\r
+                                                    (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x-1, y) + 3 * tex2D(tex_I, x-1, y+1));\r
+\r
+                    dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_I, x-1, y+1) + 10 * tex2D(tex_I, x, y+1) + 3 * tex2D(tex_I, x+1, y+1) -\r
+                                                    (3 * tex2D(tex_I, x-1, y-1) + 10 * tex2D(tex_I, x, y-1) + 3 * tex2D(tex_I, x+1, y-1));\r
+                }\r
+            }\r
+\r
+            __syncthreads();\r
+\r
+            const int x = xBase + threadIdx.x;\r
+            const int y = yBase + threadIdx.y;\r
 \r
             if (x >= cols || y >= rows)\r
                 return;\r
 \r
-            // extract the patch from the first image, compute covariation matrix of derivatives\r
-            \r
-            float A11 = 0;\r
-            float A12 = 0;\r
-            float A22 = 0;\r
+            int A11i = 0;\r
+            int A12i = 0;\r
+            int A22i = 0;\r
 \r
             for (int i = 0; i < c_winSize_y; ++i)\r
-            {                \r
+            {\r
                 for (int j = 0; j < c_winSize_x; ++j)\r
                 {\r
-                    int ixval = dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
-                    int iyval = dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+                    int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+                    int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
 \r
-                    A11 += ixval * ixval;\r
-                    A12 += ixval * iyval;\r
-                    A22 += iyval * iyval;\r
+                    A11i += dIdx * dIdx;\r
+                    A12i += dIdx * dIdy;\r
+                    A22i += dIdy * dIdy;\r
                 }\r
             }\r
-            \r
-            A11 *= SCALE;\r
-            A12 *= SCALE;\r
-            A22 *= SCALE;\r
 \r
-            {\r
-                float D = A11 * A22 - A12 * A12;\r
-                float minEig = (A22 + A11 - ::sqrtf((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / (2 * c_winSize_x * c_winSize_y);\r
+            float A11 = A11i;\r
+            float A12 = A12i;\r
+            float A22 = A22i;\r
 \r
-                if (calcErr && GET_MIN_EIGENVALS)\r
-                    err(y, x) = minEig;\r
-            \r
-                if (minEig < c_minEigThreshold || D < numeric_limits<float>::epsilon())\r
-                    return;\r
+            float D = A11 * A22 - A12 * A12;\r
 \r
-                D = 1.f / D;\r
-            \r
-                A11 *= D;\r
-                A12 *= D;\r
-                A22 *= D;\r
+            if (D < numeric_limits<float>::epsilon())\r
+            {\r
+                if (calcErr)\r
+                    err(y, x) = numeric_limits<float>::max();\r
+\r
+                return;\r
             }\r
 \r
+            D = 1.f / D;\r
+\r
+            A11 *= D;\r
+            A12 *= D;\r
+            A22 *= D;\r
+\r
             float2 nextPt;\r
-            nextPt.x = x - c_halfWin_x + u(y, x);\r
-            nextPt.y = y - c_halfWin_y + v(y, x);\r
+            nextPt.x = x + prevU(y/2, x/2) * 2.0f;\r
+            nextPt.y = y + prevV(y/2, x/2) * 2.0f;\r
 \r
             for (int k = 0; k < c_iters; ++k)\r
             {\r
-                if (nextPt.x < -c_winSize_x || nextPt.x >= cols || nextPt.y < -c_winSize_y || nextPt.y >= rows)\r
+                if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)\r
+                {\r
+                    if (calcErr)\r
+                        err(y, x) = numeric_limits<float>::max();\r
+\r
                     return;\r
+                }\r
+\r
+                int b1 = 0;\r
+                int b2 = 0;\r
 \r
-                float b1 = 0;\r
-                float b2 = 0;\r
-                \r
                 for (int i = 0; i < c_winSize_y; ++i)\r
                 {\r
                     for (int j = 0; j < c_winSize_x; ++j)\r
                     {\r
-                        int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+                        int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];\r
+                        int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);\r
 \r
-                        int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5);\r
-                        \r
-                        b1 += diff * dIdx(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
-                        b2 += diff * dIdy(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
+                        int diff = (J - I) * 32;\r
+\r
+                        int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+                        int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];\r
+\r
+                        b1 += diff * dIdx;\r
+                        b2 += diff * dIdy;\r
                     }\r
                 }\r
 \r
-                b1 *= SCALE;\r
-                b2 *= SCALE;\r
-\r
                 float2 delta;\r
                 delta.x = A12 * b2 - A22 * b1;\r
                 delta.y = A12 * b1 - A11 * b2;\r
-                    \r
+\r
                 nextPt.x += delta.x;\r
                 nextPt.y += delta.y;\r
 \r
@@ -642,57 +680,50 @@ namespace cv { namespace gpu { namespace device
                     break;\r
             }\r
 \r
-            u(y, x) = nextPt.x - x + c_halfWin_x;\r
-            v(y, x) = nextPt.y - y + c_halfWin_y;            \r
+            u(y, x) = nextPt.x - x;\r
+            v(y, x) = nextPt.y - y;\r
 \r
-            if (calcErr && !GET_MIN_EIGENVALS)\r
+            if (calcErr)\r
             {\r
-                float errval = 0.0f;\r
+                int errval = 0;\r
 \r
                 for (int i = 0; i < c_winSize_y; ++i)\r
                 {\r
                     for (int j = 0; j < c_winSize_x; ++j)\r
                     {\r
-                        int I_val = I(y - c_halfWin_y + i, x - c_halfWin_x + j);\r
-                        int diff = linearFilter(J, nextPt, j, i) - CV_DESCALE(I_val * (1 << W_BITS), W_BITS1 - 5);\r
-                        errval += ::fabsf((float)diff);\r
+                        int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];\r
+                        int J = tex2D(tex_J, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);\r
+\r
+                        errval += ::abs(J - I);\r
                     }\r
                 }\r
 \r
-                errval /= 32 * c_winSize_x_cn * c_winSize_y;\r
-\r
-                err(y, x) = errval;\r
+                err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);\r
             }\r
         }\r
 \r
-        void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy, \r
-            DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream)\r
+        void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,\r
+                         DevMem2Df err, int2 winSize, cudaStream_t stream)\r
         {\r
-            dim3 block(32, 8);\r
+            dim3 block(16, 16);\r
             dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));\r
 \r
-            if (err)\r
-            {\r
-                if (GET_MIN_EIGENVALS)\r
-                {\r
-                    cudaSafeCall( cudaFuncSetCacheConfig(lkDense<true, true>, cudaFuncCachePreferL1) );\r
+            bindTexture(&tex_I, I);\r
+            bindTexture(&tex_J, J);\r
 \r
-                    lkDense<true, true><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols);\r
-                    cudaSafeCall( cudaGetLastError() );\r
-                }\r
-                else\r
-                {\r
-                    cudaSafeCall( cudaFuncSetCacheConfig(lkDense<true, false>, cudaFuncCachePreferL1) );\r
+            int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);\r
+            const int patchWidth  = block.x + 2 * halfWin.x;\r
+            const int patchHeight = block.y + 2 * halfWin.y;\r
+            size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);\r
 \r
-                    lkDense<true, false><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, *err, I.rows, I.cols);\r
-                    cudaSafeCall( cudaGetLastError() );\r
-                }\r
+            if (err.data)\r
+            {\r
+                lkDense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);\r
+                cudaSafeCall( cudaGetLastError() );\r
             }\r
             else\r
             {\r
-                cudaSafeCall( cudaFuncSetCacheConfig(lkDense<false, false>, cudaFuncCachePreferL1) );\r
-\r
-                lkDense<false, false><<<grid, block, 0, stream>>>(I, J, dIdx, dIdy, u, v, PtrStepf(), I.rows, I.cols);\r
+                lkDense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);\r
                 cudaSafeCall( cudaGetLastError() );\r
             }\r
 \r
index d85076b..adb630c 100644 (file)
@@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace device
             const float2* prevPts, float2* nextPts, uchar* status, float* err, bool GET_MIN_EIGENVALS, int ptcount,\r
             int level, dim3 block, dim3 patch, cudaStream_t stream = 0);\r
 \r
-        void lkDense_gpu(DevMem2Db I, DevMem2Db J, DevMem2D_<short> dIdx, DevMem2D_<short> dIdy,\r
-            DevMem2Df u, DevMem2Df v, DevMem2Df* err, bool GET_MIN_EIGENVALS, cudaStream_t stream = 0);\r
+        void lkDense_gpu(DevMem2Db I, DevMem2Df J, DevMem2Df u, DevMem2Df v, DevMem2Df prevU, DevMem2Df prevV,\r
+                         DevMem2Df err, int2 winSize, cudaStream_t stream = 0);\r
     }\r
 }}}\r
 \r
@@ -160,16 +160,11 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
         return;\r
     }\r
 \r
-    derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);\r
-\r
-    iters = std::min(std::max(iters, 0), 100);\r
-\r
     const int cn = prevImg.channels();\r
 \r
     dim3 block, patch;\r
-    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);   \r
+    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);\r
 \r
-    CV_Assert(derivLambda >= 0);\r
     CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);\r
     CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());\r
     CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);\r
@@ -227,80 +222,53 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
 {\r
     using namespace cv::gpu::device::pyrlk;\r
 \r
-    derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);\r
-\r
-    iters = std::min(std::max(iters, 0), 100);\r
-\r
     CV_Assert(prevImg.type() == CV_8UC1);\r
     CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());\r
-    CV_Assert(derivLambda >= 0);\r
-    CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);\r
-\r
-    if (useInitialFlow)\r
-    {\r
-        CV_Assert(u.size() == prevImg.size() && u.type() == CV_32FC1);\r
-        CV_Assert(v.size() == prevImg.size() && v.type() == CV_32FC1);\r
-    }\r
-    else\r
-    {\r
-        u.create(prevImg.size(), CV_32FC1);\r
-        v.create(prevImg.size(), CV_32FC1);\r
-\r
-        u.setTo(Scalar::all(0));\r
-        v.setTo(Scalar::all(0));\r
-    }\r
+    CV_Assert(maxLevel >= 0);\r
+    CV_Assert(winSize.width > 2 && winSize.height > 2);\r
 \r
     if (err)\r
         err->create(prevImg.size(), CV_32FC1);\r
 \r
     // build the image pyramids.\r
-    // we pad each level with +/-winSize.{width|height}\r
-    // pixels to simplify the further patch extraction.\r
 \r
-    buildImagePyramid(prevImg, prevPyr_, true);\r
-    buildImagePyramid(nextImg, nextPyr_, true);\r
-    buildImagePyramid(u, uPyr_, false);\r
-    buildImagePyramid(v, vPyr_, false);\r
+    buildImagePyramid(prevImg, prevPyr_, false);\r
 \r
-    // dI/dx ~ Ix, dI/dy ~ Iy\r
+    nextPyr_.resize(maxLevel + 1);\r
+    nextImg.convertTo(nextPyr_[0], CV_32F);\r
+    for (int level = 1; level <= maxLevel; ++level)\r
+        pyrDown(nextPyr_[level - 1], nextPyr_[level]);\r
+\r
+    uPyr_.resize(2);\r
+    vPyr_.resize(2);\r
 \r
-    ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dx_buf_);\r
-    ensureSizeIsEnough(prevImg.rows + winSize.height * 2, prevImg.cols + winSize.width * 2, CV_16SC1, dy_buf_);\r
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);\r
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);\r
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);\r
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);\r
+    uPyr_[1].setTo(Scalar::all(0));\r
+    vPyr_[1].setTo(Scalar::all(0));\r
 \r
-    loadConstants(1, minEigThreshold, make_int2(winSize.width, winSize.height), iters);\r
+    int2 winSize2i = make_int2(winSize.width, winSize.height);\r
+    loadConstants(1, minEigThreshold, winSize2i, iters);\r
 \r
     DevMem2Df derr = err ? *err : DevMem2Df();\r
 \r
+    int idx = 0;\r
+\r
     for (int level = maxLevel; level >= 0; level--)\r
     {\r
-        Size imgSize = prevPyr_[level].size();\r
-\r
-        GpuMat dxWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dx_buf_.type(), dx_buf_.data, dx_buf_.step);\r
-        GpuMat dyWhole(imgSize.height + winSize.height * 2, imgSize.width + winSize.width * 2, dy_buf_.type(), dy_buf_.data, dy_buf_.step);\r
-        dxWhole.setTo(Scalar::all(0));\r
-        dyWhole.setTo(Scalar::all(0));\r
-        GpuMat dIdx = dxWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));\r
-        GpuMat dIdy = dyWhole(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));\r
+        int idx2 = (idx + 1) & 1;\r
 \r
-        calcSharrDeriv(prevPyr_[level], dIdx, dIdy);\r
+        lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],\r
+            level == 0 ? derr : DevMem2Df(), winSize2i);\r
 \r
-        lkDense_gpu(prevPyr_[level], nextPyr_[level], dIdx, dIdy, uPyr_[level], vPyr_[level],\r
-            level == 0 && err ? &derr : 0, getMinEigenVals);\r
-\r
-        if (level == 0)\r
-        {\r
-            uPyr_[0].copyTo(u);\r
-            vPyr_[0].copyTo(v);\r
-        }\r
-        else\r
-        {\r
-            resize(uPyr_[level], uPyr_[level - 1], uPyr_[level - 1].size());\r
-            resize(vPyr_[level], vPyr_[level - 1], vPyr_[level - 1].size());\r
-\r
-            multiply(uPyr_[level - 1], Scalar::all(2), uPyr_[level - 1]);\r
-            multiply(vPyr_[level - 1], Scalar::all(2), vPyr_[level - 1]);\r
-        }\r
+        if (level > 0)\r
+            idx = idx2;\r
     }\r
+\r
+    uPyr_[idx].copyTo(u);\r
+    vPyr_[idx].copyTo(v);\r
 }\r
 \r
 #endif /* !defined (HAVE_CUDA) */\r
index 56e3ce0..5bff8f9 100644 (file)
@@ -159,7 +159,6 @@ int main(int argc, const char* argv[])
         "{ win_size     | win_size       | 21    | specify windows size [PyrLK] }"\r
         "{ max_level    | max_level      | 3     | specify max level [PyrLK] }"\r
         "{ iters        | iters          | 30    | specify iterations count [PyrLK] }"\r
-        "{ deriv_lambda | deriv_lambda   | 0.5   | specify deriv lambda [PyrLK] }"\r
         "{ points       | points         | 4000  | specify points count [GoodFeatureToTrack] }"\r
         "{ min_dist     | min_dist       | 0     | specify minimal distance between points [GoodFeatureToTrack] }";\r
 \r
@@ -186,7 +185,6 @@ int main(int argc, const char* argv[])
     int winSize = cmd.get<int>("win_size");\r
     int maxLevel = cmd.get<int>("max_level");\r
     int iters = cmd.get<int>("iters");\r
-    double derivLambda = cmd.get<double>("deriv_lambda");\r
     int points = cmd.get<int>("points");\r
     double minDist = cmd.get<double>("min_dist");\r
 \r
@@ -235,7 +233,6 @@ int main(int argc, const char* argv[])
     d_pyrLK.winSize.height = winSize;\r
     d_pyrLK.maxLevel = maxLevel;\r
     d_pyrLK.iters = iters;\r
-    d_pyrLK.derivLambda = derivLambda;\r
 \r
     GpuMat d_frame0(frame0);\r
     GpuMat d_frame1(frame1);\r