From 681ac9beda72b7116b9c72c3d302faf2c874bdaf Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Thu, 16 Feb 2012 11:26:17 +0000 Subject: [PATCH] Added missing files --- modules/gpu/src/cuda/optical_flow_farneback.cu | 614 +++++++++++++++++++++++++ modules/gpu/src/optical_flow_farneback.cpp | 399 ++++++++++++++++ 2 files changed, 1013 insertions(+) create mode 100644 modules/gpu/src/cuda/optical_flow_farneback.cu create mode 100644 modules/gpu/src/optical_flow_farneback.cpp diff --git a/modules/gpu/src/cuda/optical_flow_farneback.cu b/modules/gpu/src/cuda/optical_flow_farneback.cu new file mode 100644 index 0000000..39e4585 --- /dev/null +++ b/modules/gpu/src/cuda/optical_flow_farneback.cu @@ -0,0 +1,614 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include
+#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
+
+#define tx threadIdx.x
+#define ty threadIdx.y
+#define bx blockIdx.x
+#define by blockIdx.y
+#define bdx blockDim.x
+#define bdy blockDim.y
+
+#define BORDER_SIZE 5
+#define MAX_KSIZE_HALF 100
+
+using namespace std;
+
+namespace cv { namespace gpu { namespace device { namespace optflow_farneback
+{
+    __constant__ float c_g[8];
+    __constant__ float c_xg[8];
+    __constant__ float c_xxg[8];
+    __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
+
+
+    template <int polyN>
+    __global__ void polynomialExpansion(
+            const int height, const int width, const PtrStepf src, PtrStepf dst)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * (bdx - 2*polyN) + tx - polyN;
+
+        if (y < height)
+        {
+            extern __shared__ float smem[];
+            volatile float *row = smem + tx;
+            int xWarped = ::min(::max(x, 0), width - 1);
+
+            row[0] = src(y, xWarped) * c_g[0];
+            row[bdx] = 0.f;
+            row[2*bdx] = 0.f;
+
+            for (int k = 1; k <= polyN; ++k)
+            {
+                float t0 = src(::max(y - k, 0), xWarped);
+                float t1 = src(::min(y + k, height - 1), xWarped);
+
+                row[0] += c_g[k] * (t0 + t1);
+                row[bdx] += c_xg[k] * (t1 - t0);
+                row[2*bdx] += c_xxg[k] * (t0 + t1);
+            }
+
+            __syncthreads();
+
+            if (tx >= polyN && tx + polyN < bdx && x < width)
+            {
+                float b1 = c_g[0] * row[0];
+                float b3 = c_g[0] * row[bdx];
+                float b5 = c_g[0] * row[2*bdx];
+                float b2 = 0, b4 = 0, b6 = 0;
+
+                for (int k = 1; k <= polyN; ++k)
+                {
+                    b1 += (row[k] + row[-k]) * c_g[k];
+                    b4 += (row[k] + row[-k]) * c_xxg[k];
+                    b2 += (row[k] - row[-k]) * c_xg[k];
+                    b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
+                    b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
+                    b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
+                }
+
+                dst(y, xWarped) = b3*c_ig11;
+                dst(height + y, xWarped) = b2*c_ig11;
+                dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
+                dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
+                dst(4*height + y, xWarped) = b6*c_ig55;
+            }
+        }
+    }
+
+
+    void setPolinomialExpansionConsts(
+            int polyN, const float *g, const float *xg, const float *xxg,
+            float ig11, float ig03, float ig33, float ig55)
+    {
+        cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
+        cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
+    }
+
+
+    void polynomialExpansionGpu(const DevMem2Df &src, int polyN, DevMem2Df dst, cudaStream_t stream)
+    {
+        dim3 block(256);
+        dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
+        int smem = 3 * block.x * sizeof(float);
+
+        if (polyN == 5)
+            polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
+        else if (polyN == 7)
+            polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    __constant__ float c_border[BORDER_SIZE + 1];
+
+    __global__ void updateMatrices(
+            const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
+            const PtrStepf R0, const PtrStepf R1, PtrStepf M)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        if (y < height && x < width)
+        {
+            float dx = flowx(y, x);
+            float dy = flowy(y, x);
+            float fx = x + dx;
+            float fy = y + dy;
+
+            int x1 = floorf(fx);
+            int y1 = floorf(fy);
+            fx -= x1; fy -= y1;
+
+            float r2, r3, r4, r5, r6;
+
+            if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
+            {
+                float a00 = (1.f - fx) * (1.f - fy);
+                float a01 = fx * (1.f - fy);
+                float a10 = (1.f - fx) * fy;
+                float a11 = fx * fy;
+
+                r2 = a00 * R1(y1, x1) +
+                     a01 * R1(y1, x1 + 1) +
+                     a10 * R1(y1 + 1, x1) +
+                     a11 * R1(y1 + 1, x1 + 1);
+
+                r3 = a00 * R1(height + y1, x1) +
+                     a01 * R1(height + y1, x1 + 1) +
+                     a10 * R1(height + y1 + 1, x1) +
+                     a11 * R1(height + y1 + 1, x1 + 1);
+
+                r4 = a00 * R1(2*height + y1, x1) +
+                     a01 * R1(2*height + y1, x1 + 1) +
+                     a10 * R1(2*height + y1 + 1, x1) +
+                     a11 * R1(2*height + y1 + 1, x1 + 1);
+
+                r5 = a00 * R1(3*height + y1, x1) +
+                     a01 * R1(3*height + y1, x1 + 1) +
+                     a10 * R1(3*height + y1 + 1, x1) +
+                     a11 * R1(3*height + y1 + 1, x1 + 1);
+
+                r6 = a00 * R1(4*height + y1, x1) +
+                     a01 * R1(4*height + y1, x1 + 1) +
+                     a10 * R1(4*height + y1 + 1, x1) +
+                     a11 * R1(4*height + y1 + 1, x1 + 1);
+
+                r4 = (R0(2*height + y, x) + r4) * 0.5f;
+                r5 = (R0(3*height + y, x) + r5) * 0.5f;
+                r6 = (R0(4*height + y, x) + r6) * 0.25f;
+            }
+            else
+            {
+                r2 = r3 = 0.f;
+                r4 = R0(2*height + y, x);
+                r5 = R0(3*height + y, x);
+                r6 = R0(4*height + y, x) * 0.5f;
+            }
+
+            r2 = (R0(y, x) - r2) * 0.5f;
+            r3 = (R0(height + y, x) - r3) * 0.5f;
+
+            r2 += r4*dy + r6*dx;
+            r3 += r6*dy + r5*dx;
+
+            float scale =
+                    c_border[::min(x, BORDER_SIZE)] *
+                    c_border[::min(y, BORDER_SIZE)] *
+                    c_border[::min(width - x - 1, BORDER_SIZE)] *
+                    c_border[::min(height - y - 1, BORDER_SIZE)];
+
+            r2 *= scale; r3 *= scale; r4 *= scale;
+            r5 *= scale; r6 *= scale;
+
+            M(y, x) = r4*r4 + r6*r6;
+            M(height + y, x) = (r4 + r5)*r6;
+            M(2*height + y, x) = r5*r5 + r6*r6;
+            M(3*height + y, x) = r4*r2 + r6*r3;
+            M(4*height + y, x) = r6*r2 + r5*r3;
+        }
+    }
+
+
+    void setUpdateMatricesConsts()
+    {
+        static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
+        cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
+    }
+
+
+    void updateMatricesGpu(
+            const DevMem2Df flowx, const DevMem2Df flowy, const DevMem2Df R0, const DevMem2Df R1,
+            DevMem2Df M, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
+
+        updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    __global__ void updateFlow(
+            const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        if (y < height && x < width)
+        {
+            float g11 = M(y, x);
+            float g12 = M(height + y, x);
+            float g22 = M(2*height + y, x);
+            float h1 = M(3*height + y, x);
+            float h2 = M(4*height + y, x);
+
+            float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
+
+            flowx(y, x) = (g11*h2 - g12*h1) * detInv;
+            flowy(y, x) = (g22*h1 - g12*h2) * detInv;
+        }
+    }
+
+
+    void updateFlowGpu(const DevMem2Df M, DevMem2Df flowx, DevMem2Df flowy, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
+
+        updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    /*__global__ void boxFilter(
+            const int height, const int width, const PtrStepf src,
+            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        extern __shared__ float smem[];
+        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
+
+        if (y < height)
+        {
+            // Vertical pass
+            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
+            {
+                int xExt = int(bx * bdx) + i - ksizeHalf;
+                xExt = ::min(::max(xExt, 0), width - 1);
+
+                row[i] = src(y, xExt);
+                for (int j = 1; j <= ksizeHalf; ++j)
+                    row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
+            }
+
+            if (x < width)
+            {
+                __syncthreads();
+
+                // Horizontal pass
+                row += tx + ksizeHalf;
+                float res = row[0];
+                for (int i = 1; i <= ksizeHalf; ++i)
+                    res += row[-i] + row[i];
+                dst(y, x) = res * boxAreaInv;
+            }
+        }
+    }
+
+
+    void boxFilterGpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    {
+        dim3 block(256);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
+
+        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
+        boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }*/
+
+
+    __global__ void boxFilter5(
+            const int height, const int width, const PtrStepf src,
+            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        extern __shared__ float smem[];
+
+        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
+        volatile float *row = smem + 5 * ty * smw;
+
+        if (y < height)
+        {
+            // Vertical pass
+            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
+            {
+                int xExt = int(bx * bdx) + i - ksizeHalf;
+                xExt = ::min(::max(xExt, 0), width - 1);
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    row[k*smw + i] = src(k*height + y, xExt);
+
+                for (int j = 1; j <= ksizeHalf; ++j)
+                    #pragma unroll
+                    for (int k = 0; k < 5; ++k)
+                        row[k*smw + i] +=
+                                src(k*height + ::max(y - j, 0), xExt) +
+                                src(k*height + ::min(y + j, height - 1), xExt);
+            }
+
+            if (x < width)
+            {
+                __syncthreads();
+
+                // Horizontal pass
+
+                row += tx + ksizeHalf;
+                float res[5];
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    res[k] = row[k*smw];
+
+                for (int i = 1; i <= ksizeHalf; ++i)
+                    #pragma unroll
+                    for (int k = 0; k < 5; ++k)
+                        res[k] += row[k*smw - i] + row[k*smw + i];
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    dst(k*height + y, x) = res[k] * boxAreaInv;
+            }
+        }
+    }
+
+
+    void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    {
+        int height = src.rows / 5;
+        int width = src.cols;
+
+        dim3 block(256);
+        dim3 grid(divUp(width, block.x), divUp(height, block.y));
+        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
+
+        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
+        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    __constant__ float c_gKer[MAX_KSIZE_HALF + 1];
+
+    template <typename Border>
+    __global__ void gaussianBlur(
+            const int height, const int width, const PtrStepf src, const int ksizeHalf,
+            const Border b, PtrStepf dst)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        extern __shared__ float smem[];
+        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
+
+        if (y < height)
+        {
+            // Vertical pass
+            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
+            {
+                int xExt = int(bx * bdx) + i - ksizeHalf;
+                xExt = b.idx_col(xExt);
+                row[i] = src(y, xExt) * c_gKer[0];
+                for (int j = 1; j <= ksizeHalf; ++j)
+                    row[i] +=
+                            (src(b.idx_row_low(y - j), xExt) +
+                             src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
+            }
+
+            if (x < width)
+            {
+                __syncthreads();
+
+                // Horizontal pass
+                row += tx + ksizeHalf;
+                float res = row[0] * c_gKer[0];
+                for (int i = 1; i <= ksizeHalf; ++i)
+                    res += (row[-i] + row[i]) * c_gKer[i];
+                dst(y, x) = res;
+            }
+        }
+    }
+
+
+    void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
+    {
+        cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
+    }
+
+
+    template <typename Border>
+    void gaussianBlurCaller(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    {
+        int height = src.rows;
+        int width = src.cols;
+
+        dim3 block(256);
+        dim3 grid(divUp(width, block.x), divUp(height, block.y));
+        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
+        Border b(height, width);
+
+        gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    void gaussianBlurGpu(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+
+        static const caller_t callers[] =
+        {
+            gaussianBlurCaller<BrdReflect101<float> >,
+            gaussianBlurCaller<BrdReplicate<float> >,
+        };
+
+        callers[borderMode](src, ksizeHalf, dst, stream);
+    }
+
+
+    template <typename Border>
+    __global__ void gaussianBlur5(
+            const int height, const int width, const PtrStepf src, const int ksizeHalf,
+            const Border b, PtrStepf dst)
+    {
+        const int y = by * bdy + ty;
+        const int x = bx * bdx + tx;
+
+        extern __shared__ float smem[];
+
+        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
+        volatile float *row = smem + 5 * ty * smw;
+
+        if (y < height)
+        {
+            // Vertical pass
+            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
+            {
+                int xExt = int(bx * bdx) + i - ksizeHalf;
+                xExt = b.idx_col(xExt);
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
+
+                for (int j = 1; j <= ksizeHalf; ++j)
+                    #pragma unroll
+                    for (int k = 0; k < 5; ++k)
+                        row[k*smw + i] +=
+                                (src(k*height + b.idx_row_low(y - j), xExt) +
+                                 src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
+            }
+
+            if (x < width)
+            {
+                __syncthreads();
+
+                // Horizontal pass
+
+                row += tx + ksizeHalf;
+                float res[5];
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    res[k] = row[k*smw] * c_gKer[0];
+
+                for (int i = 1; i <= ksizeHalf; ++i)
+                    #pragma unroll
+                    for (int k = 0; k < 5; ++k)
+                        res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
+
+                #pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    dst(k*height + y, x) = res[k];
+            }
+        }
+    }
+
+
+    template <typename Border>
+    void gaussianBlur5Caller(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    {
+        int height = src.rows / 5;
+        int width = src.cols;
+
+        dim3 block(256);
+        dim3 grid(divUp(width, block.x), divUp(height, block.y));
+        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
+        Border b(height, width);
+
+        gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
+
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+
+    void gaussianBlur5Gpu(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+
+        static const caller_t callers[] =
+        {
+            gaussianBlur5Caller<BrdReflect101<float> >,
+            gaussianBlur5Caller<BrdReplicate<float> >,
+        };
+
+        callers[borderMode](src, ksizeHalf, dst, stream);
+    }
+
+}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
+
diff --git a/modules/gpu/src/optical_flow_farneback.cpp b/modules/gpu/src/optical_flow_farneback.cpp
new file mode 100644
index 0000000..85ebfbc
--- /dev/null
+++ b/modules/gpu/src/optical_flow_farneback.cpp
@@ -0,0 +1,399 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include
+
+#define MIN_SIZE 32
+
+#define S(x) StreamAccessor::getStream(x)
+
+// GPU resize() is fast, but it differs from the CPU analog. Disabling this flag
+// leads to inefficient code. It's for debug purposes only.
+#define ENABLE_GPU_RESIZE 1
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined(HAVE_CUDA)
+
+void cv::gpu::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+#else
+
+namespace cv { namespace gpu { namespace device { namespace optflow_farneback
+{
+    void setPolinomialExpansionConsts(
+            int polyN, const float *g, const float *xg, const float *xxg,
+            float ig11, float ig03, float ig33, float ig55);
+
+    void polynomialExpansionGpu(const DevMem2Df &src, int polyN, DevMem2Df dst, cudaStream_t stream);
+
+    void setUpdateMatricesConsts();
+
+    void updateMatricesGpu(
+            const DevMem2Df flowx, const DevMem2Df flowy, const DevMem2Df R0, const DevMem2Df R1,
+            DevMem2Df M, cudaStream_t stream);
+
+    void updateFlowGpu(
+            const DevMem2Df M, DevMem2Df flowx, DevMem2Df flowy, cudaStream_t stream);
+
+    /*void boxFilterGpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);*/
+
+    void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
+
+    void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
+
+    void gaussianBlurGpu(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+
+    void gaussianBlur5Gpu(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+
+}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
+
+
+void cv::gpu::FarnebackOpticalFlow::prepareGaussian(
+        int n, double sigma, float *g, float *xg, float *xxg,
+        double &ig11, double &ig03, double &ig33, double &ig55)
+{
+    double s = 0.;
+    for (int x = -n; x <= n; x++)
+    {
+        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
+        s += g[x];
+    }
+
+    s = 1./s;
+    for (int x = -n; x <= n; x++)
+    {
+        g[x] = (float)(g[x]*s);
+        xg[x] = (float)(x*g[x]);
+        xxg[x] = (float)(x*x*g[x]);
+    }
+
+    Mat_<double> G(6, 6);
+    G.setTo(0);
+
+    for (int y = -n; y <= n; y++)
+    {
+        for (int x = -n; x <= n; x++)
+        {
+            G(0,0) += g[y]*g[x];
+            G(1,1) += g[y]*g[x]*x*x;
+            G(3,3) += g[y]*g[x]*x*x*x*x;
+            G(5,5) += g[y]*g[x]*x*x*y*y;
+        }
+    }
+
+    //G[0][0] = 1.;
+    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
+    G(4,4) = G(3,3);
+    G(3,4) = G(4,3) = G(5,5);
+
+    // invG:
+    // [ x        e  e    ]
+    // [    y             ]
+    // [       y          ]
+    // [ e        z       ]
+    // [ e           z    ]
+    // [                u ]
+    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
+
+    ig11 = invG(1,1);
+    ig03 = invG(0,3);
+    ig33 = invG(3,3);
+    ig55 = invG(5,5);
+}
+
+
+void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
+{
+    vector<float> buf(n*6 + 3);
+    float* g = &buf[0] + n;
+    float* xg = g + n*2 + 1;
+    float* xxg = xg + n*2 + 1;
+
+    if (sigma < FLT_EPSILON)
+        sigma = n*0.3;
+
+    double ig11, ig03, ig33, ig55;
+    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
+
+    device::optflow_farneback::setPolinomialExpansionConsts(n, g, xg, xxg, ig11, ig03, ig33, ig55);
+}
+
+
+void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
+        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+{
+    device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
+    swap(M, bufM);
+
+    for (int i = 1; i < 5; ++i)
+        streams[i].waitForCompletion();
+    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
+
+    if (updateMatrices)
+        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
+}
+
+
+void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
+        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+{
+    device::optflow_farneback::gaussianBlur5Gpu(
+            M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
+    swap(M, bufM);
+
+    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
+
+    if (updateMatrices)
+        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
+}
+
+
+void cv::gpu::FarnebackOpticalFlow::operator ()(
+        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
+{
+    CV_Assert(frame0.type() == CV_8U && frame1.type() == CV_8U);
+    CV_Assert(frame0.size() == frame1.size());
+    CV_Assert(polyN == 5 || polyN == 7);
+    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
+
+    Stream streams[5];
+    if (S(s))
+        streams[0] = s;
+
+    Size size = frame0.size();
+    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
+
+    flowx.create(size, CV_32F);
+    flowy.create(size, CV_32F);
+    GpuMat flowx0 = flowx;
+    GpuMat flowy0 = flowy;
+
+    // Crop unnecessary levels
+    double scale = 1;
+    int numLevelsCropped = 0;
+    for (; numLevelsCropped < numLevels; numLevelsCropped++)
+    {
+        scale *= pyrScale;
+        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
+            break;
+    }
+
+    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
+    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);
+
+    if (fastPyramids)
+    {
+        // Build Gaussian pyramids using pyrDown()
+        pyramid0_.resize(numLevelsCropped + 1);
+        pyramid1_.resize(numLevelsCropped + 1);
+        pyramid0_[0] = frames_[0];
+        pyramid1_[0] = frames_[1];
+        for (int i = 1; i <= numLevelsCropped; ++i)
+        {
+            pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
+            pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
+        }
+    }
+
+    setPolynomialExpansionConsts(polyN, polySigma);
+    device::optflow_farneback::setUpdateMatricesConsts();
+
+    for (int k = numLevelsCropped; k >= 0; k--)
+    {
+        streams[0].waitForCompletion();
+
+        scale = 1;
+        for (int i = 0; i < k; i++)
+            scale *= pyrScale;
+
+        double sigma = (1./scale - 1) * 0.5;
+        int smoothSize = cvRound(sigma*5) | 1;
+        smoothSize = std::max(smoothSize, 3);
+
+        int width = cvRound(size.width*scale);
+        int height = cvRound(size.height*scale);
+
+        if (fastPyramids)
+        {
+            width = pyramid0_[k].cols;
+            height = pyramid0_[k].rows;
+        }
+
+        if (k > 0)
+        {
+            curFlowX.create(height, width, CV_32F);
+            curFlowY.create(height, width, CV_32F);
+        }
+        else
+        {
+            curFlowX = flowx0;
+            curFlowY = flowy0;
+        }
+
+        if (!prevFlowX.data)
+        {
+            if (flags & OPTFLOW_USE_INITIAL_FLOW)
+            {
+#if ENABLE_GPU_RESIZE
+                resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
+                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
+#else
+                Mat tmp1, tmp2;
+                flowx0.download(tmp1);
+                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
+                tmp2 *= scale;
+                curFlowX.upload(tmp2);
+                flowy0.download(tmp1);
+                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
+                tmp2 *= scale;
+                curFlowY.upload(tmp2);
+#endif
+            }
+            else
+            {
+                streams[0].enqueueMemSet(curFlowX, 0);
+                streams[1].enqueueMemSet(curFlowY, 0);
+            }
+        }
+        else
+        {
+#if ENABLE_GPU_RESIZE
+            resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+            resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
+            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
+#else
+            Mat tmp1, tmp2;
+            prevFlowX.download(tmp1);
+            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
+            tmp2 *= 1./pyrScale;
+            curFlowX.upload(tmp2);
+            prevFlowY.download(tmp1);
+            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
+            tmp2 *= 1./pyrScale;
+            curFlowY.upload(tmp2);
+#endif
+        }
+
+        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
+        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
+        GpuMat R[2] =
+        {
+            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
+            allocMatFromBuf(5*height, width, CV_32F, R_[1])
+        };
+
+        if (fastPyramids)
+        {
+            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
+            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
+        }
+        else
+        {
+            GpuMat tmp[2] =
+            {
+                allocMatFromBuf(size.height, size.width, CV_32F, tmp_[0]),
+                allocMatFromBuf(size.height, size.width, CV_32F, tmp_[1])
+            };
+            GpuMat I[2] =
+            {
+                allocMatFromBuf(height, width, CV_32F, I_[0]),
+                allocMatFromBuf(height, width, CV_32F, I_[1])
+            };
+
+            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
+            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
+
+            for (int i = 0; i < 2; i++)
+            {
+                device::optflow_farneback::gaussianBlurGpu(
+                        frames_[i], smoothSize/2, tmp[i], BORDER_REFLECT101_GPU, S(streams[i]));
+#if ENABLE_GPU_RESIZE
+                resize(tmp[i], I[i], Size(width, height), INTER_LINEAR, streams[i]);
+#else
+                Mat tmp1, tmp2;
+                tmp[i].download(tmp1);
+                resize(tmp1, tmp2, Size(width, height), INTER_LINEAR);
+                I[i].upload(tmp2);
+#endif
+                device::optflow_farneback::polynomialExpansionGpu(I[i], polyN, R[i], S(streams[i]));
+            }
+        }
+
+        streams[1].waitForCompletion();
+        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
+
+        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
+        {
+            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
+            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
+        }
+        for (int i = 0; i < numIters; i++)
+        {
+            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
+                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
+            else
+                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
+        }
+
+        prevFlowX = curFlowX;
+        prevFlowY = curFlowY;
+    }
+
+    flowx = curFlowX;
+    flowy = curFlowY;
+
+    if (!S(s))
+        streams[0].waitForCompletion();
+}
+
+#endif
-- 
2.7.4
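For reviewers who want to exercise the new files, a minimal host-side usage sketch follows. It is not part of the patch: it assumes the FarnebackOpticalFlow class declaration in gpu.hpp (not shown here) exposes the fields referenced by optical_flow_farneback.cpp (numLevels, pyrScale, winSize, numIters, polyN, polySigma) and declares operator() with a defaulted Stream argument; the image file names are placeholders.

// Hedged usage sketch -- not part of the patch. Assumes the class members
// referenced above are public and that operator() defaults its Stream
// parameter; "frame0.png"/"frame1.png" are placeholder inputs.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Two consecutive 8-bit grayscale frames of the same size,
    // as required by the CV_Assert checks in operator().
    cv::Mat f0 = cv::imread("frame0.png", 0);
    cv::Mat f1 = cv::imread("frame1.png", 0);

    cv::gpu::GpuMat d_f0(f0), d_f1(f1), d_flowx, d_flowy;

    cv::gpu::FarnebackOpticalFlow flow;
    flow.numLevels = 5;
    flow.pyrScale  = 0.5;   // fastPyramids would require pyrScale == 0.5
    flow.winSize   = 13;
    flow.numIters  = 10;
    flow.polyN     = 5;     // only 5 or 7 pass the assertion
    flow.polySigma = 1.1;

    flow(d_f0, d_f1, d_flowx, d_flowy);  // default stream: returns after synchronization

    cv::Mat flowx, flowy;
    d_flowx.download(flowx);
    d_flowy.download(flowy);
    return 0;
}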