1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #if !defined CUDA_DISABLER
45 #include "internal_shared.hpp"
46 #include "opencv2/gpu/device/common.hpp"
47 #include "opencv2/gpu/device/border_interpolate.hpp"
49 #define tx threadIdx.x
50 #define ty threadIdx.y
53 #define bdx blockDim.x
54 #define bdy blockDim.y
57 #define MAX_KSIZE_HALF 100
59 namespace cv { namespace gpu { namespace device { namespace optflow_farneback
// Constant-memory coefficients for the polynomial-expansion kernel.
// Filled from the host by setPolynomialExpansionConsts() below; indices
// 0..polyN are used, and the kernel is instantiated with polyN = 5 or 7
// (see polynomialExpansionGpu), hence the fixed size of 8.
// c_g   : 1-D Gaussian weights g[k]
// c_xg  : x-weighted Gaussian weights xg[k]
// c_xxg : x^2-weighted Gaussian weights xxg[k]
// c_ig11/c_ig03/c_ig33/c_ig55: elements of the inverse Gram matrix used to
// turn the filter responses b1..b6 into polynomial coefficients.
61 __constant__ float c_g[8];
62 __constant__ float c_xg[8];
63 __constant__ float c_xxg[8];
64 __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
// Per-pixel quadratic polynomial expansion (Farneback).  For each pixel it
// computes filter responses b1..b6 by a separable convolution (vertical pass
// into shared memory, horizontal pass over the shared rows), then projects
// them through the inverse Gram coefficients c_ig* and writes 5 coefficient
// planes stacked vertically in dst (rows [0,h), [h,2h), ... [4h,5h)).
// NOTE(review): this is a template over the compile-time half-size polyN
// (instantiated with 5 and 7 in polynomialExpansionGpu); the `template` line
// is not visible in this excerpt.
// Shared memory: 3 * bdx floats (one row each for the g-, xg- and
// xxg-filtered vertical results), sized by the host wrapper.
68 __global__ void polynomialExpansion(
69 const int height, const int width, const PtrStepf src, PtrStepf dst)
// Each block produces bdx - 2*polyN output columns; the polyN columns on
// each side of the block are halo threads that only help fill shared memory.
71 const int y = by * bdy + ty;
72 const int x = bx * (bdx - 2*polyN) + tx - polyN;
76 extern __shared__ float smem[];
77 volatile float *row = smem + tx;
// Replicate-border clamp for the horizontal coordinate.
78 int xWarped = ::min(::max(x, 0), width - 1);
// Vertical pass: accumulate the three 1-D responses for this column.
// NOTE(review): the zero-initialization of row[bdx] and row[2*bdx], and the
// __syncthreads() barriers around the shared-memory phases, are on lines not
// visible in this excerpt.
80 row[0] = src(y, xWarped) * c_g[0];
84 for (int k = 1; k <= polyN; ++k)
86 float t0 = src(::max(y - k, 0), xWarped);
87 float t1 = src(::min(y + k, height - 1), xWarped);
// Symmetric (g, xxg) and antisymmetric (xg) vertical accumulation with
// replicate clamping at the top/bottom borders.
89 row[0] += c_g[k] * (t0 + t1);
90 row[bdx] += c_xg[k] * (t1 - t0);
91 row[2*bdx] += c_xxg[k] * (t0 + t1);
// Horizontal pass: only interior (non-halo) threads with a valid x produce
// output; they read +-polyN neighbors from the three shared rows.
96 if (tx >= polyN && tx + polyN < bdx && x < width)
98 float b1 = c_g[0] * row[0];
99 float b3 = c_g[0] * row[bdx];
100 float b5 = c_g[0] * row[2*bdx];
101 float b2 = 0, b4 = 0, b6 = 0;
103 for (int k = 1; k <= polyN; ++k)
105 b1 += (row[k] + row[-k]) * c_g[k];
106 b4 += (row[k] + row[-k]) * c_xxg[k];
107 b2 += (row[k] - row[-k]) * c_xg[k];
108 b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
109 b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
110 b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
// Project responses through the inverse Gram matrix and write the five
// coefficient planes (stacked by multiples of `height`).
113 dst(y, xWarped) = b3*c_ig11;
114 dst(height + y, xWarped) = b2*c_ig11;
115 dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
116 dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
117 dst(4*height + y, xWarped) = b6*c_ig55;
// Host-side setup: uploads the polynomial-expansion filter weights and
// inverse-Gram coefficients into the __constant__ symbols used by the
// polynomialExpansion kernel.  Must be called before polynomialExpansionGpu.
// polyN: filter half-size (5 or 7); arrays g/xg/xxg each hold polyN+1 taps.
123 void setPolynomialExpansionConsts(
124 int polyN, const float *g, const float *xg, const float *xxg,
125 float ig11, float ig03, float ig33, float ig55)
// polyN+1 taps per filter (index 0 is the center tap).
127 cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
128 cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
129 cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
130 cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
131 cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
132 cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
133 cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
// Host wrapper: launches the templated polynomialExpansion kernel on `src`
// (single-plane float image), writing 5 stacked coefficient planes to `dst`.
// The grid x-dimension overlaps blocks by 2*polyN columns because each block
// only produces block.x - 2*polyN outputs (halo threads load-only).
// NOTE(review): the `dim3 block(...)` declaration and the polyN==5/==7
// dispatch `if/else` are on lines not visible in this excerpt.
137 void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
140 dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
// Dynamic shared memory: 3 rows of block.x floats (see kernel).
141 int smem = 3 * block.x * sizeof(float);
144 polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
146 polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
// Catch launch-configuration errors immediately.
148 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line,
// matching the other wrappers in this file - confirm against full source.
151 cudaSafeCall(cudaDeviceSynchronize());
155 __constant__ float c_border[BORDER_SIZE + 1];
// Builds the 5-plane linear-system matrix M for the Farneback flow update.
// For each pixel: samples the second frame's polynomial coefficients R1 at
// the flow-displaced position (bilinear interpolation), averages them with
// the first frame's coefficients R0, attenuates near the image border via
// c_border, and writes the normal-equation entries
// (G = [g11 g12; g12 g22], h = [h1; h2]) into the stacked planes of M.
// R0/R1 layout: 5 planes of `height` rows each, as written by
// polynomialExpansion; M uses the same stacked layout.
157 __global__ void updateMatrices(
158 const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
159 const PtrStepf R0, const PtrStepf R1, PtrStepf M)
161 const int y = by * bdy + ty;
162 const int x = bx * bdx + tx;
// Tail guard: grid may overshoot the image.
164 if (y < height && x < width)
166 float dx = flowx(y, x);
167 float dy = flowy(y, x);
// NOTE(review): the computation of the displaced integer coords (x1, y1)
// and the bilinear fractions (fx, fy) from x+dx, y+dy is on elided lines.
175 float r2, r3, r4, r5, r6;
// Displaced sample fully inside the image: bilinear-interpolate all five
// R1 planes with weights a00..a11.
177 if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
179 float a00 = (1.f - fx) * (1.f - fy);
180 float a01 = fx * (1.f - fy);
181 float a10 = (1.f - fx) * fy;
// NOTE(review): `float a11 = fx * fy;` is on an elided line.
184 r2 = a00 * R1(y1, x1) +
185 a01 * R1(y1, x1 + 1) +
186 a10 * R1(y1 + 1, x1) +
187 a11 * R1(y1 + 1, x1 + 1);
189 r3 = a00 * R1(height + y1, x1) +
190 a01 * R1(height + y1, x1 + 1) +
191 a10 * R1(height + y1 + 1, x1) +
192 a11 * R1(height + y1 + 1, x1 + 1);
194 r4 = a00 * R1(2*height + y1, x1) +
195 a01 * R1(2*height + y1, x1 + 1) +
196 a10 * R1(2*height + y1 + 1, x1) +
197 a11 * R1(2*height + y1 + 1, x1 + 1);
199 r5 = a00 * R1(3*height + y1, x1) +
200 a01 * R1(3*height + y1, x1 + 1) +
201 a10 * R1(3*height + y1 + 1, x1) +
202 a11 * R1(3*height + y1 + 1, x1 + 1);
204 r6 = a00 * R1(4*height + y1, x1) +
205 a01 * R1(4*height + y1, x1 + 1) +
206 a10 * R1(4*height + y1 + 1, x1) +
207 a11 * R1(4*height + y1 + 1, x1 + 1);
// Average the quadratic-term planes of both frames (r6 gets an extra 0.5
// factor folded in: 0.25 = 0.5 * 0.5).
209 r4 = (R0(2*height + y, x) + r4) * 0.5f;
210 r5 = (R0(3*height + y, x) + r5) * 0.5f;
211 r6 = (R0(4*height + y, x) + r6) * 0.25f;
// Else-branch (displaced sample outside the image): fall back to frame-0
// coefficients only.  NOTE(review): the `else {` line and any r2/r3
// handling in that branch are elided here.
216 r4 = R0(2*height + y, x);
217 r5 = R0(3*height + y, x);
218 r6 = R0(4*height + y, x) * 0.5f;
// Linear-term difference between the frames (the "delta-b" of the update).
221 r2 = (R0(y, x) - r2) * 0.5f;
222 r3 = (R0(height + y, x) - r3) * 0.5f;
// Border taper: product of the attenuation weights for the distance to
// each of the four image edges (clamped at BORDER_SIZE).
// NOTE(review): the `float scale =` start of this statement is elided.
228 c_border[::min(x, BORDER_SIZE)] *
229 c_border[::min(y, BORDER_SIZE)] *
230 c_border[::min(width - x - 1, BORDER_SIZE)] *
231 c_border[::min(height - y - 1, BORDER_SIZE)];
233 r2 *= scale; r3 *= scale; r4 *= scale;
234 r5 *= scale; r6 *= scale;
// Accumulate normal-equation terms: planes 0-2 are G (g11, g12, g22),
// planes 3-4 are the right-hand side h (h1, h2).
236 M(y, x) = r4*r4 + r6*r6;
237 M(height + y, x) = (r4 + r5)*r6;
238 M(2*height + y, x) = r5*r5 + r6*r6;
239 M(3*height + y, x) = r4*r2 + r6*r3;
240 M(4*height + y, x) = r6*r2 + r5*r3;
// Host-side setup: uploads the fixed border-taper table into c_border.
// Weights ramp from 0.14 at the very edge up to 1.0 in the interior,
// damping the flow update near image borders where data is unreliable.
245 void setUpdateMatricesConsts()
247 static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
248 cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
// Host wrapper for the updateMatrices kernel: one thread per pixel of the
// flow field.  flowx/flowy give the current flow estimate; R0/R1 are the
// 5-plane polynomial expansions of the two frames; M receives the 5-plane
// normal-equation data.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
252 void updateMatricesGpu(
253 const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
254 PtrStepSzf M, cudaStream_t stream)
257 dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
259 updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
// Catch launch-configuration errors immediately.
261 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
264 cudaSafeCall(cudaDeviceSynchronize());
// Solves the per-pixel 2x2 linear system G * d = h assembled in M
// (after smoothing) and writes the resulting displacement d into
// flowx/flowy.  M layout: planes 0..4 = g11, g12, g22, h1, h2.
268 __global__ void updateFlow(
269 const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
271 const int y = by * bdy + ty;
272 const int x = bx * bdx + tx;
// Tail guard: grid may overshoot the image.
274 if (y < height && x < width)
// NOTE(review): `float g11 = M(y, x);` is on an elided line (g11 is used
// below).
277 float g12 = M(height + y, x);
278 float g22 = M(2*height + y, x);
279 float h1 = M(3*height + y, x);
280 float h2 = M(4*height + y, x);
// Cramer's-rule inverse of G with a small regularizer (1e-3) to avoid
// division by a (near-)zero determinant in textureless regions.
282 float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
284 flowx(y, x) = (g11*h2 - g12*h1) * detInv;
285 flowy(y, x) = (g22*h1 - g12*h2) * detInv;
// Host wrapper for the updateFlow kernel: one thread per flow pixel.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
290 void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
293 dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
295 updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
// Catch launch-configuration errors immediately.
297 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
300 cudaSafeCall(cudaDeviceSynchronize());
// Separable box filter over a single-plane float image: vertical sums are
// staged in shared memory (one row of bdx + 2*ksizeHalf floats per
// block-row), then each thread sums 2*ksizeHalf + 1 horizontal neighbors
// and scales by boxAreaInv (1 / window area).
// NOTE(review): this definition opens with `/*` and its closing `*/` is not
// visible in this excerpt - it may be commented out in the original file;
// however boxFilterGpu below launches a `boxFilter` kernel, so confirm
// against the full source.
304 /*__global__ void boxFilter(
305 const int height, const int width, const PtrStepf src,
306 const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
308 const int y = by * bdy + ty;
309 const int x = bx * bdx + tx;
311 extern __shared__ float smem[];
// Each thread-row of the block owns one padded shared row.
312 volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
// Cooperative fill: threads stride across the padded width, each computing
// the vertical (column) sum for its shared-memory slot, with replicate
// clamping at the image edges.
317 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
319 int xExt = int(bx * bdx) + i - ksizeHalf;
320 xExt = ::min(::max(xExt, 0), width - 1);
322 row[i] = src(y, xExt);
323 for (int j = 1; j <= ksizeHalf; ++j)
324 row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
// Horizontal pass.  NOTE(review): the __syncthreads() between the shared
// write and read phases, the bounds guard, and `float res = row[0];` are
// on lines not visible in this excerpt.
332 row += tx + ksizeHalf;
334 for (int i = 1; i <= ksizeHalf; ++i)
335 res += row[-i] + row[i];
336 dst(y, x) = res * boxAreaInv;
// Host wrapper for the single-plane box filter.  Shared memory holds one
// padded row of block.x + 2*ksizeHalf floats per block-row.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
342 void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
345 dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
346 int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
// Normalizer: 1 / (window width * window height), window = 2*ksizeHalf + 1.
348 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
349 boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
// Catch launch-configuration errors immediately.
351 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
354 cudaSafeCall(cudaDeviceSynchronize());
// Box filter applied simultaneously to all 5 stacked planes of the
// normal-equation matrix M (same 5*height layout as updateMatrices output).
// Vertical sums for the 5 planes are staged in shared memory (5 padded rows
// of smw floats per block-row), then each thread does the horizontal sum
// per plane and scales by boxAreaInv.
358 __global__ void boxFilter5(
359 const int height, const int width, const PtrStepf src,
360 const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
362 const int y = by * bdy + ty;
363 const int x = bx * bdx + tx;
365 extern __shared__ float smem[];
367 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
// 5 consecutive shared rows (one per plane) per block-row.
368 volatile float *row = smem + 5 * ty * smw;
// Cooperative fill: column sums for all 5 planes, replicate-clamped rows.
373 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
375 int xExt = int(bx * bdx) + i - ksizeHalf;
376 xExt = ::min(::max(xExt, 0), width - 1);
379 for (int k = 0; k < 5; ++k)
380 row[k*smw + i] = src(k*height + y, xExt);
382 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[k*smw + i] +=` left-hand side of this
// accumulation is on an elided line.
384 for (int k = 0; k < 5; ++k)
386 src(k*height + ::max(y - j, 0), xExt) +
387 src(k*height + ::min(y + j, height - 1), xExt);
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, bounds
// guard, and the `float res[5];` initialization loop body are on lines not
// visible in this excerpt.
396 row += tx + ksizeHalf;
400 for (int k = 0; k < 5; ++k)
403 for (int i = 1; i <= ksizeHalf; ++i)
405 for (int k = 0; k < 5; ++k)
406 res[k] += row[k*smw - i] + row[k*smw + i];
// Write the filtered value for each of the 5 planes.
409 for (int k = 0; k < 5; ++k)
410 dst(k*height + y, x) = res[k] * boxAreaInv;
// Host wrapper for boxFilter5: src holds 5 stacked planes, so the logical
// image height is src.rows / 5.  Shared memory: 5 padded rows per block-row.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
416 void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
418 int height = src.rows / 5;
419 int width = src.cols;
422 dim3 grid(divUp(width, block.x), divUp(height, block.y));
423 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
// Normalizer: 1 / window area, window = 2*ksizeHalf + 1 per side.
425 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
426 boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
// Catch launch-configuration errors immediately.
428 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
431 cudaSafeCall(cudaDeviceSynchronize());
// Compute-capability-1.1 variant of boxFilter5Gpu.  The visible body is
// identical to boxFilter5Gpu; the difference is expected to be the block
// dimensions (smaller block for CC 1.1 shared-memory/register limits).
// NOTE(review): the `dim3 block(...)` declaration is on an elided line, so
// the actual difference cannot be confirmed from this excerpt.
435 void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
437 int height = src.rows / 5;
438 int width = src.cols;
441 dim3 grid(divUp(width, block.x), divUp(height, block.y));
442 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
444 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
445 boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
447 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
450 cudaSafeCall(cudaDeviceSynchronize());
454 __constant__ float c_gKer[MAX_KSIZE_HALF + 1];
// Separable Gaussian blur over a single-plane float image, templated on the
// border policy `Border` (idx_col / idx_row_low / idx_row_high map
// out-of-range coordinates).  Vertical convolution results are staged in a
// padded shared row per block-row; the horizontal convolution then reads
// +-ksizeHalf neighbors from shared memory.
456 template <typename Border>
457 __global__ void gaussianBlur(
458 const int height, const int width, const PtrStepf src, const int ksizeHalf,
459 const Border b, PtrStepf dst)
461 const int y = by * bdy + ty;
462 const int x = bx * bdx + tx;
464 extern __shared__ float smem[];
// One padded shared row (bdx + 2*ksizeHalf floats) per block-row.
465 volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
// Cooperative fill: vertical Gaussian convolution for each padded column.
470 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
472 int xExt = int(bx * bdx) + i - ksizeHalf;
473 xExt = b.idx_col(xExt);
474 row[i] = src(y, xExt) * c_gKer[0];
475 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[i] +=` left-hand side of this accumulation is
// on an elided line.
477 (src(b.idx_row_low(y - j), xExt) +
478 src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, the bounds
// guard, and the final `dst(y, x) = res;` write are on lines not visible
// in this excerpt.
486 row += tx + ksizeHalf;
487 float res = row[0] * c_gKer[0];
488 for (int i = 1; i <= ksizeHalf; ++i)
489 res += (row[-i] + row[i]) * c_gKer[i];
// Host-side setup: uploads ksizeHalf+1 Gaussian taps into c_gKer.  Must be
// called before gaussianBlurGpu / gaussianBlur5Gpu.  gKer must hold at most
// MAX_KSIZE_HALF + 1 taps (no bounds check here).
496 void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
498 cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
// Host launcher for the single-plane gaussianBlur kernel with a specific
// border policy (selected at compile time via the Border template argument;
// see the dispatch table in gaussianBlurGpu).
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
502 template <typename Border>
503 void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
505 int height = src.rows;
506 int width = src.cols;
509 dim3 grid(divUp(width, block.x), divUp(height, block.y));
// One padded shared row per block-row (see kernel).
510 int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
511 Border b(height, width);
513 gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
// Catch launch-configuration errors immediately.
515 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
518 cudaSafeCall(cudaDeviceSynchronize());
// Public entry point: dispatches to the templated caller by border mode.
// borderMode: 0 = reflect-101 border, 1 = replicate border (table order
// below).  No bounds check on borderMode - callers must pass 0 or 1.
522 void gaussianBlurGpu(
523 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
525 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
527 static const caller_t callers[] =
529 gaussianBlurCaller<BrdReflect101<float> >,
530 gaussianBlurCaller<BrdReplicate<float> >,
533 callers[borderMode](src, ksizeHalf, dst, stream);
// Gaussian blur applied simultaneously to all 5 stacked planes (5*height
// layout, e.g. the matrix M), templated on the border policy.  Mirrors
// boxFilter5's structure: 5 padded shared rows per block-row for the
// vertical pass, then a per-plane horizontal pass.
537 template <typename Border>
538 __global__ void gaussianBlur5(
539 const int height, const int width, const PtrStepf src, const int ksizeHalf,
540 const Border b, PtrStepf dst)
542 const int y = by * bdy + ty;
543 const int x = bx * bdx + tx;
545 extern __shared__ float smem[];
547 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
// 5 consecutive shared rows (one per plane) per block-row.
548 volatile float *row = smem + 5 * ty * smw;
// Cooperative fill: vertical Gaussian convolution for all 5 planes.
553 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
555 int xExt = int(bx * bdx) + i - ksizeHalf;
556 xExt = b.idx_col(xExt);
559 for (int k = 0; k < 5; ++k)
560 row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
562 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[k*smw + i] +=` left-hand side of this
// accumulation is on an elided line.
564 for (int k = 0; k < 5; ++k)
566 (src(k*height + b.idx_row_low(y - j), xExt) +
567 src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, the bounds
// guard, and the `float res[5];` declaration are on lines not visible in
// this excerpt.
576 row += tx + ksizeHalf;
580 for (int k = 0; k < 5; ++k)
581 res[k] = row[k*smw] * c_gKer[0];
583 for (int i = 1; i <= ksizeHalf; ++i)
585 for (int k = 0; k < 5; ++k)
586 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
// Write the blurred value for each of the 5 planes.
589 for (int k = 0; k < 5; ++k)
590 dst(k*height + y, x) = res[k];
// Host launcher for gaussianBlur5, parameterized on border policy AND block
// width (blockDimX): 256 for modern devices, 128 for the CC 1.1 path (see
// the two dispatch tables below).  src holds 5 stacked planes, hence
// height = src.rows / 5.
596 template <typename Border, int blockDimX>
597 void gaussianBlur5Caller(
598 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
600 int height = src.rows / 5;
601 int width = src.cols;
603 dim3 block(blockDimX);
604 dim3 grid(divUp(width, block.x), divUp(height, block.y));
// 5 padded shared rows per block-row (see kernel).
605 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
606 Border b(height, width);
608 gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
// Catch launch-configuration errors immediately.
610 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
613 cudaSafeCall(cudaDeviceSynchronize());
// Public entry point (default path): 256-wide blocks.
// borderMode: 0 = reflect-101, 1 = replicate (table order below).  No
// bounds check on borderMode - callers must pass 0 or 1.
617 void gaussianBlur5Gpu(
618 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
620 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
622 static const caller_t callers[] =
624 gaussianBlur5Caller<BrdReflect101<float>,256>,
625 gaussianBlur5Caller<BrdReplicate<float>,256>,
628 callers[borderMode](src, ksizeHalf, dst, stream);
// Public entry point for compute capability 1.1 devices: identical to
// gaussianBlur5Gpu except it instantiates the callers with 128-wide blocks
// to fit older hardware limits.  Same borderMode convention (0/1).
631 void gaussianBlur5Gpu_CC11(
632 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
634 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
636 static const caller_t callers[] =
638 gaussianBlur5Caller<BrdReflect101<float>,128>,
639 gaussianBlur5Caller<BrdReplicate<float>,128>,
642 callers[borderMode](src, ksizeHalf, dst, stream);
645 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
648 #endif /* CUDA_DISABLER */