added dual tvl1 optical flow gpu implementation
[profile/ivi/opencv.git] / modules / gpu / src / cuda / fgd_bgfg.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or bpied warranties, including, but not limited to, the bpied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #if !defined CUDA_DISABLER
44
45 #include "opencv2/gpu/device/common.hpp"
46 #include "opencv2/gpu/device/vec_math.hpp"
47 #include "opencv2/gpu/device/limits.hpp"
48 #include "opencv2/gpu/device/utility.hpp"
49 #include "opencv2/gpu/device/reduce.hpp"
50 #include "opencv2/gpu/device/functional.hpp"
51 #include "fgd_bgfg_common.hpp"
52
53 using namespace cv::gpu;
54 using namespace cv::gpu::device;
55
56 namespace bgfg
57 {
58     ////////////////////////////////////////////////////////////////////////////
59     // calcDiffHistogram
60
61     const unsigned int UINT_BITS = 32U;
62     const int LOG_WARP_SIZE = 5;
63     const int WARP_SIZE = 1 << LOG_WARP_SIZE;
64 #if (__CUDA_ARCH__ < 120)
65     const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
66 #endif
67
68     const int MERGE_THREADBLOCK_SIZE = 256;
69
70     __device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
71     {
72         #if (__CUDA_ARCH__ < 120)
73             volatile unsigned int* s_WarpHist = s_WarpHist_;
74             unsigned int count;
75             do
76             {
77                 count = s_WarpHist[data] & TAG_MASK;
78                 count = threadTag | (count + 1);
79                 s_WarpHist[data] = count;
80             } while (s_WarpHist[data] != count);
81         #else
82             atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
83         #endif
84     }
85
86
87     template <typename PT, typename CT>
88     __global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
89     {
90 #if (__CUDA_ARCH__ < 200)
91         const int HISTOGRAM_WARP_COUNT = 4;
92 #else
93         const int HISTOGRAM_WARP_COUNT = 6;
94 #endif
95         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
96         const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
97
98         //Per-warp subhistogram storage
99         __shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
100         __shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
101         __shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
102
103         //Clear shared memory storage for current threadblock before processing
104         #pragma unroll
105         for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
106         {
107            s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
108            s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
109            s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
110         }
111         __syncthreads();
112
113         const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
114
115         unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
116         unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
117         unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
118
119         const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
120         const int dataCount = prevFrame.rows * prevFrame.cols;
121         for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
122         {
123             const unsigned int y = pos / prevFrame.cols;
124             const unsigned int x = pos % prevFrame.cols;
125
126             PT prevVal = prevFrame(y, x);
127             CT curVal = curFrame(y, x);
128
129             int3 diff = make_int3(
130                 ::abs(curVal.x - prevVal.x),
131                 ::abs(curVal.y - prevVal.y),
132                 ::abs(curVal.z - prevVal.z)
133             );
134
135             addByte(s_WarpHist0, diff.x, tag);
136             addByte(s_WarpHist1, diff.y, tag);
137             addByte(s_WarpHist2, diff.z, tag);
138         }
139         __syncthreads();
140
141         //Merge per-warp histograms into per-block and write to global memory
142         for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
143         {
144             unsigned int sum0 = 0;
145             unsigned int sum1 = 0;
146             unsigned int sum2 = 0;
147
148             #pragma unroll
149             for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
150             {
151                 #if (__CUDA_ARCH__ < 120)
152                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
153                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
154                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
155                 #else
156                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
157                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
158                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
159                 #endif
160             }
161
162             partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
163             partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
164             partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
165         }
166     }
167
168     __global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
169     {
170         unsigned int sum0 = 0;
171         unsigned int sum1 = 0;
172         unsigned int sum2 = 0;
173
174         #pragma unroll
175         for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
176         {
177             sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
178             sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
179             sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
180         }
181
182         __shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
183         __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
184         __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
185
186         plus<unsigned int> op;
187         reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
188
189         if(threadIdx.x == 0)
190         {
191             hist0[blockIdx.x] = sum0;
192             hist1[blockIdx.x] = sum1;
193             hist2[blockIdx.x] = sum2;
194         }
195     }
196
197     template <typename PT, typename CT>
198     void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
199                                unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
200                                unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
201                                bool cc20, cudaStream_t stream)
202     {
203         const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
204         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
205
206         calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
207                 (PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
208         cudaSafeCall( cudaGetLastError() );
209
210         mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
211         cudaSafeCall( cudaGetLastError() );
212
213         if (stream == 0)
214             cudaSafeCall( cudaDeviceSynchronize() );
215     }
216
217     template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
218     template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
219     template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
220     template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
221
222     /////////////////////////////////////////////////////////////////////////
223     // calcDiffThreshMask
224
225     template <typename PT, typename CT>
226     __global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
227     {
228         const int y = blockIdx.y * blockDim.y + threadIdx.y;
229         const int x = blockIdx.x * blockDim.x + threadIdx.x;
230
231         if (y > prevFrame.rows || x > prevFrame.cols)
232             return;
233
234         PT prevVal = prevFrame(y, x);
235         CT curVal = curFrame(y, x);
236
237         int3 diff = make_int3(
238             ::abs(curVal.x - prevVal.x),
239             ::abs(curVal.y - prevVal.y),
240             ::abs(curVal.z - prevVal.z)
241         );
242
243         if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
244             changeMask(y, x) = 255;
245     }
246
247     template <typename PT, typename CT>
248     void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
249     {
250         dim3 block(32, 8);
251         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
252
253         calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
254         cudaSafeCall( cudaGetLastError() );
255
256         if (stream == 0)
257             cudaSafeCall( cudaDeviceSynchronize() );
258     }
259
260     template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
261     template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
262     template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
263     template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
264
265     /////////////////////////////////////////////////////////////////////////
266     // bgfgClassification
267
268     __constant__ BGPixelStat c_stat;
269
270     void setBGPixelStat(const BGPixelStat& stat)
271     {
272         cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
273     }
274
275     template <typename T> struct Output;
276     template <> struct Output<uchar3>
277     {
278         static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
279         {
280             return make_uchar3(v0, v1, v2);
281         }
282     };
283     template <> struct Output<uchar4>
284     {
285         static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
286         {
287             return make_uchar4(v0, v1, v2, 255);
288         }
289     };
290
291     template <typename PT, typename CT, typename OT>
292     __global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
293                                        const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
294                                        int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
295     {
296         const int i = blockIdx.y * blockDim.y + threadIdx.y;
297         const int j = blockIdx.x * blockDim.x + threadIdx.x;
298
299         if (i > prevFrame.rows || j > prevFrame.cols)
300             return;
301
302         if (Fbd(i, j) || Ftd(i, j))
303         {
304             float Pb  = 0.0f;
305             float Pv  = 0.0f;
306             float Pvb = 0.0f;
307
308             int val = 0;
309
310             // Is it a motion pixel?
311             if (Ftd(i, j))
312             {
313                 if (!c_stat.is_trained_dyn_model(i, j))
314                     val = 1;
315                 else
316                 {
317                     PT prevVal = prevFrame(i, j);
318                     CT curVal = curFrame(i, j);
319
320                     // Compare with stored CCt vectors:
321                     for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
322                     {
323                         OT v1 = c_stat.V1_CC<OT>(i, j, k);
324                         OT v2 = c_stat.V2_CC<OT>(i, j, k);
325
326                         if (::abs(v1.x - prevVal.x) <= deltaCC &&
327                             ::abs(v1.y - prevVal.y) <= deltaCC &&
328                             ::abs(v1.z - prevVal.z) <= deltaCC &&
329                             ::abs(v2.x - curVal.x) <= deltaCC &&
330                             ::abs(v2.y - curVal.y) <= deltaCC &&
331                             ::abs(v2.z - curVal.z) <= deltaCC)
332                         {
333                             Pv += c_stat.PV_CC(i, j, k);
334                             Pvb += c_stat.PVB_CC(i, j, k);
335                         }
336                     }
337
338                     Pb = c_stat.Pbcc(i, j);
339                     if (2 * Pvb * Pb <= Pv)
340                         val = 1;
341                 }
342             }
343             else if(c_stat.is_trained_st_model(i, j))
344             {
345                 CT curVal = curFrame(i, j);
346
347                 // Compare with stored Ct vectors:
348                 for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
349                 {
350                     OT v = c_stat.V_C<OT>(i, j, k);
351
352                     if (::abs(v.x - curVal.x) <= deltaC &&
353                         ::abs(v.y - curVal.y) <= deltaC &&
354                         ::abs(v.z - curVal.z) <= deltaC)
355                     {
356                         Pv += c_stat.PV_C(i, j, k);
357                         Pvb += c_stat.PVB_C(i, j, k);
358                     }
359                 }
360                 Pb = c_stat.Pbc(i, j);
361                 if (2 * Pvb * Pb <= Pv)
362                     val = 1;
363             }
364
365             // Update foreground:
366             foreground(i, j) = static_cast<uchar>(val);
367         } // end if( change detection...
368     }
369
370     template <typename PT, typename CT, typename OT>
371     void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
372                                 int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
373     {
374         dim3 block(32, 8);
375         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
376
377         cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
378
379         bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
380                                                                    Ftd, Fbd, foreground,
381                                                                    deltaC, deltaCC, alpha2, N1c, N1cc);
382         cudaSafeCall( cudaGetLastError() );
383
384         if (stream == 0)
385             cudaSafeCall( cudaDeviceSynchronize() );
386     }
387
388     template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
389     template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
390     template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
391     template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
392     template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
393     template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
394     template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
395     template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
396
397     ////////////////////////////////////////////////////////////////////////////
398     // updateBackgroundModel
399
400     template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
401     __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
402                                           PtrStepb foreground, PtrStep<OT> background,
403                                           int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
404     {
405         const int i = blockIdx.y * blockDim.y + threadIdx.y;
406         const int j = blockIdx.x * blockDim.x + threadIdx.x;
407
408         if (i > rows || j > cols)
409             return;
410
411         const float MIN_PV = 1e-10f;
412
413         const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
414         if (Ftd(i, j) || !is_trained_dyn_model)
415         {
416             const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
417
418             float Pbcc = c_stat.Pbcc(i, j);
419
420             //update Pb
421             Pbcc *= (1.0f - alpha);
422             if (!foreground(i, j))
423             {
424                 Pbcc += alpha;
425             }
426
427             int min_dist = numeric_limits<int>::max();
428             int indx = -1;
429
430             PT prevVal = prevFrame(i, j);
431             CT curVal = curFrame(i, j);
432
433             // Find best Vi match:
434             for (int k = 0; k < N2cc; ++k)
435             {
436                 float PV_CC = c_stat.PV_CC(i, j, k);
437                 if (!PV_CC)
438                     break;
439
440                 if (PV_CC < MIN_PV)
441                 {
442                     c_stat.PV_CC(i, j, k) = 0;
443                     c_stat.PVB_CC(i, j, k) = 0;
444                     continue;
445                 }
446
447                 c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
448                 c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
449
450                 OT v1 = c_stat.V1_CC<OT>(i, j, k);
451
452                 int3 val1 = make_int3(
453                     ::abs(v1.x - prevVal.x),
454                     ::abs(v1.y - prevVal.y),
455                     ::abs(v1.z - prevVal.z)
456                 );
457
458                 OT v2 = c_stat.V2_CC<OT>(i, j, k);
459
460                 int3 val2 = make_int3(
461                     ::abs(v2.x - curVal.x),
462                     ::abs(v2.y - curVal.y),
463                     ::abs(v2.z - curVal.z)
464                 );
465
466                 int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
467
468                 if (dist < min_dist &&
469                     val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
470                     val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
471                 {
472                     min_dist = dist;
473                     indx = k;
474                 }
475             }
476
477             if (indx < 0)
478             {
479                 // Replace N2th elem in the table by new feature:
480                 indx = N2cc - 1;
481                 c_stat.PV_CC(i, j, indx) = alpha;
482                 c_stat.PVB_CC(i, j, indx) = alpha;
483
484                 //udate Vt
485                 c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
486                 c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
487             }
488             else
489             {
490                 // Update:
491                 c_stat.PV_CC(i, j, indx) += alpha;
492
493                 if (!foreground(i, j))
494                 {
495                     c_stat.PVB_CC(i, j, indx) += alpha;
496                 }
497             }
498
499             //re-sort CCt table by Pv
500             const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
501             const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
502             const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
503             const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
504             for (int k = 0; k < indx; ++k)
505             {
506                 if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
507                 {
508                     //shift elements
509                     float Pv_tmp1;
510                     float Pv_tmp2 = PV_CC_indx;
511
512                     float Pvb_tmp1;
513                     float Pvb_tmp2 = PVB_CC_indx;
514
515                     OT v1_tmp1;
516                     OT v1_tmp2 = V1_CC_indx;
517
518                     OT v2_tmp1;
519                     OT v2_tmp2 = V2_CC_indx;
520
521                     for (int l = k; l <= indx; ++l)
522                     {
523                         Pv_tmp1 = c_stat.PV_CC(i, j, l);
524                         c_stat.PV_CC(i, j, l) = Pv_tmp2;
525                         Pv_tmp2 = Pv_tmp1;
526
527                         Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
528                         c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
529                         Pvb_tmp2 = Pvb_tmp1;
530
531                         v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
532                         c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
533                         v1_tmp2 = v1_tmp1;
534
535                         v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
536                         c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
537                         v2_tmp2 = v2_tmp1;
538                     }
539
540                     break;
541                 }
542             }
543
544             float sum1 = 0.0f;
545             float sum2 = 0.0f;
546
547             //check "once-off" changes
548             for (int k = 0; k < N1cc; ++k)
549             {
550                 const float PV_CC = c_stat.PV_CC(i, j, k);
551                 if (!PV_CC)
552                     break;
553
554                 sum1 += PV_CC;
555                 sum2 += c_stat.PVB_CC(i, j, k);
556             }
557
558             if (sum1 > T)
559                 c_stat.is_trained_dyn_model(i, j) = 1;
560
561             float diff = sum1 - Pbcc * sum2;
562
563             // Update stat table:
564             if (diff > T)
565             {
566                 //new BG features are discovered
567                 for (int k = 0; k < N1cc; ++k)
568                 {
569                     const float PV_CC = c_stat.PV_CC(i, j, k);
570                     if (!PV_CC)
571                         break;
572
573                     c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
574                 }
575             }
576
577             c_stat.Pbcc(i, j) = Pbcc;
578         }
579
580         // Handle "stationary" pixel:
581         if (!Ftd(i, j))
582         {
583             const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
584
585             float Pbc = c_stat.Pbc(i, j);
586
587             //update Pb
588             Pbc *= (1.0f - alpha);
589             if (!foreground(i, j))
590             {
591                 Pbc += alpha;
592             }
593
594             int min_dist = numeric_limits<int>::max();
595             int indx = -1;
596
597             CT curVal = curFrame(i, j);
598
599             //find best Vi match
600             for (int k = 0; k < N2c; ++k)
601             {
602                 float PV_C = c_stat.PV_C(i, j, k);
603
604                 if (PV_C < MIN_PV)
605                 {
606                     c_stat.PV_C(i, j, k) = 0;
607                     c_stat.PVB_C(i, j, k) = 0;
608                     continue;
609                 }
610
611                 // Exponential decay of memory
612                 c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
613                 c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
614
615                 OT v = c_stat.V_C<OT>(i, j, k);
616                 int3 val = make_int3(
617                     ::abs(v.x - curVal.x),
618                     ::abs(v.y - curVal.y),
619                     ::abs(v.z - curVal.z)
620                 );
621
622                 int dist = val.x + val.y + val.z;
623
624                 if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
625                 {
626                     min_dist = dist;
627                     indx = k;
628                 }
629             }
630
631             if (indx < 0)
632             {
633                 //N2th elem in the table is replaced by a new features
634                 indx = N2c - 1;
635
636                 c_stat.PV_C(i, j, indx) = alpha;
637                 c_stat.PVB_C(i, j, indx) = alpha;
638
639                 //udate Vt
640                 c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
641             }
642             else
643             {
644                 //update
645                 c_stat.PV_C(i, j, indx) += alpha;
646
647                 if (!foreground(i, j))
648                 {
649                     c_stat.PVB_C(i, j, indx) += alpha;
650                 }
651             }
652
653             //re-sort Ct table by Pv
654             const float PV_C_indx = c_stat.PV_C(i, j, indx);
655             const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
656             OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
657             for (int k = 0; k < indx; ++k)
658             {
659                 if (c_stat.PV_C(i, j, k) <= PV_C_indx)
660                 {
661                     //shift elements
662                     float Pv_tmp1;
663                     float Pv_tmp2 = PV_C_indx;
664
665                     float Pvb_tmp1;
666                     float Pvb_tmp2 = PVB_C_indx;
667
668                     OT v_tmp1;
669                     OT v_tmp2 = V_C_indx;
670
671                     for (int l = k; l <= indx; ++l)
672                     {
673                         Pv_tmp1 = c_stat.PV_C(i, j, l);
674                         c_stat.PV_C(i, j, l) = Pv_tmp2;
675                         Pv_tmp2 = Pv_tmp1;
676
677                         Pvb_tmp1 = c_stat.PVB_C(i, j, l);
678                         c_stat.PVB_C(i, j, l) = Pvb_tmp2;
679                         Pvb_tmp2 = Pvb_tmp1;
680
681                         v_tmp1 = c_stat.V_C<OT>(i, j, l);
682                         c_stat.V_C<OT>(i, j, l) = v_tmp2;
683                         v_tmp2 = v_tmp1;
684                     }
685
686                     break;
687                 }
688             }
689
690             // Check "once-off" changes:
691             float sum1 = 0.0f;
692             float sum2 = 0.0f;
693             for (int k = 0; k < N1c; ++k)
694             {
695                 const float PV_C = c_stat.PV_C(i, j, k);
696                 if (!PV_C)
697                     break;
698
699                 sum1 += PV_C;
700                 sum2 += c_stat.PVB_C(i, j, k);
701             }
702
703             if (sum1 > T)
704                 c_stat.is_trained_st_model(i, j) = 1;
705
706             float diff = sum1 - Pbc * sum2;
707
708             // Update stat table:
709             if (diff > T)
710             {
711                 //new BG features are discovered
712                 for (int k = 0; k < N1c; ++k)
713                 {
714                     const float PV_C = c_stat.PV_C(i, j, k);
715                     if (!PV_C)
716                         break;
717
718                     c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
719                 }
720
721                 c_stat.Pbc(i, j) = 1.0f - Pbc;
722             }
723             else
724             {
725                 c_stat.Pbc(i, j) = Pbc;
726             }
727         } // if !(change detection) at pixel (i,j)
728
729         // Update the reference BG image:
730         if (!foreground(i, j))
731         {
732             CT curVal = curFrame(i, j);
733
734             if (!Ftd(i, j) && !Fbd(i, j))
735             {
736                 // Apply IIR filter:
737                 OT oldVal = background(i, j);
738
739                 int3 newVal = make_int3(
740                     __float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
741                     __float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
742                     __float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
743                 );
744
745                 background(i, j) = Output<OT>::make(
746                     static_cast<uchar>(newVal.x),
747                     static_cast<uchar>(newVal.y),
748                     static_cast<uchar>(newVal.z)
749                 );
750             }
751             else
752             {
753                 background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
754             }
755         }
756     }
757
758     template <typename PT, typename CT, typename OT>
759     struct UpdateBackgroundModel
760     {
761         static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
762                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
763                          cudaStream_t stream)
764         {
765             dim3 block(32, 8);
766             dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
767
768             cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
769
770             updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
771                 prevFrame.cols, prevFrame.rows,
772                 prevFrame, curFrame,
773                 Ftd, Fbd, foreground, background,
774                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
775             cudaSafeCall( cudaGetLastError() );
776
777             if (stream == 0)
778                 cudaSafeCall( cudaDeviceSynchronize() );
779         }
780     };
781
782     template <typename PT, typename CT, typename OT>
783     void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
784                                    int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
785                                    cudaStream_t stream)
786     {
787         UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
788                                                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
789     }
790
791     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
792     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
793     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
794     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
795     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
796     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
797     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
798     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
799 }
800
801 #endif /* CUDA_DISABLER */