modules/gpu/src/cuda/fgd_bgfg.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or bpied warranties, including, but not limited to, the bpied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "opencv2/gpu/device/common.hpp"
  46 #include "opencv2/gpu/device/vec_math.hpp"
  47 #include "opencv2/gpu/device/limits.hpp"
  48 #include "opencv2/gpu/device/utility.hpp"
  49 #include "opencv2/gpu/device/reduce.hpp"
  50 #include "opencv2/gpu/device/functional.hpp"
  51 #include "fgd_bgfg_common.hpp"
  52
  53 using namespace cv::gpu;
  54 using namespace cv::gpu::device;
  55
  56 namespace bgfg
  57 {
  58     ////////////////////////////////////////////////////////////////////////////
  59     // calcDiffHistogram
  60
  61     const unsigned int UINT_BITS = 32U;
  62     const int LOG_WARP_SIZE = 5;
  63     const int WARP_SIZE = 1 << LOG_WARP_SIZE;
  64 #if (__CUDA_ARCH__ < 120)
  65     const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
  66 #endif
  67
  68     const int MERGE_THREADBLOCK_SIZE = 256;
  69
  70     __device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
  71     {
  72         #if (__CUDA_ARCH__ < 120)
  73             volatile unsigned int* s_WarpHist = s_WarpHist_;
  74             unsigned int count;
  75             do
  76             {
  77                 count = s_WarpHist[data] & TAG_MASK;
  78                 count = threadTag | (count + 1);
  79                 s_WarpHist[data] = count;
  80             } while (s_WarpHist[data] != count);
  81         #else
  82             atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
  83         #endif
  84     }
  85
  86
  87     template <typename PT, typename CT>
  88     __global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
  89     {
  90 #if (__CUDA_ARCH__ < 200)
  91         const int HISTOGRAM_WARP_COUNT = 4;
  92 #else
  93         const int HISTOGRAM_WARP_COUNT = 6;
  94 #endif
  95         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
  96         const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
  97
  98         //Per-warp subhistogram storage
  99         __shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
 100         __shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
 101         __shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
 102
 103         //Clear shared memory storage for current threadblock before processing
 104         #pragma unroll
 105         for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
 106         {
 107            s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
 108            s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
 109            s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
 110         }
 111         __syncthreads();
 112
 113         const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
 114
 115         unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
 116         unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
 117         unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
 118
 119         const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
 120         const int dataCount = prevFrame.rows * prevFrame.cols;
 121         for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
 122         {
 123             const unsigned int y = pos / prevFrame.cols;
 124             const unsigned int x = pos % prevFrame.cols;
 125
 126             PT prevVal = prevFrame(y, x);
 127             CT curVal = curFrame(y, x);
 128
 129             int3 diff = make_int3(
 130                 ::abs(curVal.x - prevVal.x),
 131                 ::abs(curVal.y - prevVal.y),
 132                 ::abs(curVal.z - prevVal.z)
 133             );
 134
 135             addByte(s_WarpHist0, diff.x, tag);
 136             addByte(s_WarpHist1, diff.y, tag);
 137             addByte(s_WarpHist2, diff.z, tag);
 138         }
 139         __syncthreads();
 140
 141         //Merge per-warp histograms into per-block and write to global memory
 142         for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
 143         {
 144             unsigned int sum0 = 0;
 145             unsigned int sum1 = 0;
 146             unsigned int sum2 = 0;
 147
 148             #pragma unroll
 149             for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
 150             {
 151                 #if (__CUDA_ARCH__ < 120)
 152                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
 153                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
 154                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
 155                 #else
 156                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
 157                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
 158                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
 159                 #endif
 160             }
 161
 162             partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
 163             partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
 164             partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
 165         }
 166     }
 167
 168     __global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
 169     {
 170         unsigned int sum0 = 0;
 171         unsigned int sum1 = 0;
 172         unsigned int sum2 = 0;
 173
 174         #pragma unroll
 175         for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
 176         {
 177             sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
 178             sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
 179             sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
 180         }
 181
 182         __shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
 183         __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
 184         __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
 185
 186         plus<unsigned int> op;
 187         reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
 188
 189         if(threadIdx.x == 0)
 190         {
 191             hist0[blockIdx.x] = sum0;
 192             hist1[blockIdx.x] = sum1;
 193             hist2[blockIdx.x] = sum2;
 194         }
 195     }
 196
 197     template <typename PT, typename CT>
 198     void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
 199                                unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
 200                                unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
 201                                bool cc20, cudaStream_t stream)
 202     {
 203         const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
 204         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
 205
 206         calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
 207                 (PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
 208         cudaSafeCall( cudaGetLastError() );
 209
 210         mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
 211         cudaSafeCall( cudaGetLastError() );
 212
 213         if (stream == 0)
 214             cudaSafeCall( cudaDeviceSynchronize() );
 215     }
 216
 217     template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
 218     template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
 219     template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
 220     template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
 221
 222     /////////////////////////////////////////////////////////////////////////
 223     // calcDiffThreshMask
 224
 225     template <typename PT, typename CT>
 226     __global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
 227     {
 228         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 229         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 230
 231         if (y > prevFrame.rows || x > prevFrame.cols)
 232             return;
 233
 234         PT prevVal = prevFrame(y, x);
 235         CT curVal = curFrame(y, x);
 236
 237         int3 diff = make_int3(
 238             ::abs(curVal.x - prevVal.x),
 239             ::abs(curVal.y - prevVal.y),
 240             ::abs(curVal.z - prevVal.z)
 241         );
 242
 243         if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
 244             changeMask(y, x) = 255;
 245     }
 246
 247     template <typename PT, typename CT>
 248     void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
 249     {
 250         dim3 block(32, 8);
 251         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
 252
 253         calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
 254         cudaSafeCall( cudaGetLastError() );
 255
 256         if (stream == 0)
 257             cudaSafeCall( cudaDeviceSynchronize() );
 258     }
 259
 260     template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
 261     template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
 262     template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
 263     template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
 264
 265     /////////////////////////////////////////////////////////////////////////
 266     // bgfgClassification
 267
 268     __constant__ BGPixelStat c_stat;
 269
 270     void setBGPixelStat(const BGPixelStat& stat)
 271     {
 272         cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
 273     }
 274
 275     template <typename T> struct Output;
 276     template <> struct Output<uchar3>
 277     {
 278         static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
 279         {
 280             return make_uchar3(v0, v1, v2);
 281         }
 282     };
 283     template <> struct Output<uchar4>
 284     {
 285         static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
 286         {
 287             return make_uchar4(v0, v1, v2, 255);
 288         }
 289     };
 290
 291     template <typename PT, typename CT, typename OT>
 292     __global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
 293                                        const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
 294                                        int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
 295     {
 296         const int i = blockIdx.y * blockDim.y + threadIdx.y;
 297         const int j = blockIdx.x * blockDim.x + threadIdx.x;
 298
 299         if (i > prevFrame.rows || j > prevFrame.cols)
 300             return;
 301
 302         if (Fbd(i, j) || Ftd(i, j))
 303         {
 304             float Pb  = 0.0f;
 305             float Pv  = 0.0f;
 306             float Pvb = 0.0f;
 307
 308             int val = 0;
 309
 310             // Is it a motion pixel?
 311             if (Ftd(i, j))
 312             {
 313                 if (!c_stat.is_trained_dyn_model(i, j))
 314                     val = 1;
 315                 else
 316                 {
 317                     PT prevVal = prevFrame(i, j);
 318                     CT curVal = curFrame(i, j);
 319
 320                     // Compare with stored CCt vectors:
 321                     for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
 322                     {
 323                         OT v1 = c_stat.V1_CC<OT>(i, j, k);
 324                         OT v2 = c_stat.V2_CC<OT>(i, j, k);
 325
 326                         if (::abs(v1.x - prevVal.x) <= deltaCC &&
 327                             ::abs(v1.y - prevVal.y) <= deltaCC &&
 328                             ::abs(v1.z - prevVal.z) <= deltaCC &&
 329                             ::abs(v2.x - curVal.x) <= deltaCC &&
 330                             ::abs(v2.y - curVal.y) <= deltaCC &&
 331                             ::abs(v2.z - curVal.z) <= deltaCC)
 332                         {
 333                             Pv += c_stat.PV_CC(i, j, k);
 334                             Pvb += c_stat.PVB_CC(i, j, k);
 335                         }
 336                     }
 337
 338                     Pb = c_stat.Pbcc(i, j);
 339                     if (2 * Pvb * Pb <= Pv)
 340                         val = 1;
 341                 }
 342             }
 343             else if(c_stat.is_trained_st_model(i, j))
 344             {
 345                 CT curVal = curFrame(i, j);
 346
 347                 // Compare with stored Ct vectors:
 348                 for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
 349                 {
 350                     OT v = c_stat.V_C<OT>(i, j, k);
 351
 352                     if (::abs(v.x - curVal.x) <= deltaC &&
 353                         ::abs(v.y - curVal.y) <= deltaC &&
 354                         ::abs(v.z - curVal.z) <= deltaC)
 355                     {
 356                         Pv += c_stat.PV_C(i, j, k);
 357                         Pvb += c_stat.PVB_C(i, j, k);
 358                     }
 359                 }
 360                 Pb = c_stat.Pbc(i, j);
 361                 if (2 * Pvb * Pb <= Pv)
 362                     val = 1;
 363             }
 364
 365             // Update foreground:
 366             foreground(i, j) = static_cast<uchar>(val);
 367         } // end if( change detection...
 368     }
 369
 370     template <typename PT, typename CT, typename OT>
 371     void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
 372                                 int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
 373     {
 374         dim3 block(32, 8);
 375         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
 376
 377         cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
 378
 379         bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
 380                                                                    Ftd, Fbd, foreground,
 381                                                                    deltaC, deltaCC, alpha2, N1c, N1cc);
 382         cudaSafeCall( cudaGetLastError() );
 383
 384         if (stream == 0)
 385             cudaSafeCall( cudaDeviceSynchronize() );
 386     }
 387
 388     template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 389     template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 390     template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 391     template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 392     template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 393     template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 394     template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 395     template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
 396
 397     ////////////////////////////////////////////////////////////////////////////
 398     // updateBackgroundModel
 399
 400     template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
 401     __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
 402                                           PtrStepb foreground, PtrStep<OT> background,
 403                                           int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
 404     {
 405         const int i = blockIdx.y * blockDim.y + threadIdx.y;
 406         const int j = blockIdx.x * blockDim.x + threadIdx.x;
 407
 408         if (i > rows || j > cols)
 409             return;
 410
 411         const float MIN_PV = 1e-10f;
 412
 413         const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
 414         if (Ftd(i, j) || !is_trained_dyn_model)
 415         {
 416             const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
 417
 418             float Pbcc = c_stat.Pbcc(i, j);
 419
 420             //update Pb
 421             Pbcc *= (1.0f - alpha);
 422             if (!foreground(i, j))
 423             {
 424                 Pbcc += alpha;
 425             }
 426
 427             int min_dist = numeric_limits<int>::max();
 428             int indx = -1;
 429
 430             PT prevVal = prevFrame(i, j);
 431             CT curVal = curFrame(i, j);
 432
 433             // Find best Vi match:
 434             for (int k = 0; k < N2cc; ++k)
 435             {
 436                 float PV_CC = c_stat.PV_CC(i, j, k);
 437                 if (!PV_CC)
 438                     break;
 439
 440                 if (PV_CC < MIN_PV)
 441                 {
 442                     c_stat.PV_CC(i, j, k) = 0;
 443                     c_stat.PVB_CC(i, j, k) = 0;
 444                     continue;
 445                 }
 446
 447                 c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
 448                 c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
 449
 450                 OT v1 = c_stat.V1_CC<OT>(i, j, k);
 451
 452                 int3 val1 = make_int3(
 453                     ::abs(v1.x - prevVal.x),
 454                     ::abs(v1.y - prevVal.y),
 455                     ::abs(v1.z - prevVal.z)
 456                 );
 457
 458                 OT v2 = c_stat.V2_CC<OT>(i, j, k);
 459
 460                 int3 val2 = make_int3(
 461                     ::abs(v2.x - curVal.x),
 462                     ::abs(v2.y - curVal.y),
 463                     ::abs(v2.z - curVal.z)
 464                 );
 465
 466                 int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
 467
 468                 if (dist < min_dist &&
 469                     val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
 470                     val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
 471                 {
 472                     min_dist = dist;
 473                     indx = k;
 474                 }
 475             }
 476
 477             if (indx < 0)
 478             {
 479                 // Replace N2th elem in the table by new feature:
 480                 indx = N2cc - 1;
 481                 c_stat.PV_CC(i, j, indx) = alpha;
 482                 c_stat.PVB_CC(i, j, indx) = alpha;
 483
 484                 //udate Vt
 485                 c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
 486                 c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
 487             }
 488             else
 489             {
 490                 // Update:
 491                 c_stat.PV_CC(i, j, indx) += alpha;
 492
 493                 if (!foreground(i, j))
 494                 {
 495                     c_stat.PVB_CC(i, j, indx) += alpha;
 496                 }
 497             }
 498
 499             //re-sort CCt table by Pv
 500             const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
 501             const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
 502             const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
 503             const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
 504             for (int k = 0; k < indx; ++k)
 505             {
 506                 if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
 507                 {
 508                     //shift elements
 509                     float Pv_tmp1;
 510                     float Pv_tmp2 = PV_CC_indx;
 511
 512                     float Pvb_tmp1;
 513                     float Pvb_tmp2 = PVB_CC_indx;
 514
 515                     OT v1_tmp1;
 516                     OT v1_tmp2 = V1_CC_indx;
 517
 518                     OT v2_tmp1;
 519                     OT v2_tmp2 = V2_CC_indx;
 520
 521                     for (int l = k; l <= indx; ++l)
 522                     {
 523                         Pv_tmp1 = c_stat.PV_CC(i, j, l);
 524                         c_stat.PV_CC(i, j, l) = Pv_tmp2;
 525                         Pv_tmp2 = Pv_tmp1;
 526
 527                         Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
 528                         c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
 529                         Pvb_tmp2 = Pvb_tmp1;
 530
 531                         v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
 532                         c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
 533                         v1_tmp2 = v1_tmp1;
 534
 535                         v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
 536                         c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
 537                         v2_tmp2 = v2_tmp1;
 538                     }
 539
 540                     break;
 541                 }
 542             }
 543
 544             float sum1 = 0.0f;
 545             float sum2 = 0.0f;
 546
 547             //check "once-off" changes
 548             for (int k = 0; k < N1cc; ++k)
 549             {
 550                 const float PV_CC = c_stat.PV_CC(i, j, k);
 551                 if (!PV_CC)
 552                     break;
 553
 554                 sum1 += PV_CC;
 555                 sum2 += c_stat.PVB_CC(i, j, k);
 556             }
 557
 558             if (sum1 > T)
 559                 c_stat.is_trained_dyn_model(i, j) = 1;
 560
 561             float diff = sum1 - Pbcc * sum2;
 562
 563             // Update stat table:
 564             if (diff > T)
 565             {
 566                 //new BG features are discovered
 567                 for (int k = 0; k < N1cc; ++k)
 568                 {
 569                     const float PV_CC = c_stat.PV_CC(i, j, k);
 570                     if (!PV_CC)
 571                         break;
 572
 573                     c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
 574                 }
 575             }
 576
 577             c_stat.Pbcc(i, j) = Pbcc;
 578         }
 579
 580         // Handle "stationary" pixel:
 581         if (!Ftd(i, j))
 582         {
 583             const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
 584
 585             float Pbc = c_stat.Pbc(i, j);
 586
 587             //update Pb
 588             Pbc *= (1.0f - alpha);
 589             if (!foreground(i, j))
 590             {
 591                 Pbc += alpha;
 592             }
 593
 594             int min_dist = numeric_limits<int>::max();
 595             int indx = -1;
 596
 597             CT curVal = curFrame(i, j);
 598
 599             //find best Vi match
 600             for (int k = 0; k < N2c; ++k)
 601             {
 602                 float PV_C = c_stat.PV_C(i, j, k);
 603
 604                 if (PV_C < MIN_PV)
 605                 {
 606                     c_stat.PV_C(i, j, k) = 0;
 607                     c_stat.PVB_C(i, j, k) = 0;
 608                     continue;
 609                 }
 610
 611                 // Exponential decay of memory
 612                 c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
 613                 c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
 614
 615                 OT v = c_stat.V_C<OT>(i, j, k);
 616                 int3 val = make_int3(
 617                     ::abs(v.x - curVal.x),
 618                     ::abs(v.y - curVal.y),
 619                     ::abs(v.z - curVal.z)
 620                 );
 621
 622                 int dist = val.x + val.y + val.z;
 623
 624                 if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
 625                 {
 626                     min_dist = dist;
 627                     indx = k;
 628                 }
 629             }
 630
 631             if (indx < 0)
 632             {
 633                 //N2th elem in the table is replaced by a new features
 634                 indx = N2c - 1;
 635
 636                 c_stat.PV_C(i, j, indx) = alpha;
 637                 c_stat.PVB_C(i, j, indx) = alpha;
 638
 639                 //udate Vt
 640                 c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
 641             }
 642             else
 643             {
 644                 //update
 645                 c_stat.PV_C(i, j, indx) += alpha;
 646
 647                 if (!foreground(i, j))
 648                 {
 649                     c_stat.PVB_C(i, j, indx) += alpha;
 650                 }
 651             }
 652
 653             //re-sort Ct table by Pv
 654             const float PV_C_indx = c_stat.PV_C(i, j, indx);
 655             const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
 656             OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
 657             for (int k = 0; k < indx; ++k)
 658             {
 659                 if (c_stat.PV_C(i, j, k) <= PV_C_indx)
 660                 {
 661                     //shift elements
 662                     float Pv_tmp1;
 663                     float Pv_tmp2 = PV_C_indx;
 664
 665                     float Pvb_tmp1;
 666                     float Pvb_tmp2 = PVB_C_indx;
 667
 668                     OT v_tmp1;
 669                     OT v_tmp2 = V_C_indx;
 670
 671                     for (int l = k; l <= indx; ++l)
 672                     {
 673                         Pv_tmp1 = c_stat.PV_C(i, j, l);
 674                         c_stat.PV_C(i, j, l) = Pv_tmp2;
 675                         Pv_tmp2 = Pv_tmp1;
 676
 677                         Pvb_tmp1 = c_stat.PVB_C(i, j, l);
 678                         c_stat.PVB_C(i, j, l) = Pvb_tmp2;
 679                         Pvb_tmp2 = Pvb_tmp1;
 680
 681                         v_tmp1 = c_stat.V_C<OT>(i, j, l);
 682                         c_stat.V_C<OT>(i, j, l) = v_tmp2;
 683                         v_tmp2 = v_tmp1;
 684                     }
 685
 686                     break;
 687                 }
 688             }
 689
 690             // Check "once-off" changes:
 691             float sum1 = 0.0f;
 692             float sum2 = 0.0f;
 693             for (int k = 0; k < N1c; ++k)
 694             {
 695                 const float PV_C = c_stat.PV_C(i, j, k);
 696                 if (!PV_C)
 697                     break;
 698
 699                 sum1 += PV_C;
 700                 sum2 += c_stat.PVB_C(i, j, k);
 701             }
 702
 703             if (sum1 > T)
 704                 c_stat.is_trained_st_model(i, j) = 1;
 705
 706             float diff = sum1 - Pbc * sum2;
 707
 708             // Update stat table:
 709             if (diff > T)
 710             {
 711                 //new BG features are discovered
 712                 for (int k = 0; k < N1c; ++k)
 713                 {
 714                     const float PV_C = c_stat.PV_C(i, j, k);
 715                     if (!PV_C)
 716                         break;
 717
 718                     c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
 719                 }
 720
 721                 c_stat.Pbc(i, j) = 1.0f - Pbc;
 722             }
 723             else
 724             {
 725                 c_stat.Pbc(i, j) = Pbc;
 726             }
 727         } // if !(change detection) at pixel (i,j)
 728
 729         // Update the reference BG image:
 730         if (!foreground(i, j))
 731         {
 732             CT curVal = curFrame(i, j);
 733
 734             if (!Ftd(i, j) && !Fbd(i, j))
 735             {
 736                 // Apply IIR filter:
 737                 OT oldVal = background(i, j);
 738
 739                 int3 newVal = make_int3(
 740                     __float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
 741                     __float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
 742                     __float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
 743                 );
 744
 745                 background(i, j) = Output<OT>::make(
 746                     static_cast<uchar>(newVal.x),
 747                     static_cast<uchar>(newVal.y),
 748                     static_cast<uchar>(newVal.z)
 749                 );
 750             }
 751             else
 752             {
 753                 background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
 754             }
 755         }
 756     }
 757
 758     template <typename PT, typename CT, typename OT>
 759     struct UpdateBackgroundModel
 760     {
 761         static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
 762                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
 763                          cudaStream_t stream)
 764         {
 765             dim3 block(32, 8);
 766             dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
 767
 768             cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
 769
 770             updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
 771                 prevFrame.cols, prevFrame.rows,
 772                 prevFrame, curFrame,
 773                 Ftd, Fbd, foreground, background,
 774                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
 775             cudaSafeCall( cudaGetLastError() );
 776
 777             if (stream == 0)
 778                 cudaSafeCall( cudaDeviceSynchronize() );
 779         }
 780     };
 781
 782     template <typename PT, typename CT, typename OT>
 783     void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
 784                                    int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
 785                                    cudaStream_t stream)
 786     {
 787         UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
 788                                                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
 789     }
 790
 791     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 792     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 793     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 794     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 795     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 796     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 797     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 798     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
 799 }
 800
 801 #endif /* CUDA_DISABLER */