modules/video/src/lkpyramid.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42 #include "precomp.hpp"
  43 #include <float.h>
  44 #include <stdio.h>
  45 #include "lkpyramid.hpp"
  46 #include "opencl_kernels_video.hpp"
  47 #include "opencv2/core/hal/intrin.hpp"
  48
  49 #include "opencv2/core/openvx/ovx_defs.hpp"
  50
  51 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
  52
  53 namespace
  54 {
  55 static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
  56 {
  57     using namespace cv;
  58     using cv::detail::deriv_type;
  59     int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
  60     CV_Assert(depth == CV_8U);
  61     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
  62
  63     int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
  64     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
  65     deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
  66
  67 #if CV_SIMD128
  68     v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
  69     bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
  70 #endif
  71
  72     for( y = 0; y < rows; y++ )
  73     {
  74         const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
  75         const uchar* srow1 = src.ptr<uchar>(y);
  76         const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
  77         deriv_type* drow = dst.ptr<deriv_type>(y);
  78
  79         // do vertical convolution
  80         x = 0;
  81 #if CV_SIMD128
  82         if(haveSIMD)
  83         {
  84             for( ; x <= colsn - 8; x += 8 )
  85             {
  86                 v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
  87                 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
  88                 v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
  89
  90                 v_int16x8 t1 = s2 - s0;
  91                 v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
  92
  93                 v_store(trow0 + x, t0);
  94                 v_store(trow1 + x, t1);
  95             }
  96         }
  97 #endif
  98
  99         for( ; x < colsn; x++ )
 100         {
 101             int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
 102             int t1 = srow2[x] - srow0[x];
 103             trow0[x] = (deriv_type)t0;
 104             trow1[x] = (deriv_type)t1;
 105         }
 106
 107         // make border
 108         int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
 109         for( int k = 0; k < cn; k++ )
 110         {
 111             trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
 112             trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
 113         }
 114
 115         // do horizontal convolution, interleave the results and store them to dst
 116         x = 0;
 117 #if CV_SIMD128
 118         if(haveSIMD)
 119         {
 120             for( ; x <= colsn - 8; x += 8 )
 121             {
 122                 v_int16x8 s0 = v_load(trow0 + x - cn);
 123                 v_int16x8 s1 = v_load(trow0 + x + cn);
 124                 v_int16x8 s2 = v_load(trow1 + x - cn);
 125                 v_int16x8 s3 = v_load(trow1 + x);
 126                 v_int16x8 s4 = v_load(trow1 + x + cn);
 127
 128                 v_int16x8 t0 = s1 - s0;
 129                 v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
 130
 131                 v_store_interleave((drow + x*2), t0, t1);
 132             }
 133         }
 134 #endif
 135         for( ; x < colsn; x++ )
 136         {
 137             deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
 138             deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
 139             drow[x*2] = t0; drow[x*2+1] = t1;
 140         }
 141     }
 142 }
 143
 144 }//namespace
 145
 146 cv::detail::LKTrackerInvoker::LKTrackerInvoker(
 147                       const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
 148                       const Point2f* _prevPts, Point2f* _nextPts,
 149                       uchar* _status, float* _err,
 150                       Size _winSize, TermCriteria _criteria,
 151                       int _level, int _maxLevel, int _flags, float _minEigThreshold )
 152 {
 153     prevImg = &_prevImg;
 154     prevDeriv = &_prevDeriv;
 155     nextImg = &_nextImg;
 156     prevPts = _prevPts;
 157     nextPts = _nextPts;
 158     status = _status;
 159     err = _err;
 160     winSize = _winSize;
 161     criteria = _criteria;
 162     level = _level;
 163     maxLevel = _maxLevel;
 164     flags = _flags;
 165     minEigThreshold = _minEigThreshold;
 166 }
 167
 168 #if defined __arm__ && !CV_NEON
 169 typedef int64 acctype;
 170 typedef int itemtype;
 171 #else
 172 typedef float acctype;
 173 typedef float itemtype;
 174 #endif
 175
 176 void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 177 {
 178     CV_INSTRUMENT_REGION()
 179
 180     Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
 181     const Mat& I = *prevImg;
 182     const Mat& J = *nextImg;
 183     const Mat& derivI = *prevDeriv;
 184
 185     int j, cn = I.channels(), cn2 = cn*2;
 186     cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
 187     int derivDepth = DataType<deriv_type>::depth;
 188
 189     Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
 190     Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
 191
 192     for( int ptidx = range.start; ptidx < range.end; ptidx++ )
 193     {
 194         Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
 195         Point2f nextPt;
 196         if( level == maxLevel )
 197         {
 198             if( flags & OPTFLOW_USE_INITIAL_FLOW )
 199                 nextPt = nextPts[ptidx]*(float)(1./(1 << level));
 200             else
 201                 nextPt = prevPt;
 202         }
 203         else
 204             nextPt = nextPts[ptidx]*2.f;
 205         nextPts[ptidx] = nextPt;
 206
 207         Point2i iprevPt, inextPt;
 208         prevPt -= halfWin;
 209         iprevPt.x = cvFloor(prevPt.x);
 210         iprevPt.y = cvFloor(prevPt.y);
 211
 212         if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
 213             iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
 214         {
 215             if( level == 0 )
 216             {
 217                 if( status )
 218                     status[ptidx] = false;
 219                 if( err )
 220                     err[ptidx] = 0;
 221             }
 222             continue;
 223         }
 224
 225         float a = prevPt.x - iprevPt.x;
 226         float b = prevPt.y - iprevPt.y;
 227         const int W_BITS = 14, W_BITS1 = 14;
 228         const float FLT_SCALE = 1.f/(1 << 20);
 229         int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
 230         int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
 231         int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
 232         int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
 233
 234         int dstep = (int)(derivI.step/derivI.elemSize1());
 235         int stepI = (int)(I.step/I.elemSize1());
 236         int stepJ = (int)(J.step/J.elemSize1());
 237         acctype iA11 = 0, iA12 = 0, iA22 = 0;
 238         float A11, A12, A22;
 239
 240 #if CV_SSE2
 241         __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
 242         __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
 243         __m128i z = _mm_setzero_si128();
 244         __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
 245         __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
 246         __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
 247 #endif
 248
 249 #if CV_NEON
 250
 251         float CV_DECL_ALIGNED(16) nA11[] = { 0, 0, 0, 0 }, nA12[] = { 0, 0, 0, 0 }, nA22[] = { 0, 0, 0, 0 };
 252         const int shifter1 = -(W_BITS - 5); //negative so it shifts right
 253         const int shifter2 = -(W_BITS);
 254
 255         const int16x4_t d26 = vdup_n_s16((int16_t)iw00);
 256         const int16x4_t d27 = vdup_n_s16((int16_t)iw01);
 257         const int16x4_t d28 = vdup_n_s16((int16_t)iw10);
 258         const int16x4_t d29 = vdup_n_s16((int16_t)iw11);
 259         const int32x4_t q11 = vdupq_n_s32((int32_t)shifter1);
 260         const int32x4_t q12 = vdupq_n_s32((int32_t)shifter2);
 261
 262 #endif
 263
 264         // extract the patch from the first image, compute covariation matrix of derivatives
 265         int x, y;
 266         for( y = 0; y < winSize.height; y++ )
 267         {
 268             const uchar* src = I.ptr() + (y + iprevPt.y)*stepI + iprevPt.x*cn;
 269             const deriv_type* dsrc = derivI.ptr<deriv_type>() + (y + iprevPt.y)*dstep + iprevPt.x*cn2;
 270
 271             deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
 272             deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);
 273
 274             x = 0;
 275
 276 #if CV_SSE2
 277             for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
 278             {
 279                 __m128i v00, v01, v10, v11, t0, t1;
 280
 281                 v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
 282                 v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
 283                 v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
 284                 v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
 285
 286                 t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
 287                                    _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
 288                 t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
 289                 _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
 290
 291                 v00 = _mm_loadu_si128((const __m128i*)(dsrc));
 292                 v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
 293                 v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
 294                 v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
 295
 296                 t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
 297                                    _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
 298                 t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
 299                                    _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
 300                 t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
 301                 t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
 302                 v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
 303
 304                 _mm_storeu_si128((__m128i*)dIptr, v00);
 305                 t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
 306                 t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
 307
 308                 __m128 fy = _mm_cvtepi32_ps(t0);
 309                 __m128 fx = _mm_cvtepi32_ps(t1);
 310
 311                 qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
 312                 qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
 313                 qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
 314             }
 315 #endif
 316
 317 #if CV_NEON
 318             for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
 319             {
 320
 321                 uint8x8_t d0 = vld1_u8(&src[x]);
 322                 uint8x8_t d2 = vld1_u8(&src[x+cn]);
 323                 uint16x8_t q0 = vmovl_u8(d0);
 324                 uint16x8_t q1 = vmovl_u8(d2);
 325
 326                 int32x4_t q5 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26);
 327                 int32x4_t q6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27);
 328
 329                 uint8x8_t d4 = vld1_u8(&src[x + stepI]);
 330                 uint8x8_t d6 = vld1_u8(&src[x + stepI + cn]);
 331                 uint16x8_t q2 = vmovl_u8(d4);
 332                 uint16x8_t q3 = vmovl_u8(d6);
 333
 334                 int32x4_t q7 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28);
 335                 int32x4_t q8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29);
 336
 337                 q5 = vaddq_s32(q5, q6);
 338                 q7 = vaddq_s32(q7, q8);
 339                 q5 = vaddq_s32(q5, q7);
 340
 341                 int16x4x2_t d0d1 = vld2_s16(dsrc);
 342                 int16x4x2_t d2d3 = vld2_s16(&dsrc[cn2]);
 343
 344                 q5 = vqrshlq_s32(q5, q11);
 345
 346                 int32x4_t q4 = vmull_s16(d0d1.val[0], d26);
 347                 q6 = vmull_s16(d0d1.val[1], d26);
 348
 349                 int16x4_t nd0 = vmovn_s32(q5);
 350
 351                 q7 = vmull_s16(d2d3.val[0], d27);
 352                 q8 = vmull_s16(d2d3.val[1], d27);
 353
 354                 vst1_s16(&Iptr[x], nd0);
 355
 356                 int16x4x2_t d4d5 = vld2_s16(&dsrc[dstep]);
 357                 int16x4x2_t d6d7 = vld2_s16(&dsrc[dstep+cn2]);
 358
 359                 q4 = vaddq_s32(q4, q7);
 360                 q6 = vaddq_s32(q6, q8);
 361
 362                 q7 = vmull_s16(d4d5.val[0], d28);
 363                 int32x4_t q14 = vmull_s16(d4d5.val[1], d28);
 364                 q8 = vmull_s16(d6d7.val[0], d29);
 365                 int32x4_t q15 = vmull_s16(d6d7.val[1], d29);
 366
 367                 q7 = vaddq_s32(q7, q8);
 368                 q14 = vaddq_s32(q14, q15);
 369
 370                 q4 = vaddq_s32(q4, q7);
 371                 q6 = vaddq_s32(q6, q14);
 372
 373                 float32x4_t nq0 = vld1q_f32(nA11);
 374                 float32x4_t nq1 = vld1q_f32(nA12);
 375                 float32x4_t nq2 = vld1q_f32(nA22);
 376
 377                 q4 = vqrshlq_s32(q4, q12);
 378                 q6 = vqrshlq_s32(q6, q12);
 379
 380                 q7 = vmulq_s32(q4, q4);
 381                 q8 = vmulq_s32(q4, q6);
 382                 q15 = vmulq_s32(q6, q6);
 383
 384                 nq0 = vaddq_f32(nq0, vcvtq_f32_s32(q7));
 385                 nq1 = vaddq_f32(nq1, vcvtq_f32_s32(q8));
 386                 nq2 = vaddq_f32(nq2, vcvtq_f32_s32(q15));
 387
 388                 vst1q_f32(nA11, nq0);
 389                 vst1q_f32(nA12, nq1);
 390                 vst1q_f32(nA22, nq2);
 391
 392                 int16x4_t d8 = vmovn_s32(q4);
 393                 int16x4_t d12 = vmovn_s32(q6);
 394
 395                 int16x4x2_t d8d12;
 396                 d8d12.val[0] = d8; d8d12.val[1] = d12;
 397                 vst2_s16(dIptr, d8d12);
 398             }
 399 #endif
 400
 401             for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
 402             {
 403                 int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
 404                                       src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
 405                 int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
 406                                        dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
 407                 int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
 408                                        dsrc[dstep+cn2+1]*iw11, W_BITS1);
 409
 410                 Iptr[x] = (short)ival;
 411                 dIptr[0] = (short)ixval;
 412                 dIptr[1] = (short)iyval;
 413
 414                 iA11 += (itemtype)(ixval*ixval);
 415                 iA12 += (itemtype)(ixval*iyval);
 416                 iA22 += (itemtype)(iyval*iyval);
 417             }
 418         }
 419
 420 #if CV_SSE2
 421         float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
 422         _mm_store_ps(A11buf, qA11);
 423         _mm_store_ps(A12buf, qA12);
 424         _mm_store_ps(A22buf, qA22);
 425         iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
 426         iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
 427         iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
 428 #endif
 429
 430 #if CV_NEON
 431         iA11 += nA11[0] + nA11[1] + nA11[2] + nA11[3];
 432         iA12 += nA12[0] + nA12[1] + nA12[2] + nA12[3];
 433         iA22 += nA22[0] + nA22[1] + nA22[2] + nA22[3];
 434 #endif
 435
 436         A11 = iA11*FLT_SCALE;
 437         A12 = iA12*FLT_SCALE;
 438         A22 = iA22*FLT_SCALE;
 439
 440         float D = A11*A22 - A12*A12;
 441         float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
 442                         4.f*A12*A12))/(2*winSize.width*winSize.height);
 443
 444         if( err && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0 )
 445             err[ptidx] = (float)minEig;
 446
 447         if( minEig < minEigThreshold || D < FLT_EPSILON )
 448         {
 449             if( level == 0 && status )
 450                 status[ptidx] = false;
 451             continue;
 452         }
 453
 454         D = 1.f/D;
 455
 456         nextPt -= halfWin;
 457         Point2f prevDelta;
 458
 459         for( j = 0; j < criteria.maxCount; j++ )
 460         {
 461             inextPt.x = cvFloor(nextPt.x);
 462             inextPt.y = cvFloor(nextPt.y);
 463
 464             if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
 465                inextPt.y < -winSize.height || inextPt.y >= J.rows )
 466             {
 467                 if( level == 0 && status )
 468                     status[ptidx] = false;
 469                 break;
 470             }
 471
 472             a = nextPt.x - inextPt.x;
 473             b = nextPt.y - inextPt.y;
 474             iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
 475             iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
 476             iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
 477             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
 478             acctype ib1 = 0, ib2 = 0;
 479             float b1, b2;
 480 #if CV_SSE2
 481             qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
 482             qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
 483             __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
 484 #endif
 485
 486 #if CV_NEON
 487             float CV_DECL_ALIGNED(16) nB1[] = { 0,0,0,0 }, nB2[] = { 0,0,0,0 };
 488
 489             const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
 490             const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
 491             const int16x4_t d28_2 = vdup_n_s16((int16_t)iw10);
 492             const int16x4_t d29_2 = vdup_n_s16((int16_t)iw11);
 493
 494 #endif
 495
 496             for( y = 0; y < winSize.height; y++ )
 497             {
 498                 const uchar* Jptr = J.ptr() + (y + inextPt.y)*stepJ + inextPt.x*cn;
 499                 const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
 500                 const deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);
 501
 502                 x = 0;
 503
 504 #if CV_SSE2
 505                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
 506                 {
 507                     __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
 508                     __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
 509                     __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
 510                     __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
 511                     __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
 512
 513                     __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
 514                                                _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
 515                     __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
 516                                                _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
 517                     t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
 518                     t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
 519                     diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
 520                     diff1 = _mm_unpackhi_epi16(diff0, diff0);
 521                     diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
 522                     v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
 523                     v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
 524                     v10 = _mm_unpacklo_epi16(v00, v01);
 525                     v11 = _mm_unpackhi_epi16(v00, v01);
 526                     v00 = _mm_unpacklo_epi16(diff0, diff1);
 527                     v01 = _mm_unpackhi_epi16(diff0, diff1);
 528                     v00 = _mm_madd_epi16(v00, v10);
 529                     v11 = _mm_madd_epi16(v01, v11);
 530                     qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
 531                     qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
 532                 }
 533 #endif
 534
 535 #if CV_NEON
 536                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
 537                 {
 538
 539                     uint8x8_t d0 = vld1_u8(&Jptr[x]);
 540                     uint8x8_t d2 = vld1_u8(&Jptr[x+cn]);
 541                     uint8x8_t d4 = vld1_u8(&Jptr[x+stepJ]);
 542                     uint8x8_t d6 = vld1_u8(&Jptr[x+stepJ+cn]);
 543
 544                     uint16x8_t q0 = vmovl_u8(d0);
 545                     uint16x8_t q1 = vmovl_u8(d2);
 546                     uint16x8_t q2 = vmovl_u8(d4);
 547                     uint16x8_t q3 = vmovl_u8(d6);
 548
 549                     int32x4_t nq4 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26_2);
 550                     int32x4_t nq5 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q0)), d26_2);
 551
 552                     int32x4_t nq6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27_2);
 553                     int32x4_t nq7 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q1)), d27_2);
 554
 555                     int32x4_t nq8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28_2);
 556                     int32x4_t nq9 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q2)), d28_2);
 557
 558                     int32x4_t nq10 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29_2);
 559                     int32x4_t nq11 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q3)), d29_2);
 560
 561                     nq4 = vaddq_s32(nq4, nq6);
 562                     nq5 = vaddq_s32(nq5, nq7);
 563                     nq8 = vaddq_s32(nq8, nq10);
 564                     nq9 = vaddq_s32(nq9, nq11);
 565
 566                     int16x8_t q6 = vld1q_s16(&Iptr[x]);
 567
 568                     nq4 = vaddq_s32(nq4, nq8);
 569                     nq5 = vaddq_s32(nq5, nq9);
 570
 571                     nq8 = vmovl_s16(vget_high_s16(q6));
 572                     nq6 = vmovl_s16(vget_low_s16(q6));
 573
 574                     nq4 = vqrshlq_s32(nq4, q11);
 575                     nq5 = vqrshlq_s32(nq5, q11);
 576
 577                     int16x8x2_t q0q1 = vld2q_s16(dIptr);
 578                     float32x4_t nB1v = vld1q_f32(nB1);
 579                     float32x4_t nB2v = vld1q_f32(nB2);
 580
 581                     nq4 = vsubq_s32(nq4, nq6);
 582                     nq5 = vsubq_s32(nq5, nq8);
 583
 584                     int32x4_t nq2 = vmovl_s16(vget_low_s16(q0q1.val[0]));
 585                     int32x4_t nq3 = vmovl_s16(vget_high_s16(q0q1.val[0]));
 586
 587                     nq7 = vmovl_s16(vget_low_s16(q0q1.val[1]));
 588                     nq8 = vmovl_s16(vget_high_s16(q0q1.val[1]));
 589
 590                     nq9 = vmulq_s32(nq4, nq2);
 591                     nq10 = vmulq_s32(nq5, nq3);
 592
 593                     nq4 = vmulq_s32(nq4, nq7);
 594                     nq5 = vmulq_s32(nq5, nq8);
 595
 596                     nq9 = vaddq_s32(nq9, nq10);
 597                     nq4 = vaddq_s32(nq4, nq5);
 598
 599                     nB1v = vaddq_f32(nB1v, vcvtq_f32_s32(nq9));
 600                     nB2v = vaddq_f32(nB2v, vcvtq_f32_s32(nq4));
 601
 602                     vst1q_f32(nB1, nB1v);
 603                     vst1q_f32(nB2, nB2v);
 604                 }
 605 #endif
 606
 607                 for( ; x < winSize.width*cn; x++, dIptr += 2 )
 608                 {
 609                     int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
 610                                           Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
 611                                           W_BITS1-5) - Iptr[x];
 612                     ib1 += (itemtype)(diff*dIptr[0]);
 613                     ib2 += (itemtype)(diff*dIptr[1]);
 614                 }
 615             }
 616
 617 #if CV_SSE2
 618             float CV_DECL_ALIGNED(16) bbuf[4];
 619             _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
 620             ib1 += bbuf[0] + bbuf[2];
 621             ib2 += bbuf[1] + bbuf[3];
 622 #endif
 623
 624 #if CV_NEON
 625
 626             ib1 += (float)(nB1[0] + nB1[1] + nB1[2] + nB1[3]);
 627             ib2 += (float)(nB2[0] + nB2[1] + nB2[2] + nB2[3]);
 628 #endif
 629
 630             b1 = ib1*FLT_SCALE;
 631             b2 = ib2*FLT_SCALE;
 632
 633             Point2f delta( (float)((A12*b2 - A22*b1) * D),
 634                           (float)((A12*b1 - A11*b2) * D));
 635             //delta = -delta;
 636
 637             nextPt += delta;
 638             nextPts[ptidx] = nextPt + halfWin;
 639
 640             if( delta.ddot(delta) <= criteria.epsilon )
 641                 break;
 642
 643             if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
 644                std::abs(delta.y + prevDelta.y) < 0.01 )
 645             {
 646                 nextPts[ptidx] -= delta*0.5f;
 647                 break;
 648             }
 649             prevDelta = delta;
 650         }
 651
 652         CV_Assert(status != NULL);
 653         if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 )
 654         {
 655             Point2f nextPoint = nextPts[ptidx] - halfWin;
 656             Point inextPoint;
 657
 658             inextPoint.x = cvFloor(nextPoint.x);
 659             inextPoint.y = cvFloor(nextPoint.y);
 660
 661             if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
 662                 inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
 663             {
 664                 if( status )
 665                     status[ptidx] = false;
 666                 continue;
 667             }
 668
 669             float aa = nextPoint.x - inextPoint.x;
 670             float bb = nextPoint.y - inextPoint.y;
 671             iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
 672             iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
 673             iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
 674             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
 675             float errval = 0.f;
 676
 677             for( y = 0; y < winSize.height; y++ )
 678             {
 679                 const uchar* Jptr = J.ptr() + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
 680                 const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
 681
 682                 for( x = 0; x < winSize.width*cn; x++ )
 683                 {
 684                     int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
 685                                           Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
 686                                           W_BITS1-5) - Iptr[x];
 687                     errval += std::abs((float)diff);
 688                 }
 689             }
 690             err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
 691         }
 692     }
 693 }
 694
 695 int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives,
 696                                 int pyrBorder, int derivBorder, bool tryReuseInputImage)
 697 {
 698     CV_INSTRUMENT_REGION()
 699
 700     Mat img = _img.getMat();
 701     CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2 );
 702     int pyrstep = withDerivatives ? 2 : 1;
 703
 704     pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0);
 705
 706     int derivType = CV_MAKETYPE(DataType<cv::detail::deriv_type>::depth, img.channels() * 2);
 707
 708     //level 0
 709     bool lvl0IsSet = false;
 710     if(tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0)
 711     {
 712         Size wholeSize;
 713         Point ofs;
 714         img.locateROI(wholeSize, ofs);
 715         if (ofs.x >= winSize.width && ofs.y >= winSize.height
 716               && ofs.x + img.cols + winSize.width <= wholeSize.width
 717               && ofs.y + img.rows + winSize.height <= wholeSize.height)
 718         {
 719             pyramid.getMatRef(0) = img;
 720             lvl0IsSet = true;
 721         }
 722     }
 723
 724     if(!lvl0IsSet)
 725     {
 726         Mat& temp = pyramid.getMatRef(0);
 727
 728         if(!temp.empty())
 729             temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
 730         if(temp.type() != img.type() || temp.cols != winSize.width*2 + img.cols || temp.rows != winSize.height * 2 + img.rows)
 731             temp.create(img.rows + winSize.height*2, img.cols + winSize.width*2, img.type());
 732
 733         if(pyrBorder == BORDER_TRANSPARENT)
 734             img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows)));
 735         else
 736             copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder);
 737         temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
 738     }
 739
 740     Size sz = img.size();
 741     Mat prevLevel = pyramid.getMatRef(0);
 742     Mat thisLevel = prevLevel;
 743
 744     for(int level = 0; level <= maxLevel; ++level)
 745     {
 746         if (level != 0)
 747         {
 748             Mat& temp = pyramid.getMatRef(level * pyrstep);
 749
 750             if(!temp.empty())
 751                 temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
 752             if(temp.type() != img.type() || temp.cols != winSize.width*2 + sz.width || temp.rows != winSize.height * 2 + sz.height)
 753                 temp.create(sz.height + winSize.height*2, sz.width + winSize.width*2, img.type());
 754
 755             thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height));
 756             pyrDown(prevLevel, thisLevel, sz);
 757
 758             if(pyrBorder != BORDER_TRANSPARENT)
 759                 copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder|BORDER_ISOLATED);
 760             temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
 761         }
 762
 763         if(withDerivatives)
 764         {
 765             Mat& deriv = pyramid.getMatRef(level * pyrstep + 1);
 766
 767             if(!deriv.empty())
 768                 deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
 769             if(deriv.type() != derivType || deriv.cols != winSize.width*2 + sz.width || deriv.rows != winSize.height * 2 + sz.height)
 770                 deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType);
 771
 772             Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height));
 773             calcSharrDeriv(thisLevel, derivI);
 774
 775             if(derivBorder != BORDER_TRANSPARENT)
 776                 copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED);
 777             deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
 778         }
 779
 780         sz = Size((sz.width+1)/2, (sz.height+1)/2);
 781         if( sz.width <= winSize.width || sz.height <= winSize.height )
 782         {
 783             pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true, 0);//check this
 784             return level;
 785         }
 786
 787         prevLevel = thisLevel;
 788     }
 789
 790     return maxLevel;
 791 }
 792
 793 namespace cv
 794 {
 795 namespace
 796 {
 797     class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow
 798     {
 799         struct dim3
 800         {
 801             unsigned int x, y, z;
 802             dim3() : x(0), y(0), z(0) { }
 803         };
 804     public:
 805         SparsePyrLKOpticalFlowImpl(Size winSize_ = Size(21,21),
 806                          int maxLevel_ = 3,
 807                          TermCriteria criteria_ = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
 808                          int flags_ = 0,
 809                          double minEigThreshold_ = 1e-4) :
 810           winSize(winSize_), maxLevel(maxLevel_), criteria(criteria_), flags(flags_), minEigThreshold(minEigThreshold_)
 811 #ifdef HAVE_OPENCL
 812           , iters(criteria_.maxCount), derivLambda(criteria_.epsilon), useInitialFlow(0 != (flags_ & OPTFLOW_LK_GET_MIN_EIGENVALS)), waveSize(0)
 813 #endif
 814         {
 815         }
 816
 817         virtual Size getWinSize() const CV_OVERRIDE { return winSize;}
 818         virtual void setWinSize(Size winSize_) CV_OVERRIDE { winSize = winSize_;}
 819
 820         virtual int getMaxLevel() const CV_OVERRIDE { return maxLevel;}
 821         virtual void setMaxLevel(int maxLevel_) CV_OVERRIDE { maxLevel = maxLevel_;}
 822
 823         virtual TermCriteria getTermCriteria() const CV_OVERRIDE { return criteria;}
 824         virtual void setTermCriteria(TermCriteria& crit_) CV_OVERRIDE { criteria=crit_;}
 825
 826         virtual int getFlags() const CV_OVERRIDE { return flags; }
 827         virtual void setFlags(int flags_) CV_OVERRIDE { flags=flags_;}
 828
 829         virtual double getMinEigThreshold() const CV_OVERRIDE { return minEigThreshold;}
 830         virtual void setMinEigThreshold(double minEigThreshold_) CV_OVERRIDE { minEigThreshold=minEigThreshold_;}
 831
 832         virtual void calc(InputArray prevImg, InputArray nextImg,
 833                           InputArray prevPts, InputOutputArray nextPts,
 834                           OutputArray status,
 835                           OutputArray err = cv::noArray()) CV_OVERRIDE;
 836
 837     private:
 838 #ifdef HAVE_OPENCL
 839         bool checkParam()
 840         {
 841             iters = std::min(std::max(iters, 0), 100);
 842
 843             derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
 844             if (derivLambda < 0)
 845                 return false;
 846             if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
 847                 return false;
 848             if (winSize.width < 8 || winSize.height < 8 ||
 849                 winSize.width > 24 || winSize.height > 24)
 850                 return false;
 851             calcPatchSize();
 852             if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
 853                 return false;
 854             if (!initWaveSize())
 855                 return false;
 856             return true;
 857         }
 858
 859         bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
 860         {
 861             if (!checkParam())
 862                 return false;
 863
 864             UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
 865             UMat temp2 = nextPts.reshape(1);
 866             multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);
 867
 868             status.setTo(Scalar::all(1));
 869
 870             // build the image pyramids.
 871             std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
 872             std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
 873
 874             // allocate buffers with aligned pitch to be able to use cl_khr_image2d_from_buffer extension
 875             // This is the required pitch alignment in pixels
 876             int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment();
 877             if (pitchAlign>0)
 878             {
 879                 prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,prevImg.cols);
 880                 nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,nextImg.cols);
 881                 for (int level = 1; level <= maxLevel; ++level)
 882                 {
 883                     int cols,rows;
 884                     // allocate buffers with aligned pitch to be able to use image on buffer extension
 885                     cols = (prevPyr[level - 1].cols+1)/2;
 886                     rows = (prevPyr[level - 1].rows+1)/2;
 887                     prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols);
 888                     cols = (nextPyr[level - 1].cols+1)/2;
 889                     rows = (nextPyr[level - 1].rows+1)/2;
 890                     nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols);
 891                 }
 892             }
 893
 894             prevImg.convertTo(prevPyr[0], CV_32F);
 895             nextImg.convertTo(nextPyr[0], CV_32F);
 896
 897             for (int level = 1; level <= maxLevel; ++level)
 898             {
 899                 pyrDown(prevPyr[level - 1], prevPyr[level]);
 900                 pyrDown(nextPyr[level - 1], nextPyr[level]);
 901             }
 902
 903             // dI/dx ~ Ix, dI/dy ~ Iy
 904             for (int level = maxLevel; level >= 0; level--)
 905             {
 906                 if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
 907                                   nextPts, status, err,
 908                                   prevPts.cols, level))
 909                     return false;
 910             }
 911             return true;
 912         }
 913 #endif
 914
 915         Size winSize;
 916         int maxLevel;
 917         TermCriteria criteria;
 918         int flags;
 919         double minEigThreshold;
 920 #ifdef HAVE_OPENCL
 921         int iters;
 922         double derivLambda;
 923         bool useInitialFlow;
 924         int waveSize;
 925         bool initWaveSize()
 926         {
 927             waveSize = 1;
 928             if (isDeviceCPU())
 929                 return true;
 930
 931             ocl::Kernel kernel;
 932             if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
 933                 return false;
 934             waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
 935             return true;
 936         }
 937         dim3 patch;
 938         void calcPatchSize()
 939         {
 940             dim3 block;
 941
 942             if (winSize.width > 32 && winSize.width > 2 * winSize.height)
 943             {
 944                 block.x = 32;
 945                 block.y = 8;
 946             }
 947             else
 948             {
 949                 block.x = 16;
 950                 block.y = 16;
 951             }
 952
 953             patch.x = (winSize.width  + block.x - 1) / block.x;
 954             patch.y = (winSize.height + block.y - 1) / block.y;
 955
 956             block.z = patch.z = 1;
 957         }
 958
 959         bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
 960             int ptcount, int level)
 961         {
 962             size_t localThreads[3]  = { 8, 8};
 963             size_t globalThreads[3] = { 8 * (size_t)ptcount, 8};
 964             char calcErr = (0 == level) ? 1 : 0;
 965
 966             int wsx = 1, wsy = 1;
 967             if(winSize.width < 16)
 968                 wsx = 0;
 969             if(winSize.height < 16)
 970                 wsy = 0;
 971             cv::String build_options;
 972             if (isDeviceCPU())
 973                 build_options = " -D CPU";
 974             else
 975                 build_options = cv::format("-D WAVE_SIZE=%d -D WSX=%d -D WSY=%d",
 976                                            waveSize, wsx, wsy);
 977
 978             ocl::Kernel kernel;
 979             if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
 980                 return false;
 981
 982             CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F);
 983             ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I));
 984             ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J));
 985
 986             int idxArg = 0;
 987             idxArg = kernel.set(idxArg, imageI); //image2d_t I
 988             idxArg = kernel.set(idxArg, imageJ); //image2d_t J
 989             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
 990             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
 991             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
 992             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
 993             idxArg = kernel.set(idxArg, (int)level); // const int level
 994             idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
 995             idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
 996             idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
 997             idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
 998             idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
 999             idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
1000             idxArg = kernel.set(idxArg, (int)iters); // int c_iters
1001             idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
1002             return kernel.run(2, globalThreads, localThreads, true); // sync=true because ocl::Image2D lifetime is not handled well for temp UMat
1003         }
1004     private:
1005         inline static bool isDeviceCPU()
1006         {
1007             return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
1008         }
1009
1010
1011     bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
1012                                          InputArray _prevPts, InputOutputArray _nextPts,
1013                                          OutputArray _status, OutputArray _err)
1014     {
1015         if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
1016             return false;
1017         if (!cv::ocl::Device::getDefault().imageSupport())
1018             return false;
1019         if (_nextImg.size() != _prevImg.size())
1020             return false;
1021         int typePrev = _prevImg.type();
1022         int typeNext = _nextImg.type();
1023         if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
1024             return false;
1025         if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
1026             return false;
1027
1028         if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
1029             return false;
1030         if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
1031             return false;
1032         size_t npoints = _prevPts.total();
1033         if (useInitialFlow)
1034         {
1035             if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
1036                 return false;
1037             if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
1038                 return false;
1039             if (_nextPts.total() != npoints)
1040                 return false;
1041         }
1042         else
1043         {
1044             _nextPts.create(_prevPts.size(), _prevPts.type());
1045         }
1046
1047         if (!checkParam())
1048             return false;
1049
1050         UMat umatErr;
1051         if (_err.needed())
1052         {
1053             _err.create((int)npoints, 1, CV_32FC1);
1054             umatErr = _err.getUMat();
1055         }
1056         else
1057             umatErr.create((int)npoints, 1, CV_32FC1);
1058
1059         _status.create((int)npoints, 1, CV_8UC1);
1060         UMat umatNextPts = _nextPts.getUMat();
1061         UMat umatStatus = _status.getUMat();
1062         return sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
1063     }
1064 #endif
1065
1066 #ifdef HAVE_OPENVX
1067     bool openvx_pyrlk(InputArray _prevImg, InputArray _nextImg, InputArray _prevPts, InputOutputArray _nextPts,
1068                              OutputArray _status, OutputArray _err)
1069     {
1070         using namespace ivx;
1071
1072         // Pyramids as inputs are not acceptable because there's no (direct or simple) way
1073         // to build vx_pyramid on user data
1074         if(_prevImg.kind() != _InputArray::MAT || _nextImg.kind() != _InputArray::MAT)
1075             return false;
1076
1077         Mat prevImgMat = _prevImg.getMat(), nextImgMat = _nextImg.getMat();
1078
1079         if(prevImgMat.type() != CV_8UC1 || nextImgMat.type() != CV_8UC1)
1080             return false;
1081
1082         if (ovx::skipSmallImages<VX_KERNEL_OPTICAL_FLOW_PYR_LK>(prevImgMat.cols, prevImgMat.rows))
1083             return false;
1084
1085         CV_Assert(prevImgMat.size() == nextImgMat.size());
1086         Mat prevPtsMat = _prevPts.getMat();
1087         int checkPrev = prevPtsMat.checkVector(2, CV_32F, false);
1088         CV_Assert( checkPrev >= 0 );
1089         size_t npoints = checkPrev;
1090
1091         if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
1092             _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
1093         Mat nextPtsMat = _nextPts.getMat();
1094         CV_Assert( nextPtsMat.checkVector(2, CV_32F, false) == (int)npoints );
1095
1096         _status.create((int)npoints, 1, CV_8U, -1, true);
1097         Mat statusMat = _status.getMat();
1098         uchar* status = statusMat.ptr();
1099         for(size_t i = 0; i < npoints; i++ )
1100             status[i] = true;
1101
1102         // OpenVX doesn't return detection errors
1103         if( _err.needed() )
1104         {
1105             return false;
1106         }
1107
1108         try
1109         {
1110             Context context = ovx::getOpenVXContext();
1111
1112             if(context.vendorID() == VX_ID_KHRONOS)
1113             {
1114                 // PyrLK in OVX 1.0.1 performs vxCommitImagePatch incorrecty and crashes
1115                 if(VX_VERSION == VX_VERSION_1_0)
1116                     return false;
1117                 // Implementation ignores border mode
1118                 // So check that minimal size of image in pyramid is big enough
1119                 int width = prevImgMat.cols, height = prevImgMat.rows;
1120                 for(int i = 0; i < maxLevel+1; i++)
1121                 {
1122                     if(width < winSize.width + 1 || height < winSize.height + 1)
1123                         return false;
1124                     else
1125                     {
1126                         width /= 2; height /= 2;
1127                     }
1128                 }
1129             }
1130
1131             Image prevImg = Image::createFromHandle(context, Image::matTypeToFormat(prevImgMat.type()),
1132                                                     Image::createAddressing(prevImgMat), (void*)prevImgMat.data);
1133             Image nextImg = Image::createFromHandle(context, Image::matTypeToFormat(nextImgMat.type()),
1134                                                     Image::createAddressing(nextImgMat), (void*)nextImgMat.data);
1135
1136             Graph graph = Graph::create(context);
1137
1138             Pyramid prevPyr = Pyramid::createVirtual(graph, (vx_size)maxLevel+1, VX_SCALE_PYRAMID_HALF,
1139                                                      prevImg.width(), prevImg.height(), prevImg.format());
1140             Pyramid nextPyr = Pyramid::createVirtual(graph, (vx_size)maxLevel+1, VX_SCALE_PYRAMID_HALF,
1141                                                      nextImg.width(), nextImg.height(), nextImg.format());
1142
1143             ivx::Node::create(graph, VX_KERNEL_GAUSSIAN_PYRAMID, prevImg, prevPyr);
1144             ivx::Node::create(graph, VX_KERNEL_GAUSSIAN_PYRAMID, nextImg, nextPyr);
1145
1146             Array prevPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1147             Array estimatedPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1148             Array nextPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1149
1150             std::vector<vx_keypoint_t> vxPrevPts(npoints), vxEstPts(npoints), vxNextPts(npoints);
1151             for(size_t i = 0; i < npoints; i++)
1152             {
1153                 vx_keypoint_t& prevPt = vxPrevPts[i]; vx_keypoint_t& estPt  = vxEstPts[i];
1154                 prevPt.x = prevPtsMat.at<Point2f>(i).x; prevPt.y = prevPtsMat.at<Point2f>(i).y;
1155                  estPt.x = nextPtsMat.at<Point2f>(i).x;  estPt.y = nextPtsMat.at<Point2f>(i).y;
1156                 prevPt.tracking_status = estPt.tracking_status = vx_true_e;
1157             }
1158             prevPts.addItems(vxPrevPts); estimatedPts.addItems(vxEstPts);
1159
1160             if( (criteria.type & TermCriteria::COUNT) == 0 )
1161                 criteria.maxCount = 30;
1162             else
1163                 criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
1164             if( (criteria.type & TermCriteria::EPS) == 0 )
1165                 criteria.epsilon = 0.01;
1166             else
1167                 criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
1168             criteria.epsilon *= criteria.epsilon;
1169
1170             vx_enum termEnum = (criteria.type == TermCriteria::COUNT) ? VX_TERM_CRITERIA_ITERATIONS :
1171                                (criteria.type == TermCriteria::EPS) ? VX_TERM_CRITERIA_EPSILON :
1172                                VX_TERM_CRITERIA_BOTH;
1173
1174             //minEigThreshold is fixed to 0.0001f
1175             ivx::Scalar termination = ivx::Scalar::create<VX_TYPE_ENUM>(context, termEnum);
1176             ivx::Scalar epsilon = ivx::Scalar::create<VX_TYPE_FLOAT32>(context, criteria.epsilon);
1177             ivx::Scalar numIterations = ivx::Scalar::create<VX_TYPE_UINT32>(context, criteria.maxCount);
1178             ivx::Scalar useInitial = ivx::Scalar::create<VX_TYPE_BOOL>(context, (vx_bool)(flags & OPTFLOW_USE_INITIAL_FLOW));
1179             //assume winSize is square
1180             ivx::Scalar windowSize = ivx::Scalar::create<VX_TYPE_SIZE>(context, (vx_size)winSize.width);
1181
1182             ivx::Node::create(graph, VX_KERNEL_OPTICAL_FLOW_PYR_LK, prevPyr, nextPyr, prevPts, estimatedPts,
1183                               nextPts, termination, epsilon, numIterations, useInitial, windowSize);
1184
1185             graph.verify();
1186             graph.process();
1187
1188             nextPts.copyTo(vxNextPts);
1189             for(size_t i = 0; i < npoints; i++)
1190             {
1191                 vx_keypoint_t kp = vxNextPts[i];
1192                 nextPtsMat.at<Point2f>(i) = Point2f(kp.x, kp.y);
1193                 statusMat.at<uchar>(i) = (bool)kp.tracking_status;
1194             }
1195
1196 #ifdef VX_VERSION_1_1
1197         //we should take user memory back before release
1198         //(it's not done automatically according to standard)
1199         prevImg.swapHandle(); nextImg.swapHandle();
1200 #endif
1201         }
1202         catch (RuntimeError & e)
1203         {
1204             VX_DbgThrow(e.what());
1205         }
1206         catch (WrapperError & e)
1207         {
1208             VX_DbgThrow(e.what());
1209         }
1210
1211         return true;
1212     }
1213 #endif
1214 };
1215
1216
1217
1218 void SparsePyrLKOpticalFlowImpl::calc( InputArray _prevImg, InputArray _nextImg,
1219                            InputArray _prevPts, InputOutputArray _nextPts,
1220                            OutputArray _status, OutputArray _err)
1221 {
1222     CV_INSTRUMENT_REGION()
1223
1224     CV_OCL_RUN(ocl::isOpenCLActivated() &&
1225                (_prevImg.isUMat() || _nextImg.isUMat()) &&
1226                ocl::Image2D::isFormatSupported(CV_32F, 1, false),
1227                ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err))
1228
1229     // Disabled due to bad accuracy
1230     CV_OVX_RUN(false,
1231                openvx_pyrlk(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err))
1232
1233     Mat prevPtsMat = _prevPts.getMat();
1234     const int derivDepth = DataType<cv::detail::deriv_type>::depth;
1235
1236     CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );
1237
1238     int level=0, i, npoints;
1239     CV_Assert( (npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0 );
1240
1241     if( npoints == 0 )
1242     {
1243         _nextPts.release();
1244         _status.release();
1245         _err.release();
1246         return;
1247     }
1248
1249     if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
1250         _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
1251
1252     Mat nextPtsMat = _nextPts.getMat();
1253     CV_Assert( nextPtsMat.checkVector(2, CV_32F, true) == npoints );
1254
1255     const Point2f* prevPts = prevPtsMat.ptr<Point2f>();
1256     Point2f* nextPts = nextPtsMat.ptr<Point2f>();
1257
1258     _status.create((int)npoints, 1, CV_8U, -1, true);
1259     Mat statusMat = _status.getMat(), errMat;
1260     CV_Assert( statusMat.isContinuous() );
1261     uchar* status = statusMat.ptr();
1262     float* err = 0;
1263
1264     for( i = 0; i < npoints; i++ )
1265         status[i] = true;
1266
1267     if( _err.needed() )
1268     {
1269         _err.create((int)npoints, 1, CV_32F, -1, true);
1270         errMat = _err.getMat();
1271         CV_Assert( errMat.isContinuous() );
1272         err = errMat.ptr<float>();
1273     }
1274
1275     std::vector<Mat> prevPyr, nextPyr;
1276     int levels1 = -1;
1277     int lvlStep1 = 1;
1278     int levels2 = -1;
1279     int lvlStep2 = 1;
1280
1281     if(_prevImg.kind() == _InputArray::STD_VECTOR_MAT)
1282     {
1283         _prevImg.getMatVector(prevPyr);
1284
1285         levels1 = int(prevPyr.size()) - 1;
1286         CV_Assert(levels1 >= 0);
1287
1288         if (levels1 % 2 == 1 && prevPyr[0].channels() * 2 == prevPyr[1].channels() && prevPyr[1].depth() == derivDepth)
1289         {
1290             lvlStep1 = 2;
1291             levels1 /= 2;
1292         }
1293
1294         // ensure that pyramid has reqired padding
1295         if(levels1 > 0)
1296         {
1297             Size fullSize;
1298             Point ofs;
1299             prevPyr[lvlStep1].locateROI(fullSize, ofs);
1300             CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
1301                 && ofs.x + prevPyr[lvlStep1].cols + winSize.width <= fullSize.width
1302                 && ofs.y + prevPyr[lvlStep1].rows + winSize.height <= fullSize.height);
1303         }
1304
1305         if(levels1 < maxLevel)
1306             maxLevel = levels1;
1307     }
1308
1309     if(_nextImg.kind() == _InputArray::STD_VECTOR_MAT)
1310     {
1311         _nextImg.getMatVector(nextPyr);
1312
1313         levels2 = int(nextPyr.size()) - 1;
1314         CV_Assert(levels2 >= 0);
1315
1316         if (levels2 % 2 == 1 && nextPyr[0].channels() * 2 == nextPyr[1].channels() && nextPyr[1].depth() == derivDepth)
1317         {
1318             lvlStep2 = 2;
1319             levels2 /= 2;
1320         }
1321
1322         // ensure that pyramid has reqired padding
1323         if(levels2 > 0)
1324         {
1325             Size fullSize;
1326             Point ofs;
1327             nextPyr[lvlStep2].locateROI(fullSize, ofs);
1328             CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
1329                 && ofs.x + nextPyr[lvlStep2].cols + winSize.width <= fullSize.width
1330                 && ofs.y + nextPyr[lvlStep2].rows + winSize.height <= fullSize.height);
1331         }
1332
1333         if(levels2 < maxLevel)
1334             maxLevel = levels2;
1335     }
1336
1337     if (levels1 < 0)
1338         maxLevel = buildOpticalFlowPyramid(_prevImg, prevPyr, winSize, maxLevel, false);
1339
1340     if (levels2 < 0)
1341         maxLevel = buildOpticalFlowPyramid(_nextImg, nextPyr, winSize, maxLevel, false);
1342
1343     if( (criteria.type & TermCriteria::COUNT) == 0 )
1344         criteria.maxCount = 30;
1345     else
1346         criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
1347     if( (criteria.type & TermCriteria::EPS) == 0 )
1348         criteria.epsilon = 0.01;
1349     else
1350         criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
1351     criteria.epsilon *= criteria.epsilon;
1352
1353     // dI/dx ~ Ix, dI/dy ~ Iy
1354     Mat derivIBuf;
1355     if(lvlStep1 == 1)
1356         derivIBuf.create(prevPyr[0].rows + winSize.height*2, prevPyr[0].cols + winSize.width*2, CV_MAKETYPE(derivDepth, prevPyr[0].channels() * 2));
1357
1358     for( level = maxLevel; level >= 0; level-- )
1359     {
1360         Mat derivI;
1361         if(lvlStep1 == 1)
1362         {
1363             Size imgSize = prevPyr[level * lvlStep1].size();
1364             Mat _derivI( imgSize.height + winSize.height*2,
1365                 imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() );
1366             derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));
1367             calcSharrDeriv(prevPyr[level * lvlStep1], derivI);
1368             copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED);
1369         }
1370         else
1371             derivI = prevPyr[level * lvlStep1 + 1];
1372
1373         CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
1374         CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());
1375
1376         typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
1377         parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
1378                                                           nextPyr[level * lvlStep2], prevPts, nextPts,
1379                                                           status, err,
1380                                                           winSize, criteria, level, maxLevel,
1381                                                           flags, (float)minEigThreshold));
1382     }
1383 }
1384
1385 } // namespace
1386 } // namespace cv
1387 cv::Ptr<cv::SparsePyrLKOpticalFlow> cv::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, TermCriteria crit, int flags, double minEigThreshold){
1388     return makePtr<SparsePyrLKOpticalFlowImpl>(winSize,maxLevel,crit,flags,minEigThreshold);
1389 }
1390 void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
1391                                InputArray _prevPts, InputOutputArray _nextPts,
1392                                OutputArray _status, OutputArray _err,
1393                                Size winSize, int maxLevel,
1394                                TermCriteria criteria,
1395                                int flags, double minEigThreshold )
1396 {
1397     Ptr<cv::SparsePyrLKOpticalFlow> optflow = cv::SparsePyrLKOpticalFlow::create(winSize,maxLevel,criteria,flags,minEigThreshold);
1398     optflow->calc(_prevImg,_nextImg,_prevPts,_nextPts,_status,_err);
1399 }
1400
1401 namespace cv
1402 {
1403
1404 static void
1405 getRTMatrix( const Point2f* a, const Point2f* b,
1406              int count, Mat& M, bool fullAffine )
1407 {
1408     CV_Assert( M.isContinuous() );
1409
1410     if( fullAffine )
1411     {
1412         double sa[6][6]={{0.}}, sb[6]={0.};
1413         Mat A( 6, 6, CV_64F, &sa[0][0] ), B( 6, 1, CV_64F, sb );
1414         Mat MM = M.reshape(1, 6);
1415
1416         for( int i = 0; i < count; i++ )
1417         {
1418             sa[0][0] += a[i].x*a[i].x;
1419             sa[0][1] += a[i].y*a[i].x;
1420             sa[0][2] += a[i].x;
1421
1422             sa[1][1] += a[i].y*a[i].y;
1423             sa[1][2] += a[i].y;
1424
1425             sb[0] += a[i].x*b[i].x;
1426             sb[1] += a[i].y*b[i].x;
1427             sb[2] += b[i].x;
1428             sb[3] += a[i].x*b[i].y;
1429             sb[4] += a[i].y*b[i].y;
1430             sb[5] += b[i].y;
1431         }
1432
1433         sa[3][4] = sa[4][3] = sa[1][0] = sa[0][1];
1434         sa[3][5] = sa[5][3] = sa[2][0] = sa[0][2];
1435         sa[4][5] = sa[5][4] = sa[2][1] = sa[1][2];
1436
1437         sa[3][3] = sa[0][0];
1438         sa[4][4] = sa[1][1];
1439         sa[5][5] = sa[2][2] = count;
1440
1441         solve( A, B, MM, DECOMP_EIG );
1442     }
1443     else
1444     {
1445         double sa[4][4]={{0.}}, sb[4]={0.}, m[4] = {0};
1446         Mat A( 4, 4, CV_64F, sa ), B( 4, 1, CV_64F, sb );
1447         Mat MM( 4, 1, CV_64F, m );
1448
1449         for( int i = 0; i < count; i++ )
1450         {
1451             sa[0][0] += a[i].x*a[i].x + a[i].y*a[i].y;
1452             sa[0][2] += a[i].x;
1453             sa[0][3] += a[i].y;
1454
1455             sb[0] += a[i].x*b[i].x + a[i].y*b[i].y;
1456             sb[1] += a[i].x*b[i].y - a[i].y*b[i].x;
1457             sb[2] += b[i].x;
1458             sb[3] += b[i].y;
1459         }
1460
1461         sa[1][1] = sa[0][0];
1462         sa[2][1] = sa[1][2] = -sa[0][3];
1463         sa[3][1] = sa[1][3] = sa[2][0] = sa[0][2];
1464         sa[2][2] = sa[3][3] = count;
1465         sa[3][0] = sa[0][3];
1466
1467         solve( A, B, MM, DECOMP_EIG );
1468
1469         double* om = M.ptr<double>();
1470         om[0] = om[4] = m[0];
1471         om[1] = -m[1];
1472         om[3] = m[1];
1473         om[2] = m[2];
1474         om[5] = m[3];
1475     }
1476 }
1477
1478 }
1479
1480 cv::Mat cv::estimateRigidTransform( InputArray src1, InputArray src2, bool fullAffine )
1481 {
1482     CV_INSTRUMENT_REGION()
1483
1484     Mat M(2, 3, CV_64F), A = src1.getMat(), B = src2.getMat();
1485
1486     const int COUNT = 15;
1487     const int WIDTH = 160, HEIGHT = 120;
1488     const int RANSAC_MAX_ITERS = 500;
1489     const int RANSAC_SIZE0 = 3;
1490     const double RANSAC_GOOD_RATIO = 0.5;
1491
1492     std::vector<Point2f> pA, pB;
1493     std::vector<int> good_idx;
1494     std::vector<uchar> status;
1495
1496     double scale = 1.;
1497     int i, j, k, k1;
1498
1499     RNG rng((uint64)-1);
1500     int good_count = 0;
1501
1502     if( A.size() != B.size() )
1503         CV_Error( Error::StsUnmatchedSizes, "Both input images must have the same size" );
1504
1505     if( A.type() != B.type() )
1506         CV_Error( Error::StsUnmatchedFormats, "Both input images must have the same data type" );
1507
1508     int count = A.checkVector(2);
1509
1510     if( count > 0 )
1511     {
1512         A.reshape(2, count).convertTo(pA, CV_32F);
1513         B.reshape(2, count).convertTo(pB, CV_32F);
1514     }
1515     else if( A.depth() == CV_8U )
1516     {
1517         int cn = A.channels();
1518         CV_Assert( cn == 1 || cn == 3 || cn == 4 );
1519         Size sz0 = A.size();
1520         Size sz1(WIDTH, HEIGHT);
1521
1522         scale = std::max(1., std::max( (double)sz1.width/sz0.width, (double)sz1.height/sz0.height ));
1523
1524         sz1.width = cvRound( sz0.width * scale );
1525         sz1.height = cvRound( sz0.height * scale );
1526
1527         bool equalSizes = sz1.width == sz0.width && sz1.height == sz0.height;
1528
1529         if( !equalSizes || cn != 1 )
1530         {
1531             Mat sA, sB;
1532
1533             if( cn != 1 )
1534             {
1535                 Mat gray;
1536                 cvtColor(A, gray, COLOR_BGR2GRAY);
1537                 resize(gray, sA, sz1, 0., 0., INTER_AREA);
1538                 cvtColor(B, gray, COLOR_BGR2GRAY);
1539                 resize(gray, sB, sz1, 0., 0., INTER_AREA);
1540             }
1541             else
1542             {
1543                 resize(A, sA, sz1, 0., 0., INTER_AREA);
1544                 resize(B, sB, sz1, 0., 0., INTER_AREA);
1545             }
1546
1547             A = sA;
1548             B = sB;
1549         }
1550
1551         int count_y = COUNT;
1552         int count_x = cvRound((double)COUNT*sz1.width/sz1.height);
1553         count = count_x * count_y;
1554
1555         pA.resize(count);
1556         pB.resize(count);
1557         status.resize(count);
1558
1559         for( i = 0, k = 0; i < count_y; i++ )
1560             for( j = 0; j < count_x; j++, k++ )
1561             {
1562                 pA[k].x = (j+0.5f)*sz1.width/count_x;
1563                 pA[k].y = (i+0.5f)*sz1.height/count_y;
1564             }
1565
1566         // find the corresponding points in B
1567         calcOpticalFlowPyrLK(A, B, pA, pB, status, noArray(), Size(21, 21), 3,
1568                              TermCriteria(TermCriteria::MAX_ITER,40,0.1));
1569
1570         // repack the remained points
1571         for( i = 0, k = 0; i < count; i++ )
1572             if( status[i] )
1573             {
1574                 if( i > k )
1575                 {
1576                     pA[k] = pA[i];
1577                     pB[k] = pB[i];
1578                 }
1579                 k++;
1580             }
1581         count = k;
1582         pA.resize(count);
1583         pB.resize(count);
1584     }
1585     else
1586         CV_Error( Error::StsUnsupportedFormat, "Both input images must have either 8uC1 or 8uC3 type" );
1587
1588     good_idx.resize(count);
1589
1590     if( count < RANSAC_SIZE0 )
1591         return Mat();
1592
1593     Rect brect = boundingRect(pB);
1594
1595     // RANSAC stuff:
1596     // 1. find the consensus
1597     for( k = 0; k < RANSAC_MAX_ITERS; k++ )
1598     {
1599         int idx[RANSAC_SIZE0];
1600         Point2f a[RANSAC_SIZE0];
1601         Point2f b[RANSAC_SIZE0];
1602
1603         // choose random 3 non-coplanar points from A & B
1604         for( i = 0; i < RANSAC_SIZE0; i++ )
1605         {
1606             for( k1 = 0; k1 < RANSAC_MAX_ITERS; k1++ )
1607             {
1608                 idx[i] = rng.uniform(0, count);
1609
1610                 for( j = 0; j < i; j++ )
1611                 {
1612                     if( idx[j] == idx[i] )
1613                         break;
1614                     // check that the points are not very close one each other
1615                     if( fabs(pA[idx[i]].x - pA[idx[j]].x) +
1616                         fabs(pA[idx[i]].y - pA[idx[j]].y) < FLT_EPSILON )
1617                         break;
1618                     if( fabs(pB[idx[i]].x - pB[idx[j]].x) +
1619                         fabs(pB[idx[i]].y - pB[idx[j]].y) < FLT_EPSILON )
1620                         break;
1621                 }
1622
1623                 if( j < i )
1624                     continue;
1625
1626                 if( i+1 == RANSAC_SIZE0 )
1627                 {
1628                     // additional check for non-complanar vectors
1629                     a[0] = pA[idx[0]];
1630                     a[1] = pA[idx[1]];
1631                     a[2] = pA[idx[2]];
1632
1633                     b[0] = pB[idx[0]];
1634                     b[1] = pB[idx[1]];
1635                     b[2] = pB[idx[2]];
1636
1637                     double dax1 = a[1].x - a[0].x, day1 = a[1].y - a[0].y;
1638                     double dax2 = a[2].x - a[0].x, day2 = a[2].y - a[0].y;
1639                     double dbx1 = b[1].x - b[0].x, dby1 = b[1].y - b[0].y;
1640                     double dbx2 = b[2].x - b[0].x, dby2 = b[2].y - b[0].y;
1641                     const double eps = 0.01;
1642
1643                     if( fabs(dax1*day2 - day1*dax2) < eps*std::sqrt(dax1*dax1+day1*day1)*std::sqrt(dax2*dax2+day2*day2) ||
1644                         fabs(dbx1*dby2 - dby1*dbx2) < eps*std::sqrt(dbx1*dbx1+dby1*dby1)*std::sqrt(dbx2*dbx2+dby2*dby2) )
1645                         continue;
1646                 }
1647                 break;
1648             }
1649
1650             if( k1 >= RANSAC_MAX_ITERS )
1651                 break;
1652         }
1653
1654         if( i < RANSAC_SIZE0 )
1655             continue;
1656
1657         // estimate the transformation using 3 points
1658         getRTMatrix( a, b, 3, M, fullAffine );
1659
1660         const double* m = M.ptr<double>();
1661         for( i = 0, good_count = 0; i < count; i++ )
1662         {
1663             if( std::abs( m[0]*pA[i].x + m[1]*pA[i].y + m[2] - pB[i].x ) +
1664                 std::abs( m[3]*pA[i].x + m[4]*pA[i].y + m[5] - pB[i].y ) < std::max(brect.width,brect.height)*0.05 )
1665                 good_idx[good_count++] = i;
1666         }
1667
1668         if( good_count >= count*RANSAC_GOOD_RATIO )
1669             break;
1670     }
1671
1672     if( k >= RANSAC_MAX_ITERS )
1673         return Mat();
1674
1675     if( good_count < count )
1676     {
1677         for( i = 0; i < good_count; i++ )
1678         {
1679             j = good_idx[i];
1680             pA[i] = pA[j];
1681             pB[i] = pB[j];
1682         }
1683     }
1684
1685     getRTMatrix( &pA[0], &pB[0], good_count, M, fullAffine );
1686     M.at<double>(0, 2) /= scale;
1687     M.at<double>(1, 2) /= scale;
1688
1689     return M;
1690 }
1691
1692 /* End of file. */