next: drop HAVE_TEGRA_OPTIMIZATION/TADP
[platform/upstream/opencv.git] / modules / video / src / lkpyramid.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 #include "precomp.hpp"
43 #include <float.h>
44 #include <stdio.h>
45 #include "lkpyramid.hpp"
46 #include "opencl_kernels_video.hpp"
47 #include "opencv2/core/hal/intrin.hpp"
48
49 #include "opencv2/core/openvx/ovx_defs.hpp"
50
51 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
52
53 namespace
54 {
55 static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
56 {
57     using namespace cv;
58     using cv::detail::deriv_type;
59     int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
60     CV_Assert(depth == CV_8U);
61     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
62
63     int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
64     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
65     deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
66
67 #if CV_SIMD128
68     v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
69     bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
70 #endif
71
72     for( y = 0; y < rows; y++ )
73     {
74         const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
75         const uchar* srow1 = src.ptr<uchar>(y);
76         const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
77         deriv_type* drow = dst.ptr<deriv_type>(y);
78
79         // do vertical convolution
80         x = 0;
81 #if CV_SIMD128
82         if(haveSIMD)
83         {
84             for( ; x <= colsn - 8; x += 8 )
85             {
86                 v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
87                 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
88                 v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
89
90                 v_int16x8 t1 = s2 - s0;
91                 v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
92
93                 v_store(trow0 + x, t0);
94                 v_store(trow1 + x, t1);
95             }
96         }
97 #endif
98
99         for( ; x < colsn; x++ )
100         {
101             int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
102             int t1 = srow2[x] - srow0[x];
103             trow0[x] = (deriv_type)t0;
104             trow1[x] = (deriv_type)t1;
105         }
106
107         // make border
108         int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
109         for( int k = 0; k < cn; k++ )
110         {
111             trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
112             trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
113         }
114
115         // do horizontal convolution, interleave the results and store them to dst
116         x = 0;
117 #if CV_SIMD128
118         if(haveSIMD)
119         {
120             for( ; x <= colsn - 8; x += 8 )
121             {
122                 v_int16x8 s0 = v_load(trow0 + x - cn);
123                 v_int16x8 s1 = v_load(trow0 + x + cn);
124                 v_int16x8 s2 = v_load(trow1 + x - cn);
125                 v_int16x8 s3 = v_load(trow1 + x);
126                 v_int16x8 s4 = v_load(trow1 + x + cn);
127
128                 v_int16x8 t0 = s1 - s0;
129                 v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
130
131                 v_store_interleave((drow + x*2), t0, t1);
132             }
133         }
134 #endif
135         for( ; x < colsn; x++ )
136         {
137             deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
138             deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
139             drow[x*2] = t0; drow[x*2+1] = t1;
140         }
141     }
142 }
143
144 }//namespace
145
146 cv::detail::LKTrackerInvoker::LKTrackerInvoker(
147                       const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
148                       const Point2f* _prevPts, Point2f* _nextPts,
149                       uchar* _status, float* _err,
150                       Size _winSize, TermCriteria _criteria,
151                       int _level, int _maxLevel, int _flags, float _minEigThreshold )
152 {
153     prevImg = &_prevImg;
154     prevDeriv = &_prevDeriv;
155     nextImg = &_nextImg;
156     prevPts = _prevPts;
157     nextPts = _nextPts;
158     status = _status;
159     err = _err;
160     winSize = _winSize;
161     criteria = _criteria;
162     level = _level;
163     maxLevel = _maxLevel;
164     flags = _flags;
165     minEigThreshold = _minEigThreshold;
166 }
167
168 #if defined __arm__ && !CV_NEON
169 typedef int64 acctype;
170 typedef int itemtype;
171 #else
172 typedef float acctype;
173 typedef float itemtype;
174 #endif
175
176 void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
177 {
178     CV_INSTRUMENT_REGION()
179
180     Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
181     const Mat& I = *prevImg;
182     const Mat& J = *nextImg;
183     const Mat& derivI = *prevDeriv;
184
185     int j, cn = I.channels(), cn2 = cn*2;
186     cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
187     int derivDepth = DataType<deriv_type>::depth;
188
189     Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
190     Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
191
192     for( int ptidx = range.start; ptidx < range.end; ptidx++ )
193     {
194         Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
195         Point2f nextPt;
196         if( level == maxLevel )
197         {
198             if( flags & OPTFLOW_USE_INITIAL_FLOW )
199                 nextPt = nextPts[ptidx]*(float)(1./(1 << level));
200             else
201                 nextPt = prevPt;
202         }
203         else
204             nextPt = nextPts[ptidx]*2.f;
205         nextPts[ptidx] = nextPt;
206
207         Point2i iprevPt, inextPt;
208         prevPt -= halfWin;
209         iprevPt.x = cvFloor(prevPt.x);
210         iprevPt.y = cvFloor(prevPt.y);
211
212         if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
213             iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
214         {
215             if( level == 0 )
216             {
217                 if( status )
218                     status[ptidx] = false;
219                 if( err )
220                     err[ptidx] = 0;
221             }
222             continue;
223         }
224
225         float a = prevPt.x - iprevPt.x;
226         float b = prevPt.y - iprevPt.y;
227         const int W_BITS = 14, W_BITS1 = 14;
228         const float FLT_SCALE = 1.f/(1 << 20);
229         int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
230         int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
231         int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
232         int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
233
234         int dstep = (int)(derivI.step/derivI.elemSize1());
235         int stepI = (int)(I.step/I.elemSize1());
236         int stepJ = (int)(J.step/J.elemSize1());
237         acctype iA11 = 0, iA12 = 0, iA22 = 0;
238         float A11, A12, A22;
239
240 #if CV_SSE2
241         __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
242         __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
243         __m128i z = _mm_setzero_si128();
244         __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
245         __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
246         __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
247 #endif
248
249 #if CV_NEON
250
251         float CV_DECL_ALIGNED(16) nA11[] = { 0, 0, 0, 0 }, nA12[] = { 0, 0, 0, 0 }, nA22[] = { 0, 0, 0, 0 };
252         const int shifter1 = -(W_BITS - 5); //negative so it shifts right
253         const int shifter2 = -(W_BITS);
254
255         const int16x4_t d26 = vdup_n_s16((int16_t)iw00);
256         const int16x4_t d27 = vdup_n_s16((int16_t)iw01);
257         const int16x4_t d28 = vdup_n_s16((int16_t)iw10);
258         const int16x4_t d29 = vdup_n_s16((int16_t)iw11);
259         const int32x4_t q11 = vdupq_n_s32((int32_t)shifter1);
260         const int32x4_t q12 = vdupq_n_s32((int32_t)shifter2);
261
262 #endif
263
264         // extract the patch from the first image, compute covariation matrix of derivatives
265         int x, y;
266         for( y = 0; y < winSize.height; y++ )
267         {
268             const uchar* src = I.ptr() + (y + iprevPt.y)*stepI + iprevPt.x*cn;
269             const deriv_type* dsrc = derivI.ptr<deriv_type>() + (y + iprevPt.y)*dstep + iprevPt.x*cn2;
270
271             deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
272             deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);
273
274             x = 0;
275
276 #if CV_SSE2
277             for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
278             {
279                 __m128i v00, v01, v10, v11, t0, t1;
280
281                 v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
282                 v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
283                 v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
284                 v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
285
286                 t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
287                                    _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
288                 t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
289                 _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
290
291                 v00 = _mm_loadu_si128((const __m128i*)(dsrc));
292                 v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
293                 v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
294                 v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
295
296                 t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
297                                    _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
298                 t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
299                                    _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
300                 t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
301                 t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
302                 v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
303
304                 _mm_storeu_si128((__m128i*)dIptr, v00);
305                 t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
306                 t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
307
308                 __m128 fy = _mm_cvtepi32_ps(t0);
309                 __m128 fx = _mm_cvtepi32_ps(t1);
310
311                 qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
312                 qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
313                 qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
314             }
315 #endif
316
317 #if CV_NEON
318             for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
319             {
320
321                 uint8x8_t d0 = vld1_u8(&src[x]);
322                 uint8x8_t d2 = vld1_u8(&src[x+cn]);
323                 uint16x8_t q0 = vmovl_u8(d0);
324                 uint16x8_t q1 = vmovl_u8(d2);
325
326                 int32x4_t q5 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26);
327                 int32x4_t q6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27);
328
329                 uint8x8_t d4 = vld1_u8(&src[x + stepI]);
330                 uint8x8_t d6 = vld1_u8(&src[x + stepI + cn]);
331                 uint16x8_t q2 = vmovl_u8(d4);
332                 uint16x8_t q3 = vmovl_u8(d6);
333
334                 int32x4_t q7 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28);
335                 int32x4_t q8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29);
336
337                 q5 = vaddq_s32(q5, q6);
338                 q7 = vaddq_s32(q7, q8);
339                 q5 = vaddq_s32(q5, q7);
340
341                 int16x4x2_t d0d1 = vld2_s16(dsrc);
342                 int16x4x2_t d2d3 = vld2_s16(&dsrc[cn2]);
343
344                 q5 = vqrshlq_s32(q5, q11);
345
346                 int32x4_t q4 = vmull_s16(d0d1.val[0], d26);
347                 q6 = vmull_s16(d0d1.val[1], d26);
348
349                 int16x4_t nd0 = vmovn_s32(q5);
350
351                 q7 = vmull_s16(d2d3.val[0], d27);
352                 q8 = vmull_s16(d2d3.val[1], d27);
353
354                 vst1_s16(&Iptr[x], nd0);
355
356                 int16x4x2_t d4d5 = vld2_s16(&dsrc[dstep]);
357                 int16x4x2_t d6d7 = vld2_s16(&dsrc[dstep+cn2]);
358
359                 q4 = vaddq_s32(q4, q7);
360                 q6 = vaddq_s32(q6, q8);
361
362                 q7 = vmull_s16(d4d5.val[0], d28);
363                 int32x4_t q14 = vmull_s16(d4d5.val[1], d28);
364                 q8 = vmull_s16(d6d7.val[0], d29);
365                 int32x4_t q15 = vmull_s16(d6d7.val[1], d29);
366
367                 q7 = vaddq_s32(q7, q8);
368                 q14 = vaddq_s32(q14, q15);
369
370                 q4 = vaddq_s32(q4, q7);
371                 q6 = vaddq_s32(q6, q14);
372
373                 float32x4_t nq0 = vld1q_f32(nA11);
374                 float32x4_t nq1 = vld1q_f32(nA12);
375                 float32x4_t nq2 = vld1q_f32(nA22);
376
377                 q4 = vqrshlq_s32(q4, q12);
378                 q6 = vqrshlq_s32(q6, q12);
379
380                 q7 = vmulq_s32(q4, q4);
381                 q8 = vmulq_s32(q4, q6);
382                 q15 = vmulq_s32(q6, q6);
383
384                 nq0 = vaddq_f32(nq0, vcvtq_f32_s32(q7));
385                 nq1 = vaddq_f32(nq1, vcvtq_f32_s32(q8));
386                 nq2 = vaddq_f32(nq2, vcvtq_f32_s32(q15));
387
388                 vst1q_f32(nA11, nq0);
389                 vst1q_f32(nA12, nq1);
390                 vst1q_f32(nA22, nq2);
391
392                 int16x4_t d8 = vmovn_s32(q4);
393                 int16x4_t d12 = vmovn_s32(q6);
394
395                 int16x4x2_t d8d12;
396                 d8d12.val[0] = d8; d8d12.val[1] = d12;
397                 vst2_s16(dIptr, d8d12);
398             }
399 #endif
400
401             for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
402             {
403                 int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
404                                       src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
405                 int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
406                                        dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
407                 int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
408                                        dsrc[dstep+cn2+1]*iw11, W_BITS1);
409
410                 Iptr[x] = (short)ival;
411                 dIptr[0] = (short)ixval;
412                 dIptr[1] = (short)iyval;
413
414                 iA11 += (itemtype)(ixval*ixval);
415                 iA12 += (itemtype)(ixval*iyval);
416                 iA22 += (itemtype)(iyval*iyval);
417             }
418         }
419
420 #if CV_SSE2
421         float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
422         _mm_store_ps(A11buf, qA11);
423         _mm_store_ps(A12buf, qA12);
424         _mm_store_ps(A22buf, qA22);
425         iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
426         iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
427         iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
428 #endif
429
430 #if CV_NEON
431         iA11 += nA11[0] + nA11[1] + nA11[2] + nA11[3];
432         iA12 += nA12[0] + nA12[1] + nA12[2] + nA12[3];
433         iA22 += nA22[0] + nA22[1] + nA22[2] + nA22[3];
434 #endif
435
436         A11 = iA11*FLT_SCALE;
437         A12 = iA12*FLT_SCALE;
438         A22 = iA22*FLT_SCALE;
439
440         float D = A11*A22 - A12*A12;
441         float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
442                         4.f*A12*A12))/(2*winSize.width*winSize.height);
443
444         if( err && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0 )
445             err[ptidx] = (float)minEig;
446
447         if( minEig < minEigThreshold || D < FLT_EPSILON )
448         {
449             if( level == 0 && status )
450                 status[ptidx] = false;
451             continue;
452         }
453
454         D = 1.f/D;
455
456         nextPt -= halfWin;
457         Point2f prevDelta;
458
459         for( j = 0; j < criteria.maxCount; j++ )
460         {
461             inextPt.x = cvFloor(nextPt.x);
462             inextPt.y = cvFloor(nextPt.y);
463
464             if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
465                inextPt.y < -winSize.height || inextPt.y >= J.rows )
466             {
467                 if( level == 0 && status )
468                     status[ptidx] = false;
469                 break;
470             }
471
472             a = nextPt.x - inextPt.x;
473             b = nextPt.y - inextPt.y;
474             iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
475             iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
476             iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
477             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
478             acctype ib1 = 0, ib2 = 0;
479             float b1, b2;
480 #if CV_SSE2
481             qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
482             qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
483             __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
484 #endif
485
486 #if CV_NEON
487             float CV_DECL_ALIGNED(16) nB1[] = { 0,0,0,0 }, nB2[] = { 0,0,0,0 };
488
489             const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
490             const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
491             const int16x4_t d28_2 = vdup_n_s16((int16_t)iw10);
492             const int16x4_t d29_2 = vdup_n_s16((int16_t)iw11);
493
494 #endif
495
496             for( y = 0; y < winSize.height; y++ )
497             {
498                 const uchar* Jptr = J.ptr() + (y + inextPt.y)*stepJ + inextPt.x*cn;
499                 const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
500                 const deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);
501
502                 x = 0;
503
504 #if CV_SSE2
505                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
506                 {
507                     __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
508                     __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
509                     __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
510                     __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
511                     __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
512
513                     __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
514                                                _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
515                     __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
516                                                _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
517                     t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
518                     t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
519                     diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
520                     diff1 = _mm_unpackhi_epi16(diff0, diff0);
521                     diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
522                     v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
523                     v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
524                     v10 = _mm_unpacklo_epi16(v00, v01);
525                     v11 = _mm_unpackhi_epi16(v00, v01);
526                     v00 = _mm_unpacklo_epi16(diff0, diff1);
527                     v01 = _mm_unpackhi_epi16(diff0, diff1);
528                     v00 = _mm_madd_epi16(v00, v10);
529                     v11 = _mm_madd_epi16(v01, v11);
530                     qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
531                     qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
532                 }
533 #endif
534
535 #if CV_NEON
536                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
537                 {
538
539                     uint8x8_t d0 = vld1_u8(&Jptr[x]);
540                     uint8x8_t d2 = vld1_u8(&Jptr[x+cn]);
541                     uint8x8_t d4 = vld1_u8(&Jptr[x+stepJ]);
542                     uint8x8_t d6 = vld1_u8(&Jptr[x+stepJ+cn]);
543
544                     uint16x8_t q0 = vmovl_u8(d0);
545                     uint16x8_t q1 = vmovl_u8(d2);
546                     uint16x8_t q2 = vmovl_u8(d4);
547                     uint16x8_t q3 = vmovl_u8(d6);
548
549                     int32x4_t nq4 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26_2);
550                     int32x4_t nq5 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q0)), d26_2);
551
552                     int32x4_t nq6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27_2);
553                     int32x4_t nq7 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q1)), d27_2);
554
555                     int32x4_t nq8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28_2);
556                     int32x4_t nq9 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q2)), d28_2);
557
558                     int32x4_t nq10 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29_2);
559                     int32x4_t nq11 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q3)), d29_2);
560
561                     nq4 = vaddq_s32(nq4, nq6);
562                     nq5 = vaddq_s32(nq5, nq7);
563                     nq8 = vaddq_s32(nq8, nq10);
564                     nq9 = vaddq_s32(nq9, nq11);
565
566                     int16x8_t q6 = vld1q_s16(&Iptr[x]);
567
568                     nq4 = vaddq_s32(nq4, nq8);
569                     nq5 = vaddq_s32(nq5, nq9);
570
571                     nq8 = vmovl_s16(vget_high_s16(q6));
572                     nq6 = vmovl_s16(vget_low_s16(q6));
573
574                     nq4 = vqrshlq_s32(nq4, q11);
575                     nq5 = vqrshlq_s32(nq5, q11);
576
577                     int16x8x2_t q0q1 = vld2q_s16(dIptr);
578                     float32x4_t nB1v = vld1q_f32(nB1);
579                     float32x4_t nB2v = vld1q_f32(nB2);
580
581                     nq4 = vsubq_s32(nq4, nq6);
582                     nq5 = vsubq_s32(nq5, nq8);
583
584                     int32x4_t nq2 = vmovl_s16(vget_low_s16(q0q1.val[0]));
585                     int32x4_t nq3 = vmovl_s16(vget_high_s16(q0q1.val[0]));
586
587                     nq7 = vmovl_s16(vget_low_s16(q0q1.val[1]));
588                     nq8 = vmovl_s16(vget_high_s16(q0q1.val[1]));
589
590                     nq9 = vmulq_s32(nq4, nq2);
591                     nq10 = vmulq_s32(nq5, nq3);
592
593                     nq4 = vmulq_s32(nq4, nq7);
594                     nq5 = vmulq_s32(nq5, nq8);
595
596                     nq9 = vaddq_s32(nq9, nq10);
597                     nq4 = vaddq_s32(nq4, nq5);
598
599                     nB1v = vaddq_f32(nB1v, vcvtq_f32_s32(nq9));
600                     nB2v = vaddq_f32(nB2v, vcvtq_f32_s32(nq4));
601
602                     vst1q_f32(nB1, nB1v);
603                     vst1q_f32(nB2, nB2v);
604                 }
605 #endif
606
607                 for( ; x < winSize.width*cn; x++, dIptr += 2 )
608                 {
609                     int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
610                                           Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
611                                           W_BITS1-5) - Iptr[x];
612                     ib1 += (itemtype)(diff*dIptr[0]);
613                     ib2 += (itemtype)(diff*dIptr[1]);
614                 }
615             }
616
617 #if CV_SSE2
618             float CV_DECL_ALIGNED(16) bbuf[4];
619             _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
620             ib1 += bbuf[0] + bbuf[2];
621             ib2 += bbuf[1] + bbuf[3];
622 #endif
623
624 #if CV_NEON
625
626             ib1 += (float)(nB1[0] + nB1[1] + nB1[2] + nB1[3]);
627             ib2 += (float)(nB2[0] + nB2[1] + nB2[2] + nB2[3]);
628 #endif
629
630             b1 = ib1*FLT_SCALE;
631             b2 = ib2*FLT_SCALE;
632
633             Point2f delta( (float)((A12*b2 - A22*b1) * D),
634                           (float)((A12*b1 - A11*b2) * D));
635             //delta = -delta;
636
637             nextPt += delta;
638             nextPts[ptidx] = nextPt + halfWin;
639
640             if( delta.ddot(delta) <= criteria.epsilon )
641                 break;
642
643             if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
644                std::abs(delta.y + prevDelta.y) < 0.01 )
645             {
646                 nextPts[ptidx] -= delta*0.5f;
647                 break;
648             }
649             prevDelta = delta;
650         }
651
652         CV_Assert(status != NULL);
653         if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 )
654         {
655             Point2f nextPoint = nextPts[ptidx] - halfWin;
656             Point inextPoint;
657
658             inextPoint.x = cvFloor(nextPoint.x);
659             inextPoint.y = cvFloor(nextPoint.y);
660
661             if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
662                 inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
663             {
664                 if( status )
665                     status[ptidx] = false;
666                 continue;
667             }
668
669             float aa = nextPoint.x - inextPoint.x;
670             float bb = nextPoint.y - inextPoint.y;
671             iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
672             iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
673             iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
674             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
675             float errval = 0.f;
676
677             for( y = 0; y < winSize.height; y++ )
678             {
679                 const uchar* Jptr = J.ptr() + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
680                 const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
681
682                 for( x = 0; x < winSize.width*cn; x++ )
683                 {
684                     int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
685                                           Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
686                                           W_BITS1-5) - Iptr[x];
687                     errval += std::abs((float)diff);
688                 }
689             }
690             err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
691         }
692     }
693 }
694
695 int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives,
696                                 int pyrBorder, int derivBorder, bool tryReuseInputImage)
697 {
698     CV_INSTRUMENT_REGION()
699
700     Mat img = _img.getMat();
701     CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2 );
702     int pyrstep = withDerivatives ? 2 : 1;
703
704     pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0);
705
706     int derivType = CV_MAKETYPE(DataType<cv::detail::deriv_type>::depth, img.channels() * 2);
707
708     //level 0
709     bool lvl0IsSet = false;
710     if(tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0)
711     {
712         Size wholeSize;
713         Point ofs;
714         img.locateROI(wholeSize, ofs);
715         if (ofs.x >= winSize.width && ofs.y >= winSize.height
716               && ofs.x + img.cols + winSize.width <= wholeSize.width
717               && ofs.y + img.rows + winSize.height <= wholeSize.height)
718         {
719             pyramid.getMatRef(0) = img;
720             lvl0IsSet = true;
721         }
722     }
723
724     if(!lvl0IsSet)
725     {
726         Mat& temp = pyramid.getMatRef(0);
727
728         if(!temp.empty())
729             temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
730         if(temp.type() != img.type() || temp.cols != winSize.width*2 + img.cols || temp.rows != winSize.height * 2 + img.rows)
731             temp.create(img.rows + winSize.height*2, img.cols + winSize.width*2, img.type());
732
733         if(pyrBorder == BORDER_TRANSPARENT)
734             img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows)));
735         else
736             copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder);
737         temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
738     }
739
740     Size sz = img.size();
741     Mat prevLevel = pyramid.getMatRef(0);
742     Mat thisLevel = prevLevel;
743
744     for(int level = 0; level <= maxLevel; ++level)
745     {
746         if (level != 0)
747         {
748             Mat& temp = pyramid.getMatRef(level * pyrstep);
749
750             if(!temp.empty())
751                 temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
752             if(temp.type() != img.type() || temp.cols != winSize.width*2 + sz.width || temp.rows != winSize.height * 2 + sz.height)
753                 temp.create(sz.height + winSize.height*2, sz.width + winSize.width*2, img.type());
754
755             thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height));
756             pyrDown(prevLevel, thisLevel, sz);
757
758             if(pyrBorder != BORDER_TRANSPARENT)
759                 copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder|BORDER_ISOLATED);
760             temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
761         }
762
763         if(withDerivatives)
764         {
765             Mat& deriv = pyramid.getMatRef(level * pyrstep + 1);
766
767             if(!deriv.empty())
768                 deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
769             if(deriv.type() != derivType || deriv.cols != winSize.width*2 + sz.width || deriv.rows != winSize.height * 2 + sz.height)
770                 deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType);
771
772             Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height));
773             calcSharrDeriv(thisLevel, derivI);
774
775             if(derivBorder != BORDER_TRANSPARENT)
776                 copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED);
777             deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
778         }
779
780         sz = Size((sz.width+1)/2, (sz.height+1)/2);
781         if( sz.width <= winSize.width || sz.height <= winSize.height )
782         {
783             pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true, 0);//check this
784             return level;
785         }
786
787         prevLevel = thisLevel;
788     }
789
790     return maxLevel;
791 }
792
793 namespace cv
794 {
795 namespace
796 {
797     class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow
798     {
799         struct dim3
800         {
801             unsigned int x, y, z;
802             dim3() : x(0), y(0), z(0) { }
803         };
804     public:
805         SparsePyrLKOpticalFlowImpl(Size winSize_ = Size(21,21),
806                          int maxLevel_ = 3,
807                          TermCriteria criteria_ = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
808                          int flags_ = 0,
809                          double minEigThreshold_ = 1e-4) :
810           winSize(winSize_), maxLevel(maxLevel_), criteria(criteria_), flags(flags_), minEigThreshold(minEigThreshold_)
811 #ifdef HAVE_OPENCL
812           , iters(criteria_.maxCount), derivLambda(criteria_.epsilon), useInitialFlow(0 != (flags_ & OPTFLOW_LK_GET_MIN_EIGENVALS)), waveSize(0)
813 #endif
814         {
815         }
816
817         virtual Size getWinSize() const CV_OVERRIDE { return winSize;}
818         virtual void setWinSize(Size winSize_) CV_OVERRIDE { winSize = winSize_;}
819
820         virtual int getMaxLevel() const CV_OVERRIDE { return maxLevel;}
821         virtual void setMaxLevel(int maxLevel_) CV_OVERRIDE { maxLevel = maxLevel_;}
822
823         virtual TermCriteria getTermCriteria() const CV_OVERRIDE { return criteria;}
824         virtual void setTermCriteria(TermCriteria& crit_) CV_OVERRIDE { criteria=crit_;}
825
826         virtual int getFlags() const CV_OVERRIDE { return flags; }
827         virtual void setFlags(int flags_) CV_OVERRIDE { flags=flags_;}
828
829         virtual double getMinEigThreshold() const CV_OVERRIDE { return minEigThreshold;}
830         virtual void setMinEigThreshold(double minEigThreshold_) CV_OVERRIDE { minEigThreshold=minEigThreshold_;}
831
832         virtual void calc(InputArray prevImg, InputArray nextImg,
833                           InputArray prevPts, InputOutputArray nextPts,
834                           OutputArray status,
835                           OutputArray err = cv::noArray()) CV_OVERRIDE;
836
837     private:
838 #ifdef HAVE_OPENCL
839         bool checkParam()
840         {
841             iters = std::min(std::max(iters, 0), 100);
842
843             derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
844             if (derivLambda < 0)
845                 return false;
846             if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
847                 return false;
848             if (winSize.width < 8 || winSize.height < 8 ||
849                 winSize.width > 24 || winSize.height > 24)
850                 return false;
851             calcPatchSize();
852             if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
853                 return false;
854             if (!initWaveSize())
855                 return false;
856             return true;
857         }
858
859         bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
860         {
861             if (!checkParam())
862                 return false;
863
864             UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
865             UMat temp2 = nextPts.reshape(1);
866             multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);
867
868             status.setTo(Scalar::all(1));
869
870             // build the image pyramids.
871             std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
872             std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
873
874             // allocate buffers with aligned pitch to be able to use cl_khr_image2d_from_buffer extension
875             // This is the required pitch alignment in pixels
876             int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment();
877             if (pitchAlign>0)
878             {
879                 prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,prevImg.cols);
880                 nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,nextImg.cols);
881                 for (int level = 1; level <= maxLevel; ++level)
882                 {
883                     int cols,rows;
884                     // allocate buffers with aligned pitch to be able to use image on buffer extension
885                     cols = (prevPyr[level - 1].cols+1)/2;
886                     rows = (prevPyr[level - 1].rows+1)/2;
887                     prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols);
888                     cols = (nextPyr[level - 1].cols+1)/2;
889                     rows = (nextPyr[level - 1].rows+1)/2;
890                     nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols);
891                 }
892             }
893
894             prevImg.convertTo(prevPyr[0], CV_32F);
895             nextImg.convertTo(nextPyr[0], CV_32F);
896
897             for (int level = 1; level <= maxLevel; ++level)
898             {
899                 pyrDown(prevPyr[level - 1], prevPyr[level]);
900                 pyrDown(nextPyr[level - 1], nextPyr[level]);
901             }
902
903             // dI/dx ~ Ix, dI/dy ~ Iy
904             for (int level = maxLevel; level >= 0; level--)
905             {
906                 if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
907                                   nextPts, status, err,
908                                   prevPts.cols, level))
909                     return false;
910             }
911             return true;
912         }
913 #endif
914
915         Size winSize;
916         int maxLevel;
917         TermCriteria criteria;
918         int flags;
919         double minEigThreshold;
920 #ifdef HAVE_OPENCL
921         int iters;
922         double derivLambda;
923         bool useInitialFlow;
924         int waveSize;
925         bool initWaveSize()
926         {
927             waveSize = 1;
928             if (isDeviceCPU())
929                 return true;
930
931             ocl::Kernel kernel;
932             if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
933                 return false;
934             waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
935             return true;
936         }
937         dim3 patch;
938         void calcPatchSize()
939         {
940             dim3 block;
941
942             if (winSize.width > 32 && winSize.width > 2 * winSize.height)
943             {
944                 block.x = 32;
945                 block.y = 8;
946             }
947             else
948             {
949                 block.x = 16;
950                 block.y = 16;
951             }
952
953             patch.x = (winSize.width  + block.x - 1) / block.x;
954             patch.y = (winSize.height + block.y - 1) / block.y;
955
956             block.z = patch.z = 1;
957         }
958
959         bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
960             int ptcount, int level)
961         {
962             size_t localThreads[3]  = { 8, 8};
963             size_t globalThreads[3] = { 8 * (size_t)ptcount, 8};
964             char calcErr = (0 == level) ? 1 : 0;
965
966             int wsx = 1, wsy = 1;
967             if(winSize.width < 16)
968                 wsx = 0;
969             if(winSize.height < 16)
970                 wsy = 0;
971             cv::String build_options;
972             if (isDeviceCPU())
973                 build_options = " -D CPU";
974             else
975                 build_options = cv::format("-D WAVE_SIZE=%d -D WSX=%d -D WSY=%d",
976                                            waveSize, wsx, wsy);
977
978             ocl::Kernel kernel;
979             if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
980                 return false;
981
982             CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F);
983             ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I));
984             ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J));
985
986             int idxArg = 0;
987             idxArg = kernel.set(idxArg, imageI); //image2d_t I
988             idxArg = kernel.set(idxArg, imageJ); //image2d_t J
989             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
990             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
991             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
992             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
993             idxArg = kernel.set(idxArg, (int)level); // const int level
994             idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
995             idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
996             idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
997             idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
998             idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
999             idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
1000             idxArg = kernel.set(idxArg, (int)iters); // int c_iters
1001             idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
1002             return kernel.run(2, globalThreads, localThreads, true); // sync=true because ocl::Image2D lifetime is not handled well for temp UMat
1003         }
1004     private:
1005         inline static bool isDeviceCPU()
1006         {
1007             return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
1008         }
1009
1010
1011     bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
1012                                          InputArray _prevPts, InputOutputArray _nextPts,
1013                                          OutputArray _status, OutputArray _err)
1014     {
1015         if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
1016             return false;
1017         if (!cv::ocl::Device::getDefault().imageSupport())
1018             return false;
1019         if (_nextImg.size() != _prevImg.size())
1020             return false;
1021         int typePrev = _prevImg.type();
1022         int typeNext = _nextImg.type();
1023         if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
1024             return false;
1025         if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
1026             return false;
1027
1028         if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
1029             return false;
1030         if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
1031             return false;
1032         size_t npoints = _prevPts.total();
1033         if (useInitialFlow)
1034         {
1035             if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
1036                 return false;
1037             if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
1038                 return false;
1039             if (_nextPts.total() != npoints)
1040                 return false;
1041         }
1042         else
1043         {
1044             _nextPts.create(_prevPts.size(), _prevPts.type());
1045         }
1046
1047         if (!checkParam())
1048             return false;
1049
1050         UMat umatErr;
1051         if (_err.needed())
1052         {
1053             _err.create((int)npoints, 1, CV_32FC1);
1054             umatErr = _err.getUMat();
1055         }
1056         else
1057             umatErr.create((int)npoints, 1, CV_32FC1);
1058
1059         _status.create((int)npoints, 1, CV_8UC1);
1060         UMat umatNextPts = _nextPts.getUMat();
1061         UMat umatStatus = _status.getUMat();
1062         return sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
1063     }
1064 #endif
1065
1066 #ifdef HAVE_OPENVX
1067     bool openvx_pyrlk(InputArray _prevImg, InputArray _nextImg, InputArray _prevPts, InputOutputArray _nextPts,
1068                              OutputArray _status, OutputArray _err)
1069     {
1070         using namespace ivx;
1071
1072         // Pyramids as inputs are not acceptable because there's no (direct or simple) way
1073         // to build vx_pyramid on user data
1074         if(_prevImg.kind() != _InputArray::MAT || _nextImg.kind() != _InputArray::MAT)
1075             return false;
1076
1077         Mat prevImgMat = _prevImg.getMat(), nextImgMat = _nextImg.getMat();
1078
1079         if(prevImgMat.type() != CV_8UC1 || nextImgMat.type() != CV_8UC1)
1080             return false;
1081
1082         if (ovx::skipSmallImages<VX_KERNEL_OPTICAL_FLOW_PYR_LK>(prevImgMat.cols, prevImgMat.rows))
1083             return false;
1084
1085         CV_Assert(prevImgMat.size() == nextImgMat.size());
1086         Mat prevPtsMat = _prevPts.getMat();
1087         int checkPrev = prevPtsMat.checkVector(2, CV_32F, false);
1088         CV_Assert( checkPrev >= 0 );
1089         size_t npoints = checkPrev;
1090
1091         if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
1092             _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
1093         Mat nextPtsMat = _nextPts.getMat();
1094         CV_Assert( nextPtsMat.checkVector(2, CV_32F, false) == (int)npoints );
1095
1096         _status.create((int)npoints, 1, CV_8U, -1, true);
1097         Mat statusMat = _status.getMat();
1098         uchar* status = statusMat.ptr();
1099         for(size_t i = 0; i < npoints; i++ )
1100             status[i] = true;
1101
1102         // OpenVX doesn't return detection errors
1103         if( _err.needed() )
1104         {
1105             return false;
1106         }
1107
1108         try
1109         {
1110             Context context = ovx::getOpenVXContext();
1111
1112             if(context.vendorID() == VX_ID_KHRONOS)
1113             {
1114                 // PyrLK in OVX 1.0.1 performs vxCommitImagePatch incorrecty and crashes
1115                 if(VX_VERSION == VX_VERSION_1_0)
1116                     return false;
1117                 // Implementation ignores border mode
1118                 // So check that minimal size of image in pyramid is big enough
1119                 int width = prevImgMat.cols, height = prevImgMat.rows;
1120                 for(int i = 0; i < maxLevel+1; i++)
1121                 {
1122                     if(width < winSize.width + 1 || height < winSize.height + 1)
1123                         return false;
1124                     else
1125                     {
1126                         width /= 2; height /= 2;
1127                     }
1128                 }
1129             }
1130
1131             Image prevImg = Image::createFromHandle(context, Image::matTypeToFormat(prevImgMat.type()),
1132                                                     Image::createAddressing(prevImgMat), (void*)prevImgMat.data);
1133             Image nextImg = Image::createFromHandle(context, Image::matTypeToFormat(nextImgMat.type()),
1134                                                     Image::createAddressing(nextImgMat), (void*)nextImgMat.data);
1135
1136             Graph graph = Graph::create(context);
1137
1138             Pyramid prevPyr = Pyramid::createVirtual(graph, (vx_size)maxLevel+1, VX_SCALE_PYRAMID_HALF,
1139                                                      prevImg.width(), prevImg.height(), prevImg.format());
1140             Pyramid nextPyr = Pyramid::createVirtual(graph, (vx_size)maxLevel+1, VX_SCALE_PYRAMID_HALF,
1141                                                      nextImg.width(), nextImg.height(), nextImg.format());
1142
1143             ivx::Node::create(graph, VX_KERNEL_GAUSSIAN_PYRAMID, prevImg, prevPyr);
1144             ivx::Node::create(graph, VX_KERNEL_GAUSSIAN_PYRAMID, nextImg, nextPyr);
1145
1146             Array prevPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1147             Array estimatedPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1148             Array nextPts = Array::create(context, VX_TYPE_KEYPOINT, npoints);
1149
1150             std::vector<vx_keypoint_t> vxPrevPts(npoints), vxEstPts(npoints), vxNextPts(npoints);
1151             for(size_t i = 0; i < npoints; i++)
1152             {
1153                 vx_keypoint_t& prevPt = vxPrevPts[i]; vx_keypoint_t& estPt  = vxEstPts[i];
1154                 prevPt.x = prevPtsMat.at<Point2f>(i).x; prevPt.y = prevPtsMat.at<Point2f>(i).y;
1155                  estPt.x = nextPtsMat.at<Point2f>(i).x;  estPt.y = nextPtsMat.at<Point2f>(i).y;
1156                 prevPt.tracking_status = estPt.tracking_status = vx_true_e;
1157             }
1158             prevPts.addItems(vxPrevPts); estimatedPts.addItems(vxEstPts);
1159
1160             if( (criteria.type & TermCriteria::COUNT) == 0 )
1161                 criteria.maxCount = 30;
1162             else
1163                 criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
1164             if( (criteria.type & TermCriteria::EPS) == 0 )
1165                 criteria.epsilon = 0.01;
1166             else
1167                 criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
1168             criteria.epsilon *= criteria.epsilon;
1169
1170             vx_enum termEnum = (criteria.type == TermCriteria::COUNT) ? VX_TERM_CRITERIA_ITERATIONS :
1171                                (criteria.type == TermCriteria::EPS) ? VX_TERM_CRITERIA_EPSILON :
1172                                VX_TERM_CRITERIA_BOTH;
1173
1174             //minEigThreshold is fixed to 0.0001f
1175             ivx::Scalar termination = ivx::Scalar::create<VX_TYPE_ENUM>(context, termEnum);
1176             ivx::Scalar epsilon = ivx::Scalar::create<VX_TYPE_FLOAT32>(context, criteria.epsilon);
1177             ivx::Scalar numIterations = ivx::Scalar::create<VX_TYPE_UINT32>(context, criteria.maxCount);
1178             ivx::Scalar useInitial = ivx::Scalar::create<VX_TYPE_BOOL>(context, (vx_bool)(flags & OPTFLOW_USE_INITIAL_FLOW));
1179             //assume winSize is square
1180             ivx::Scalar windowSize = ivx::Scalar::create<VX_TYPE_SIZE>(context, (vx_size)winSize.width);
1181
1182             ivx::Node::create(graph, VX_KERNEL_OPTICAL_FLOW_PYR_LK, prevPyr, nextPyr, prevPts, estimatedPts,
1183                               nextPts, termination, epsilon, numIterations, useInitial, windowSize);
1184
1185             graph.verify();
1186             graph.process();
1187
1188             nextPts.copyTo(vxNextPts);
1189             for(size_t i = 0; i < npoints; i++)
1190             {
1191                 vx_keypoint_t kp = vxNextPts[i];
1192                 nextPtsMat.at<Point2f>(i) = Point2f(kp.x, kp.y);
1193                 statusMat.at<uchar>(i) = (bool)kp.tracking_status;
1194             }
1195
1196 #ifdef VX_VERSION_1_1
1197         //we should take user memory back before release
1198         //(it's not done automatically according to standard)
1199         prevImg.swapHandle(); nextImg.swapHandle();
1200 #endif
1201         }
1202         catch (RuntimeError & e)
1203         {
1204             VX_DbgThrow(e.what());
1205         }
1206         catch (WrapperError & e)
1207         {
1208             VX_DbgThrow(e.what());
1209         }
1210
1211         return true;
1212     }
1213 #endif
1214 };
1215
1216
1217
1218 void SparsePyrLKOpticalFlowImpl::calc( InputArray _prevImg, InputArray _nextImg,
1219                            InputArray _prevPts, InputOutputArray _nextPts,
1220                            OutputArray _status, OutputArray _err)
1221 {
1222     CV_INSTRUMENT_REGION()
1223
1224     CV_OCL_RUN(ocl::isOpenCLActivated() &&
1225                (_prevImg.isUMat() || _nextImg.isUMat()) &&
1226                ocl::Image2D::isFormatSupported(CV_32F, 1, false),
1227                ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err))
1228
1229     // Disabled due to bad accuracy
1230     CV_OVX_RUN(false,
1231                openvx_pyrlk(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err))
1232
1233     Mat prevPtsMat = _prevPts.getMat();
1234     const int derivDepth = DataType<cv::detail::deriv_type>::depth;
1235
1236     CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );
1237
1238     int level=0, i, npoints;
1239     CV_Assert( (npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0 );
1240
1241     if( npoints == 0 )
1242     {
1243         _nextPts.release();
1244         _status.release();
1245         _err.release();
1246         return;
1247     }
1248
1249     if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
1250         _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);
1251
1252     Mat nextPtsMat = _nextPts.getMat();
1253     CV_Assert( nextPtsMat.checkVector(2, CV_32F, true) == npoints );
1254
1255     const Point2f* prevPts = prevPtsMat.ptr<Point2f>();
1256     Point2f* nextPts = nextPtsMat.ptr<Point2f>();
1257
1258     _status.create((int)npoints, 1, CV_8U, -1, true);
1259     Mat statusMat = _status.getMat(), errMat;
1260     CV_Assert( statusMat.isContinuous() );
1261     uchar* status = statusMat.ptr();
1262     float* err = 0;
1263
1264     for( i = 0; i < npoints; i++ )
1265         status[i] = true;
1266
1267     if( _err.needed() )
1268     {
1269         _err.create((int)npoints, 1, CV_32F, -1, true);
1270         errMat = _err.getMat();
1271         CV_Assert( errMat.isContinuous() );
1272         err = errMat.ptr<float>();
1273     }
1274
1275     std::vector<Mat> prevPyr, nextPyr;
1276     int levels1 = -1;
1277     int lvlStep1 = 1;
1278     int levels2 = -1;
1279     int lvlStep2 = 1;
1280
1281     if(_prevImg.kind() == _InputArray::STD_VECTOR_MAT)
1282     {
1283         _prevImg.getMatVector(prevPyr);
1284
1285         levels1 = int(prevPyr.size()) - 1;
1286         CV_Assert(levels1 >= 0);
1287
1288         if (levels1 % 2 == 1 && prevPyr[0].channels() * 2 == prevPyr[1].channels() && prevPyr[1].depth() == derivDepth)
1289         {
1290             lvlStep1 = 2;
1291             levels1 /= 2;
1292         }
1293
1294         // ensure that pyramid has reqired padding
1295         if(levels1 > 0)
1296         {
1297             Size fullSize;
1298             Point ofs;
1299             prevPyr[lvlStep1].locateROI(fullSize, ofs);
1300             CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
1301                 && ofs.x + prevPyr[lvlStep1].cols + winSize.width <= fullSize.width
1302                 && ofs.y + prevPyr[lvlStep1].rows + winSize.height <= fullSize.height);
1303         }
1304
1305         if(levels1 < maxLevel)
1306             maxLevel = levels1;
1307     }
1308
1309     if(_nextImg.kind() == _InputArray::STD_VECTOR_MAT)
1310     {
1311         _nextImg.getMatVector(nextPyr);
1312
1313         levels2 = int(nextPyr.size()) - 1;
1314         CV_Assert(levels2 >= 0);
1315
1316         if (levels2 % 2 == 1 && nextPyr[0].channels() * 2 == nextPyr[1].channels() && nextPyr[1].depth() == derivDepth)
1317         {
1318             lvlStep2 = 2;
1319             levels2 /= 2;
1320         }
1321
1322         // ensure that pyramid has reqired padding
1323         if(levels2 > 0)
1324         {
1325             Size fullSize;
1326             Point ofs;
1327             nextPyr[lvlStep2].locateROI(fullSize, ofs);
1328             CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
1329                 && ofs.x + nextPyr[lvlStep2].cols + winSize.width <= fullSize.width
1330                 && ofs.y + nextPyr[lvlStep2].rows + winSize.height <= fullSize.height);
1331         }
1332
1333         if(levels2 < maxLevel)
1334             maxLevel = levels2;
1335     }
1336
1337     if (levels1 < 0)
1338         maxLevel = buildOpticalFlowPyramid(_prevImg, prevPyr, winSize, maxLevel, false);
1339
1340     if (levels2 < 0)
1341         maxLevel = buildOpticalFlowPyramid(_nextImg, nextPyr, winSize, maxLevel, false);
1342
1343     if( (criteria.type & TermCriteria::COUNT) == 0 )
1344         criteria.maxCount = 30;
1345     else
1346         criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
1347     if( (criteria.type & TermCriteria::EPS) == 0 )
1348         criteria.epsilon = 0.01;
1349     else
1350         criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
1351     criteria.epsilon *= criteria.epsilon;
1352
1353     // dI/dx ~ Ix, dI/dy ~ Iy
1354     Mat derivIBuf;
1355     if(lvlStep1 == 1)
1356         derivIBuf.create(prevPyr[0].rows + winSize.height*2, prevPyr[0].cols + winSize.width*2, CV_MAKETYPE(derivDepth, prevPyr[0].channels() * 2));
1357
1358     for( level = maxLevel; level >= 0; level-- )
1359     {
1360         Mat derivI;
1361         if(lvlStep1 == 1)
1362         {
1363             Size imgSize = prevPyr[level * lvlStep1].size();
1364             Mat _derivI( imgSize.height + winSize.height*2,
1365                 imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() );
1366             derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));
1367             calcSharrDeriv(prevPyr[level * lvlStep1], derivI);
1368             copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED);
1369         }
1370         else
1371             derivI = prevPyr[level * lvlStep1 + 1];
1372
1373         CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
1374         CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());
1375
1376         typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
1377         parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
1378                                                           nextPyr[level * lvlStep2], prevPts, nextPts,
1379                                                           status, err,
1380                                                           winSize, criteria, level, maxLevel,
1381                                                           flags, (float)minEigThreshold));
1382     }
1383 }
1384
1385 } // namespace
1386 } // namespace cv
1387 cv::Ptr<cv::SparsePyrLKOpticalFlow> cv::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, TermCriteria crit, int flags, double minEigThreshold){
1388     return makePtr<SparsePyrLKOpticalFlowImpl>(winSize,maxLevel,crit,flags,minEigThreshold);
1389 }
1390 void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
1391                                InputArray _prevPts, InputOutputArray _nextPts,
1392                                OutputArray _status, OutputArray _err,
1393                                Size winSize, int maxLevel,
1394                                TermCriteria criteria,
1395                                int flags, double minEigThreshold )
1396 {
1397     Ptr<cv::SparsePyrLKOpticalFlow> optflow = cv::SparsePyrLKOpticalFlow::create(winSize,maxLevel,criteria,flags,minEigThreshold);
1398     optflow->calc(_prevImg,_nextImg,_prevPts,_nextPts,_status,_err);
1399 }
1400
1401 namespace cv
1402 {
1403
1404 static void
1405 getRTMatrix( const Point2f* a, const Point2f* b,
1406              int count, Mat& M, bool fullAffine )
1407 {
1408     CV_Assert( M.isContinuous() );
1409
1410     if( fullAffine )
1411     {
1412         double sa[6][6]={{0.}}, sb[6]={0.};
1413         Mat A( 6, 6, CV_64F, &sa[0][0] ), B( 6, 1, CV_64F, sb );
1414         Mat MM = M.reshape(1, 6);
1415
1416         for( int i = 0; i < count; i++ )
1417         {
1418             sa[0][0] += a[i].x*a[i].x;
1419             sa[0][1] += a[i].y*a[i].x;
1420             sa[0][2] += a[i].x;
1421
1422             sa[1][1] += a[i].y*a[i].y;
1423             sa[1][2] += a[i].y;
1424
1425             sb[0] += a[i].x*b[i].x;
1426             sb[1] += a[i].y*b[i].x;
1427             sb[2] += b[i].x;
1428             sb[3] += a[i].x*b[i].y;
1429             sb[4] += a[i].y*b[i].y;
1430             sb[5] += b[i].y;
1431         }
1432
1433         sa[3][4] = sa[4][3] = sa[1][0] = sa[0][1];
1434         sa[3][5] = sa[5][3] = sa[2][0] = sa[0][2];
1435         sa[4][5] = sa[5][4] = sa[2][1] = sa[1][2];
1436
1437         sa[3][3] = sa[0][0];
1438         sa[4][4] = sa[1][1];
1439         sa[5][5] = sa[2][2] = count;
1440
1441         solve( A, B, MM, DECOMP_EIG );
1442     }
1443     else
1444     {
1445         double sa[4][4]={{0.}}, sb[4]={0.}, m[4] = {0};
1446         Mat A( 4, 4, CV_64F, sa ), B( 4, 1, CV_64F, sb );
1447         Mat MM( 4, 1, CV_64F, m );
1448
1449         for( int i = 0; i < count; i++ )
1450         {
1451             sa[0][0] += a[i].x*a[i].x + a[i].y*a[i].y;
1452             sa[0][2] += a[i].x;
1453             sa[0][3] += a[i].y;
1454
1455             sb[0] += a[i].x*b[i].x + a[i].y*b[i].y;
1456             sb[1] += a[i].x*b[i].y - a[i].y*b[i].x;
1457             sb[2] += b[i].x;
1458             sb[3] += b[i].y;
1459         }
1460
1461         sa[1][1] = sa[0][0];
1462         sa[2][1] = sa[1][2] = -sa[0][3];
1463         sa[3][1] = sa[1][3] = sa[2][0] = sa[0][2];
1464         sa[2][2] = sa[3][3] = count;
1465         sa[3][0] = sa[0][3];
1466
1467         solve( A, B, MM, DECOMP_EIG );
1468
1469         double* om = M.ptr<double>();
1470         om[0] = om[4] = m[0];
1471         om[1] = -m[1];
1472         om[3] = m[1];
1473         om[2] = m[2];
1474         om[5] = m[3];
1475     }
1476 }
1477
1478 }
1479
1480 cv::Mat cv::estimateRigidTransform( InputArray src1, InputArray src2, bool fullAffine )
1481 {
1482     CV_INSTRUMENT_REGION()
1483
1484     Mat M(2, 3, CV_64F), A = src1.getMat(), B = src2.getMat();
1485
1486     const int COUNT = 15;
1487     const int WIDTH = 160, HEIGHT = 120;
1488     const int RANSAC_MAX_ITERS = 500;
1489     const int RANSAC_SIZE0 = 3;
1490     const double RANSAC_GOOD_RATIO = 0.5;
1491
1492     std::vector<Point2f> pA, pB;
1493     std::vector<int> good_idx;
1494     std::vector<uchar> status;
1495
1496     double scale = 1.;
1497     int i, j, k, k1;
1498
1499     RNG rng((uint64)-1);
1500     int good_count = 0;
1501
1502     if( A.size() != B.size() )
1503         CV_Error( Error::StsUnmatchedSizes, "Both input images must have the same size" );
1504
1505     if( A.type() != B.type() )
1506         CV_Error( Error::StsUnmatchedFormats, "Both input images must have the same data type" );
1507
1508     int count = A.checkVector(2);
1509
1510     if( count > 0 )
1511     {
1512         A.reshape(2, count).convertTo(pA, CV_32F);
1513         B.reshape(2, count).convertTo(pB, CV_32F);
1514     }
1515     else if( A.depth() == CV_8U )
1516     {
1517         int cn = A.channels();
1518         CV_Assert( cn == 1 || cn == 3 || cn == 4 );
1519         Size sz0 = A.size();
1520         Size sz1(WIDTH, HEIGHT);
1521
1522         scale = std::max(1., std::max( (double)sz1.width/sz0.width, (double)sz1.height/sz0.height ));
1523
1524         sz1.width = cvRound( sz0.width * scale );
1525         sz1.height = cvRound( sz0.height * scale );
1526
1527         bool equalSizes = sz1.width == sz0.width && sz1.height == sz0.height;
1528
1529         if( !equalSizes || cn != 1 )
1530         {
1531             Mat sA, sB;
1532
1533             if( cn != 1 )
1534             {
1535                 Mat gray;
1536                 cvtColor(A, gray, COLOR_BGR2GRAY);
1537                 resize(gray, sA, sz1, 0., 0., INTER_AREA);
1538                 cvtColor(B, gray, COLOR_BGR2GRAY);
1539                 resize(gray, sB, sz1, 0., 0., INTER_AREA);
1540             }
1541             else
1542             {
1543                 resize(A, sA, sz1, 0., 0., INTER_AREA);
1544                 resize(B, sB, sz1, 0., 0., INTER_AREA);
1545             }
1546
1547             A = sA;
1548             B = sB;
1549         }
1550
1551         int count_y = COUNT;
1552         int count_x = cvRound((double)COUNT*sz1.width/sz1.height);
1553         count = count_x * count_y;
1554
1555         pA.resize(count);
1556         pB.resize(count);
1557         status.resize(count);
1558
1559         for( i = 0, k = 0; i < count_y; i++ )
1560             for( j = 0; j < count_x; j++, k++ )
1561             {
1562                 pA[k].x = (j+0.5f)*sz1.width/count_x;
1563                 pA[k].y = (i+0.5f)*sz1.height/count_y;
1564             }
1565
1566         // find the corresponding points in B
1567         calcOpticalFlowPyrLK(A, B, pA, pB, status, noArray(), Size(21, 21), 3,
1568                              TermCriteria(TermCriteria::MAX_ITER,40,0.1));
1569
1570         // repack the remained points
1571         for( i = 0, k = 0; i < count; i++ )
1572             if( status[i] )
1573             {
1574                 if( i > k )
1575                 {
1576                     pA[k] = pA[i];
1577                     pB[k] = pB[i];
1578                 }
1579                 k++;
1580             }
1581         count = k;
1582         pA.resize(count);
1583         pB.resize(count);
1584     }
1585     else
1586         CV_Error( Error::StsUnsupportedFormat, "Both input images must have either 8uC1 or 8uC3 type" );
1587
1588     good_idx.resize(count);
1589
1590     if( count < RANSAC_SIZE0 )
1591         return Mat();
1592
1593     Rect brect = boundingRect(pB);
1594
1595     // RANSAC stuff:
1596     // 1. find the consensus
1597     for( k = 0; k < RANSAC_MAX_ITERS; k++ )
1598     {
1599         int idx[RANSAC_SIZE0];
1600         Point2f a[RANSAC_SIZE0];
1601         Point2f b[RANSAC_SIZE0];
1602
1603         // choose random 3 non-coplanar points from A & B
1604         for( i = 0; i < RANSAC_SIZE0; i++ )
1605         {
1606             for( k1 = 0; k1 < RANSAC_MAX_ITERS; k1++ )
1607             {
1608                 idx[i] = rng.uniform(0, count);
1609
1610                 for( j = 0; j < i; j++ )
1611                 {
1612                     if( idx[j] == idx[i] )
1613                         break;
1614                     // check that the points are not very close one each other
1615                     if( fabs(pA[idx[i]].x - pA[idx[j]].x) +
1616                         fabs(pA[idx[i]].y - pA[idx[j]].y) < FLT_EPSILON )
1617                         break;
1618                     if( fabs(pB[idx[i]].x - pB[idx[j]].x) +
1619                         fabs(pB[idx[i]].y - pB[idx[j]].y) < FLT_EPSILON )
1620                         break;
1621                 }
1622
1623                 if( j < i )
1624                     continue;
1625
1626                 if( i+1 == RANSAC_SIZE0 )
1627                 {
1628                     // additional check for non-complanar vectors
1629                     a[0] = pA[idx[0]];
1630                     a[1] = pA[idx[1]];
1631                     a[2] = pA[idx[2]];
1632
1633                     b[0] = pB[idx[0]];
1634                     b[1] = pB[idx[1]];
1635                     b[2] = pB[idx[2]];
1636
1637                     double dax1 = a[1].x - a[0].x, day1 = a[1].y - a[0].y;
1638                     double dax2 = a[2].x - a[0].x, day2 = a[2].y - a[0].y;
1639                     double dbx1 = b[1].x - b[0].x, dby1 = b[1].y - b[0].y;
1640                     double dbx2 = b[2].x - b[0].x, dby2 = b[2].y - b[0].y;
1641                     const double eps = 0.01;
1642
1643                     if( fabs(dax1*day2 - day1*dax2) < eps*std::sqrt(dax1*dax1+day1*day1)*std::sqrt(dax2*dax2+day2*day2) ||
1644                         fabs(dbx1*dby2 - dby1*dbx2) < eps*std::sqrt(dbx1*dbx1+dby1*dby1)*std::sqrt(dbx2*dbx2+dby2*dby2) )
1645                         continue;
1646                 }
1647                 break;
1648             }
1649
1650             if( k1 >= RANSAC_MAX_ITERS )
1651                 break;
1652         }
1653
1654         if( i < RANSAC_SIZE0 )
1655             continue;
1656
1657         // estimate the transformation using 3 points
1658         getRTMatrix( a, b, 3, M, fullAffine );
1659
1660         const double* m = M.ptr<double>();
1661         for( i = 0, good_count = 0; i < count; i++ )
1662         {
1663             if( std::abs( m[0]*pA[i].x + m[1]*pA[i].y + m[2] - pB[i].x ) +
1664                 std::abs( m[3]*pA[i].x + m[4]*pA[i].y + m[5] - pB[i].y ) < std::max(brect.width,brect.height)*0.05 )
1665                 good_idx[good_count++] = i;
1666         }
1667
1668         if( good_count >= count*RANSAC_GOOD_RATIO )
1669             break;
1670     }
1671
1672     if( k >= RANSAC_MAX_ITERS )
1673         return Mat();
1674
1675     if( good_count < count )
1676     {
1677         for( i = 0; i < good_count; i++ )
1678         {
1679             j = good_idx[i];
1680             pA[i] = pA[j];
1681             pB[i] = pB[j];
1682         }
1683     }
1684
1685     getRTMatrix( &pA[0], &pB[0], good_count, M, fullAffine );
1686     M.at<double>(0, 2) /= scale;
1687     M.at<double>(1, 2) /= scale;
1688
1689     return M;
1690 }
1691
1692 /* End of file. */