/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
static IppStatus sts = ippInit();
#endif

namespace cv
{
#if IPP_VERSION_X100 >= 701
    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
    typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
    typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
    typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

    template <int channels, typename Type>
    bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    {
        Type values[channels];
        for( int i = 0; i < channels; i++ )
            values[i] = saturate_cast<Type>(value[i]);
        return func(values, dataPointer, step, size) >= 0;
    }

    static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
    {
        if( channels == 1 )
        {
            switch( depth )
            {
            case CV_8U:
                return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
            case CV_16U:
                return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
            case CV_32F:
                return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
            }
        }
        else
        {
            if( channels == 3 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
                case CV_16U:
                    return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
                case CV_32F:
                    return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
                }
            }
            else if( channels == 4 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
                case CV_16U:
                    return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
                case CV_32F:
                    return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
                }
            }
        }
        return false;
    }
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

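// Two-tap linear kernel: for a sub-pixel phase x in [0,1) the two
// neighbors are weighted (1 - x, x).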
static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

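// Four-tap bicubic kernel (A = -0.75). The first three weights are
// evaluated directly and the fourth is defined as 1 minus their sum, so the
// four coefficients always sum to exactly 1.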
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

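// Eight-tap Lanczos kernel (a = 4). sin() and cos() are evaluated only once,
// at y0; the values at the remaining taps, whose arguments differ from y0 by
// multiples of pi/4, are recovered from the precomputed cs[] table of phase
// factors. The weights are normalized at the end so they sum to 1.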
static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}

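// Fills a 1D coefficient table: tab[i] holds the ksize kernel weights for
// the sub-pixel phase i/tabsz, with ksize = 2, 4 or 8 depending on the
// interpolation method.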
static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


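// Builds the 2D interpolation tables as outer products of the 1D kernels,
// in both float and fixed-point form. For the fixed-point table, the weights
// of each ksize x ksize block are nudged so that they sum to exactly
// INTER_REMAP_COEF_SCALE: any rounding residue is folded into one of the
// four central coefficients (the largest or the smallest, depending on the
// sign of the error).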
static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif

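// Cast converts the accumulator type to the destination type with
// saturation; FixedPtCast additionally rounds a fixed-point accumulator to
// the nearest integer by adding DELTA = 2^(bits-1) before the right shift.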
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

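// Nearest-neighbor resize body, parallelized over destination rows. x_ofs
// holds the precomputed source offset (in bytes) of every destination
// column; the switch dispatches on the pixel size so that the common cases
// are copied with single loads/stores instead of a per-byte loop.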
class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

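// Precomputes the per-column source byte offsets, then runs resizeNNInvoker
// over all destination rows in parallel.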
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}


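// Scalar fallbacks: a vector operator returns the number of elements it has
// processed, so returning 0 leaves all the work to the generic C++ loops.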
struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2

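// Vertical linear pass for the 8-bit path (SSE2). The horizontal pass
// produces int values scaled by 2^11 and the beta weights are 2^11-scaled
// shorts, so the exact result needs a rounding right shift by 22 bits
// (FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>). Here each source is
// first shifted right by 4 to fit 16 bits, _mm_mulhi_epi16 drops another 16
// bits of the product, and the final "+2, >>2" applies the remaining
// rounding shift: 4 + 16 + 2 = 22.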
struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};


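// Vertical linear pass storing 16-bit results (SSE2). SSE2 has no unsigned
// 16-bit pack, so for the ushort variant the values are biased by SHRT_MIN
// before the signed pack and the bias is added back (mod 2^16) afterwards;
// for the short variant shiftval is 0 and both steps are no-ops.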
template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;

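// Vertical linear pass for float data: a plain two-row weighted sum, kept
// entirely in single precision (SSE).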
struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};


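// Vertical cubic pass for the 8-bit path (SSE2). Instead of emulating the
// 22-bit fixed-point shift with integer ops, the four rows are converted to
// float and the 1/2^22 scale is folded into the beta coefficients; the sum
// is then converted back with rounding and packed with saturation.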
struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};


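// Vertical cubic pass storing 16-bit results (SSE2); uses the same SHRT_MIN
// bias trick as VResizeLinearVec_32f16 to pack unsigned data.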
template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;

#elif CV_NEON

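// NEON counterparts of the vertical resize kernels. In the 8-bit linear
// kernel vqdmulhq_s16 computes a doubling high multiply ((a*b*2) >> 16), so
// an extra right shift by 1 is needed to match the SSE2 _mm_mulhi_epi16
// variant; the rounding "+2, >>2" step is identical.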
struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
        }

        return x;
    }
};

typedef VResizeNoVec VResizeCubicVec_32s8u;

struct VResizeCubicVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                       v_b1, vld1q_f32(S1 + x)),
                                                                       v_b2, vld1q_f32(S2 + x)),
                                                                       v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                          v_b1, vld1q_f32(S1 + x + 4)),
                                                                          v_b2, vld1q_f32(S2 + x + 4)),
                                                                          v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};

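// NEON Lanczos4 vertical pass: the eight-row weighted sum is accumulated as
// two four-row halves and then added, for 16-bit and float destinations.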
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLinearVec_32s8u;
typedef VResizeNoVec VResizeLinearVec_32f16u;
typedef VResizeNoVec VResizeLinearVec_32f16s;
typedef VResizeNoVec VResizeLinearVec_32f;

typedef VResizeNoVec VResizeCubicVec_32s8u;
typedef VResizeNoVec VResizeCubicVec_32f16u;
typedef VResizeNoVec VResizeCubicVec_32f16s;
typedef VResizeNoVec VResizeCubicVec_32f;

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;

#endif

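// Horizontal vector operations are not specialized for SSE2/NEON in this
// file; the scalar loops in HResizeLinear handle all elements.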
typedef HResizeNoVec HResizeLinearVec_8u32s;
typedef HResizeNoVec HResizeLinearVec_16u32f;
typedef HResizeNoVec HResizeLinearVec_16s32f;
typedef HResizeNoVec HResizeLinearVec_32f;
typedef HResizeNoVec HResizeLinearVec_64f;


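// Generic horizontal linear pass: for each output column xofs[dx] gives the
// channel-interleaved source index and alpha holds the two tap weights.
// Rows are processed in pairs for better locality; columns at or beyond
// xmax lie outside the two-tap support, so the single border pixel is
// replicated, scaled by ONE (the fixed-point unit).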
1263 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1264 struct HResizeLinear
1265 {
1266     typedef T value_type;
1267     typedef WT buf_type;
1268     typedef AT alpha_type;
1269
1270     void operator()(const T** src, WT** dst, int count,
1271                     const int* xofs, const AT* alpha,
1272                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1273     {
1274         int dx, k;
1275         VecOp vecOp;
1276
1277         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
1278             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
1279
1280         for( k = 0; k <= count - 2; k++ )
1281         {
1282             const T *S0 = src[k], *S1 = src[k+1];
1283             WT *D0 = dst[k], *D1 = dst[k+1];
1284             for( dx = dx0; dx < xmax; dx++ )
1285             {
1286                 int sx = xofs[dx];
1287                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1288                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1289                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1290                 D0[dx] = t0; D1[dx] = t1;
1291             }
1292
1293             for( ; dx < dwidth; dx++ )
1294             {
1295                 int sx = xofs[dx];
1296                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
1297             }
1298         }
1299
1300         for( ; k < count; k++ )
1301         {
1302             const T *S = src[k];
1303             WT *D = dst[k];
1304             for( dx = 0; dx < xmax; dx++ )
1305             {
1306                 int sx = xofs[dx];
1307                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1308             }
1309
1310             for( ; dx < dwidth; dx++ )
1311                 D[dx] = WT(S[xofs[dx]]*ONE);
1312         }
1313     }
1314 };
1315
1316
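// Vertical pass of linear resize: blends the two buffered rows src[0] and
// src[1] with the weights beta[0] and beta[1], then converts back to the
// destination type through CastOp. VecOp (SIMD, when available) handles the
// bulk of the row and returns how far it got; the scalar loop finishes the tail.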
1317 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1318 struct VResizeLinear
1319 {
1320     typedef T value_type;
1321     typedef WT buf_type;
1322     typedef AT alpha_type;
1323
1324     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1325     {
1326         WT b0 = beta[0], b1 = beta[1];
1327         const WT *S0 = src[0], *S1 = src[1];
1328         CastOp castOp;
1329         VecOp vecOp;
1330
1331         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1332         #if CV_ENABLE_UNROLLED
1333         for( ; x <= width - 4; x += 4 )
1334         {
1335             WT t0, t1;
1336             t0 = S0[x]*b0 + S1[x]*b1;
1337             t1 = S0[x+1]*b0 + S1[x+1]*b1;
1338             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1339             t0 = S0[x+2]*b0 + S1[x+2]*b1;
1340             t1 = S0[x+3]*b0 + S1[x+3]*b1;
1341             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1342         }
1343         #endif
1344         for( ; x < width; x++ )
1345             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
1346     }
1347 };
1348
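// Fixed-point specialization for 8-bit destinations, matching the arithmetic of
// VResizeLinearVec_32s8u. Editorial note on the shifts, assuming the default
// INTER_RESIZE_COEF_BITS == 11: the buffered rows and the beta weights are each
// scaled by 2^11, so 22 surplus bits must be removed. They are removed in three
// steps -- 4 bits before the multiply, 16 after it, and 2 after the sum, with
// +2 for round-to-nearest (4 + 16 + 2 == 22). Illustrative check for a
// full-scale pixel, S0[x] == S1[x] == 255*2048 and b0 == b1 == 1024:
//   ((1024*(522240 >> 4)) >> 16)*2 == 1020;  (1020 + 2) >> 2 == 255.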
1349 template<>
1350 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1351 {
1352     typedef uchar value_type;
1353     typedef int buf_type;
1354     typedef short alpha_type;
1355
1356     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1357     {
1358         alpha_type b0 = beta[0], b1 = beta[1];
1359         const buf_type *S0 = src[0], *S1 = src[1];
1360         VResizeLinearVec_32s8u vecOp;
1361
1362         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1363         #if CV_ENABLE_UNROLLED
1364         for( ; x <= width - 4; x += 4 )
1365         {
1366             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1367             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1368             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1369             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1370         }
1371         #endif
1372         for( ; x < width; x++ )
1373             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
1374     }
1375 };
1376
1377
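// Horizontal pass of bicubic resize: a 4-tap filter over the samples at
// xofs[dx] - cn .. xofs[dx] + 2*cn. The slow branch (border columns, dx < xmin
// and dx >= xmax) steps any out-of-range tap back inside the row by whole
// pixels (cn at a time), so the same channel is reused -- border replication.
// alpha advances four entries per output and is rewound after each row.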
1378 template<typename T, typename WT, typename AT>
1379 struct HResizeCubic
1380 {
1381     typedef T value_type;
1382     typedef WT buf_type;
1383     typedef AT alpha_type;
1384
1385     void operator()(const T** src, WT** dst, int count,
1386                     const int* xofs, const AT* alpha,
1387                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1388     {
1389         for( int k = 0; k < count; k++ )
1390         {
1391             const T *S = src[k];
1392             WT *D = dst[k];
1393             int dx = 0, limit = xmin;
1394             for(;;)
1395             {
1396                 for( ; dx < limit; dx++, alpha += 4 )
1397                 {
1398                     int j, sx = xofs[dx] - cn;
1399                     WT v = 0;
1400                     for( j = 0; j < 4; j++ )
1401                     {
1402                         int sxj = sx + j*cn;
1403                         if( (unsigned)sxj >= (unsigned)swidth )
1404                         {
1405                             while( sxj < 0 )
1406                                 sxj += cn;
1407                             while( sxj >= swidth )
1408                                 sxj -= cn;
1409                         }
1410                         v += S[sxj]*alpha[j];
1411                     }
1412                     D[dx] = v;
1413                 }
1414                 if( limit == dwidth )
1415                     break;
1416                 for( ; dx < xmax; dx++, alpha += 4 )
1417                 {
1418                     int sx = xofs[dx];
1419                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1420                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
1421                 }
1422                 limit = dwidth;
1423             }
1424             alpha -= dwidth*4;
1425         }
1426     }
1427 };
1428
1429
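// Vertical pass of bicubic resize: a 4-row weighted sum using the beta
// coefficients, SIMD first, scalar tail after.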
1430 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1431 struct VResizeCubic
1432 {
1433     typedef T value_type;
1434     typedef WT buf_type;
1435     typedef AT alpha_type;
1436
1437     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1438     {
1439         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1440         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1441         CastOp castOp;
1442         VecOp vecOp;
1443
1444         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1445         for( ; x < width; x++ )
1446             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
1447     }
1448 };
1449
1450
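// Horizontal pass of Lanczos4 resize: same structure as HResizeCubic, widened
// to an 8-tap window around xofs[dx] (taps at -3*cn .. +4*cn) with eight alpha
// entries per destination column.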
1451 template<typename T, typename WT, typename AT>
1452 struct HResizeLanczos4
1453 {
1454     typedef T value_type;
1455     typedef WT buf_type;
1456     typedef AT alpha_type;
1457
1458     void operator()(const T** src, WT** dst, int count,
1459                     const int* xofs, const AT* alpha,
1460                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1461     {
1462         for( int k = 0; k < count; k++ )
1463         {
1464             const T *S = src[k];
1465             WT *D = dst[k];
1466             int dx = 0, limit = xmin;
1467             for(;;)
1468             {
1469                 for( ; dx < limit; dx++, alpha += 8 )
1470                 {
1471                     int j, sx = xofs[dx] - cn*3;
1472                     WT v = 0;
1473                     for( j = 0; j < 8; j++ )
1474                     {
1475                         int sxj = sx + j*cn;
1476                         if( (unsigned)sxj >= (unsigned)swidth )
1477                         {
1478                             while( sxj < 0 )
1479                                 sxj += cn;
1480                             while( sxj >= swidth )
1481                                 sxj -= cn;
1482                         }
1483                         v += S[sxj]*alpha[j];
1484                     }
1485                     D[dx] = v;
1486                 }
1487                 if( limit == dwidth )
1488                     break;
1489                 for( ; dx < xmax; dx++, alpha += 8 )
1490                 {
1491                     int sx = xofs[dx];
1492                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
1493                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
1494                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
1495                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
1496                 }
1497                 limit = dwidth;
1498             }
1499             alpha -= dwidth*8;
1500         }
1501     }
1502 };
1503
1504
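// Vertical pass of Lanczos4 resize: an 8-row weighted sum; the unrolled branch
// accumulates four destination columns per iteration across all eight rows.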
1505 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1506 struct VResizeLanczos4
1507 {
1508     typedef T value_type;
1509     typedef WT buf_type;
1510     typedef AT alpha_type;
1511
1512     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1513     {
1514         CastOp castOp;
1515         VecOp vecOp;
1516         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1517         #if CV_ENABLE_UNROLLED
1518         for( ; x <= width - 4; x += 4 )
1519         {
1520             WT b = beta[0];
1521             const WT* S = src[0];
1522             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1523
1524             for( k = 1; k < 8; k++ )
1525             {
1526                 b = beta[k]; S = src[k];
1527                 s0 += S[x]*b; s1 += S[x+1]*b;
1528                 s2 += S[x+2]*b; s3 += S[x+3]*b;
1529             }
1530
1531             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1532             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1533         }
1534         #endif
1535         for( ; x < width; x++ )
1536         {
1537             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1538                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1539                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
1540         }
1541     }
1542 };
1543
1544
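// Clamps x into [a, b-1], returning b-1 at the top end, e.g.
// clip(-3, 0, 10) == 0, clip(5, 0, 10) == 5, clip(12, 0, 10) == 9.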
1545 static inline int clip(int x, int a, int b)
1546 {
1547     return x >= a ? (x < b ? x : b-1) : a;
1548 }
1549
1550 static const int MAX_ESIZE=16;
1551
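// Generic separable resize, parallelized over stripes of destination rows. For
// each destination row the invoker collects the ksize source rows the vertical
// filter needs; prev_sy remembers which source row currently sits in each
// buffer slot, so rows already interpolated for the previous destination row
// are reused (or copied between slots) and hresize runs only on the slots that
// actually changed.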
1552 template <typename HResize, typename VResize>
1553 class resizeGeneric_Invoker :
1554     public ParallelLoopBody
1555 {
1556 public:
1557     typedef typename HResize::value_type T;
1558     typedef typename HResize::buf_type WT;
1559     typedef typename HResize::alpha_type AT;
1560
1561     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1562         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1563         int _ksize, int _xmin, int _xmax) :
1564         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1565         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1566         ksize(_ksize), xmin(_xmin), xmax(_xmax)
1567     {
1568         CV_Assert(ksize <= MAX_ESIZE);
1569     }
1570
1571 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1572 # pragma GCC diagnostic push
1573 # pragma GCC diagnostic ignored "-Warray-bounds"
1574 #endif
1575     virtual void operator() (const Range& range) const
1576     {
1577         int dy, cn = src.channels();
1578         HResize hresize;
1579         VResize vresize;
1580
1581         int bufstep = (int)alignSize(dsize.width, 16);
1582         AutoBuffer<WT> _buffer(bufstep*ksize);
1583         const T* srows[MAX_ESIZE]={0};
1584         WT* rows[MAX_ESIZE]={0};
1585         int prev_sy[MAX_ESIZE];
1586
1587         for(int k = 0; k < ksize; k++ )
1588         {
1589             prev_sy[k] = -1;
1590             rows[k] = (WT*)_buffer + bufstep*k;
1591         }
1592
1593         const AT* beta = _beta + ksize * range.start;
1594
1595         for( dy = range.start; dy < range.end; dy++, beta += ksize )
1596         {
1597             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1598
1599             for(int k = 0; k < ksize; k++ )
1600             {
1601                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1602                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1603                 {
1604                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1605                     {
1606                         if( k1 > k )
1607                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1608                         break;
1609                     }
1610                 }
1611                 if( k1 == ksize )
1612                     k0 = std::min(k0, k); // remember the first row that needs to be computed
1613                 srows[k] = src.template ptr<T>(sy);
1614                 prev_sy[k] = sy;
1615             }
1616
1617             if( k0 < ksize )
1618                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1619                         ssize.width, dsize.width, cn, xmin, xmax );
1620             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1621         }
1622     }
1623 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1624 # pragma GCC diagnostic pop
1625 #endif
1626
1627 private:
1628     Mat src;
1629     Mat dst;
1630     const int* xofs, *yofs;
1631     const AT* alpha, *_beta;
1632     Size ssize, dsize;
1633     const int ksize, xmin, xmax;
1634
1635     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
1636 };
1637
1638 template<class HResize, class VResize>
1639 static void resizeGeneric_( const Mat& src, Mat& dst,
1640                             const int* xofs, const void* _alpha,
1641                             const int* yofs, const void* _beta,
1642                             int xmin, int xmax, int ksize )
1643 {
1644     typedef typename HResize::alpha_type AT;
1645
1646     const AT* beta = (const AT*)_beta;
1647     Size ssize = src.size(), dsize = dst.size();
1648     int cn = src.channels();
1649     ssize.width *= cn;
1650     dsize.width *= cn;
1651     xmin *= cn;
1652     xmax *= cn;
1653     // image resize is a separable operation: rows are interpolated horizontally first, then the buffered rows are blended vertically
1654
1655     Range range(0, dsize.height);
1656     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1657         ssize, dsize, ksize, xmin, xmax);
1658     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1659 }
1660
1661 template <typename T, typename WT>
1662 struct ResizeAreaFastNoVec
1663 {
1664     ResizeAreaFastNoVec(int, int) { }
1665     ResizeAreaFastNoVec(int, int, int, int) { }
1666     int operator() (const T*, T*, int) const
1667     { return 0; }
1668 };
1669
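// SIMD kernels for the 2x2 INTER_AREA fast path. Each operator() averages four
// neighbours per output, D = (s00 + s01 + s10 + s11 + 2) >> 2 for the integer
// types (the +2 rounds to nearest) or * 0.25f for float, and returns how many
// destination elements it produced; ResizeAreaFastNoVec is the do-nothing
// fallback that leaves all the work to the scalar code.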
1670 #if CV_NEON
1671
1672 class ResizeAreaFastVec_SIMD_8u
1673 {
1674 public:
1675     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1676         cn(_cn), step(_step)
1677     {
1678     }
1679
1680     int operator() (const uchar* S, uchar* D, int w) const
1681     {
1682         int dx = 0;
1683         const uchar* S0 = S, * S1 = S0 + step;
1684
1685         uint16x8_t v_2 = vdupq_n_u16(2);
1686
1687         if (cn == 1)
1688         {
1689             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
1690             {
1691                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
1692
1693                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
1694                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
1695                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
1696
1697                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
1698                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
1699                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
1700
1701                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
1702             }
1703         }
1704         else if (cn == 4)
1705         {
1706             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1707             {
1708                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
1709
1710                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
1711                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
1712                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
1713                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
1714
1715                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
1716                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
1717                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
1718                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
1719                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
1720
1721                 vst1_u8(D, vmovn_u16(v_dst));
1722             }
1723         }
1724
1725         return dx;
1726     }
1727
1728 private:
1729     int cn, step;
1730 };
1731
1732 class ResizeAreaFastVec_SIMD_16u
1733 {
1734 public:
1735     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1736         cn(_cn), step(_step)
1737     {
1738     }
1739
1740     int operator() (const ushort * S, ushort * D, int w) const
1741     {
1742         int dx = 0;
1743         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
1744
1745         uint32x4_t v_2 = vdupq_n_u32(2);
1746
1747         if (cn == 1)
1748         {
1749             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1750             {
1751                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
1752
1753                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
1754                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
1755                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
1756
1757                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
1758                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
1759                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
1760
1761                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
1762             }
1763         }
1764         else if (cn == 4)
1765         {
1766             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1767             {
1768                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
1769                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
1770                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
1771                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
1772             }
1773         }
1774
1775         return dx;
1776     }
1777
1778 private:
1779     int cn, step;
1780 };
1781
1782 class ResizeAreaFastVec_SIMD_16s
1783 {
1784 public:
1785     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
1786         cn(_cn), step(_step)
1787     {
1788     }
1789
1790     int operator() (const short * S, short * D, int w) const
1791     {
1792         int dx = 0;
1793         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
1794
1795         int32x4_t v_2 = vdupq_n_s32(2);
1796
1797         if (cn == 1)
1798         {
1799             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1800             {
1801                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
1802
1803                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
1804                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
1805                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
1806
1807                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
1808                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
1809                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
1810
1811                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
1812             }
1813         }
1814         else if (cn == 4)
1815         {
1816             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1817             {
1818                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
1819                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
1820                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
1821                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
1822             }
1823         }
1824
1825         return dx;
1826     }
1827
1828 private:
1829     int cn, step;
1830 };
1831
1832 struct ResizeAreaFastVec_SIMD_32f
1833 {
1834     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
1835         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
1836     {
1837         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
1838     }
1839
1840     int operator() (const float * S, float * D, int w) const
1841     {
1842         if (!fast_mode)
1843             return 0;
1844
1845         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
1846         int dx = 0;
1847
1848         float32x4_t v_025 = vdupq_n_f32(0.25f);
1849
1850         if (cn == 1)
1851         {
1852             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1853             {
1854                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
1855
1856                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
1857                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
1858
1859                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1860             }
1861         }
1862         else if (cn == 4)
1863         {
1864             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1865             {
1866                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
1867                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
1868
1869                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1870             }
1871         }
1872
1873         return dx;
1874     }
1875
1876 private:
1877     int scale_x, scale_y;
1878     int cn;
1879     bool fast_mode;
1880     int step;
1881 };
1882
1883 #elif CV_SSE2
1884
1885 class ResizeAreaFastVec_SIMD_8u
1886 {
1887 public:
1888     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1889         cn(_cn), step(_step)
1890     {
1891         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1892     }
1893
1894     int operator() (const uchar* S, uchar* D, int w) const
1895     {
1896         if (!use_simd)
1897             return 0;
1898
1899         int dx = 0;
1900         const uchar* S0 = S;
1901         const uchar* S1 = S0 + step;
1902         __m128i zero = _mm_setzero_si128();
1903         __m128i delta2 = _mm_set1_epi16(2);
1904
1905         if (cn == 1)
1906         {
1907             __m128i masklow = _mm_set1_epi16(0x00ff);
1908             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1909             {
1910                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1911                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1912
1913                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
1914                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
1915                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
1916                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1917
1918                 _mm_storel_epi64((__m128i*)D, s0);
1919             }
1920         }
1921         else if (cn == 3)
1922             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
1923             {
1924                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1925                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1926
1927                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1928                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
1929                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1930                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
1931
1932                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
1933                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
1934                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1935                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1936                 _mm_storel_epi64((__m128i*)D, s0);
1937
1938                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
1939                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
1940                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1941                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1942                 _mm_storel_epi64((__m128i*)(D+3), s0);
1943             }
1944         else
1945         {
1946             CV_Assert(cn == 4);
1947             int v[] = { 0, 0, -1, -1 };
1948             __m128i mask = _mm_loadu_si128((const __m128i*)v);
1949
1950             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1951             {
1952                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1953                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1954
1955                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1956                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
1957                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1958                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
1959
1960                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
1961                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
1962                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1963                 __m128i res0 = _mm_srli_epi16(s0, 2);
1964
1965                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
1966                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
1967                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1968                 __m128i res1 = _mm_srli_epi16(s0, 2);
1969                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
1970                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
1971                 _mm_storel_epi64((__m128i*)(D), s0);
1972             }
1973         }
1974
1975         return dx;
1976     }
1977
1978 private:
1979     int cn;
1980     bool use_simd;
1981     int step;
1982 };
1983
1984 class ResizeAreaFastVec_SIMD_16u
1985 {
1986 public:
1987     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1988         cn(_cn), step(_step)
1989     {
1990         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1991     }
1992
1993     int operator() (const ushort* S, ushort* D, int w) const
1994     {
1995         if (!use_simd)
1996             return 0;
1997
1998         int dx = 0;
1999         const ushort* S0 = (const ushort*)S;
2000         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
2001         __m128i masklow = _mm_set1_epi32(0x0000ffff);
2002         __m128i zero = _mm_setzero_si128();
2003         __m128i delta2 = _mm_set1_epi32(2);
2004
2005 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
2006
2007         if (cn == 1)
2008         {
2009             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2010             {
2011                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2012                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2013
2014                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
2015                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
2016                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
2017                 s0 = _mm_srli_epi32(s0, 2);
2018                 s0 = _mm_packus_epi32(s0, zero);
2019
2020                 _mm_storel_epi64((__m128i*)D, s0);
2021             }
2022         }
2023         else if (cn == 3)
2024             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
2025             {
2026                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2027                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2028
2029                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
2030                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
2031                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
2032                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
2033
2034                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
2035                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
2036                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
2037                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2038                 _mm_storel_epi64((__m128i*)D, s0);
2039             }
2040         else
2041         {
2042             CV_Assert(cn == 4);
2043             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2044             {
2045                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2046                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2047
2048                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
2049                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
2050                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
2051                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
2052
2053                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
2054                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
2055                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
2056                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2057                 _mm_storel_epi64((__m128i*)D, s0);
2058             }
2059         }
2060
2061 #undef _mm_packus_epi32
2062
2063         return dx;
2064     }
2065
2066 private:
2067     int cn;
2068     int step;
2069     bool use_simd;
2070 };
2071
2072 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
2073 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
2074
2075 #else
2076
2077 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
2078 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
2079 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
2080 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
2081
2082 #endif
2083
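// Scalar wrapper around the SIMD kernels above, used only for exact 2x
// decimation in both directions (fast_mode). The per-channel tail loops mirror
// the SIMD arithmetic: for cn == 1, D[dx] = (S[2*dx] + S[2*dx+1] + the same
// pair from the next row + 2) >> 2.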
2084 template<typename T, typename SIMDVecOp>
2085 struct ResizeAreaFastVec
2086 {
2087     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
2088         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
2089     {
2090         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
2091     }
2092
2093     int operator() (const T* S, T* D, int w) const
2094     {
2095         if (!fast_mode)
2096             return 0;
2097
2098         const T* nextS = (const T*)((const uchar*)S + step);
2099         int dx = vecOp(S, D, w);
2100
2101         if (cn == 1)
2102             for( ; dx < w; ++dx )
2103             {
2104                 int index = dx*2;
2105                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
2106             }
2107         else if (cn == 3)
2108             for( ; dx < w; dx += 3 )
2109             {
2110                 int index = dx*2;
2111                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
2112                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
2113                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
2114             }
2115         else
2116             {
2117                 CV_Assert(cn == 4);
2118                 for( ; dx < w; dx += 4 )
2119                 {
2120                     int index = dx*2;
2121                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
2122                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
2123                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
2124                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
2125                 }
2126             }
2127
2128         return dx;
2129     }
2130
2131 private:
2132     int scale_x, scale_y;
2133     int cn;
2134     bool fast_mode;
2135     int step;
2136     SIMDVecOp vecOp;
2137 };
2138
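// Fast INTER_AREA for integer scale factors. Interior destination pixels sum
// scale_x*scale_y source samples through the precomputed ofs table and scale by
// 1/area; pixels whose sampling window is cut off by the right or bottom image
// edge are averaged over only the samples that exist (sum/count).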
2139 template <typename T, typename WT, typename VecOp>
2140 class resizeAreaFast_Invoker :
2141     public ParallelLoopBody
2142 {
2143 public:
2144     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
2145         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
2146         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
2147         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
2148     {
2149     }
2150
2151     virtual void operator() (const Range& range) const
2152     {
2153         Size ssize = src.size(), dsize = dst.size();
2154         int cn = src.channels();
2155         int area = scale_x*scale_y;
2156         float scale = 1.f/(area);
2157         int dwidth1 = (ssize.width/scale_x)*cn;
2158         dsize.width *= cn;
2159         ssize.width *= cn;
2160         int dy, dx, k = 0;
2161
2162         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
2163
2164         for( dy = range.start; dy < range.end; dy++ )
2165         {
2166             T* D = (T*)(dst.data + dst.step*dy);
2167             int sy0 = dy*scale_y;
2168             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
2169
2170             if( sy0 >= ssize.height )
2171             {
2172                 for( dx = 0; dx < dsize.width; dx++ )
2173                     D[dx] = 0;
2174                 continue;
2175             }
2176
2177             dx = vop(src.template ptr<T>(sy0), D, w);
2178             for( ; dx < w; dx++ )
2179             {
2180                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
2181                 WT sum = 0;
2182                 k = 0;
2183                 #if CV_ENABLE_UNROLLED
2184                 for( ; k <= area - 4; k += 4 )
2185                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
2186                 #endif
2187                 for( ; k < area; k++ )
2188                     sum += S[ofs[k]];
2189
2190                 D[dx] = saturate_cast<T>(sum * scale);
2191             }
2192
2193             for( ; dx < dsize.width; dx++ )
2194             {
2195                 WT sum = 0;
2196                 int count = 0, sx0 = xofs[dx];
2197                 if( sx0 >= ssize.width )
2198                 {   D[dx] = 0; continue;   }   // empty window: skip to avoid dividing by count == 0 below
2199
2200                 for( int sy = 0; sy < scale_y; sy++ )
2201                 {
2202                     if( sy0 + sy >= ssize.height )
2203                         break;
2204                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
2205                     for( int sx = 0; sx < scale_x*cn; sx += cn )
2206                     {
2207                         if( sx0 + sx >= ssize.width )
2208                             break;
2209                         sum += S[sx];
2210                         count++;
2211                     }
2212                 }
2213
2214                 D[dx] = saturate_cast<T>((float)sum/count);
2215             }
2216         }
2217     }
2218
2219 private:
2220     Mat src;
2221     Mat dst;
2222     int scale_x, scale_y;
2223     const int *ofs, *xofs;
2224 };
2225
2226 template<typename T, typename WT, typename VecOp>
2227 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
2228                              int scale_x, int scale_y )
2229 {
2230     Range range(0, dst.rows);
2231     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
2232         scale_y, ofs, xofs);
2233     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2234 }
2235
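// One entry of the sparse weight table used by the general (non-integer scale)
// INTER_AREA path: destination element di accumulates source element si with
// weight alpha; the entries emitted for one destination pixel sum to 1.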
2236 struct DecimateAlpha
2237 {
2238     int si, di;
2239     float alpha;
2240 };
2241
2242
2243 template<typename T, typename WT> class ResizeArea_Invoker :
2244     public ParallelLoopBody
2245 {
2246 public:
2247     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
2248                         const DecimateAlpha* _xtab, int _xtab_size,
2249                         const DecimateAlpha* _ytab, int _ytab_size,
2250                         const int* _tabofs )
2251     {
2252         src = &_src;
2253         dst = &_dst;
2254         xtab0 = _xtab;
2255         xtab_size0 = _xtab_size;
2256         ytab = _ytab;
2257         ytab_size = _ytab_size;
2258         tabofs = _tabofs;
2259     }
2260
2261     virtual void operator() (const Range& range) const
2262     {
2263         Size dsize = dst->size();
2264         int cn = dst->channels();
2265         dsize.width *= cn;
2266         AutoBuffer<WT> _buffer(dsize.width*2);
2267         const DecimateAlpha* xtab = xtab0;
2268         int xtab_size = xtab_size0;
2269         WT *buf = _buffer, *sum = buf + dsize.width;
2270         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
2271
2272         for( dx = 0; dx < dsize.width; dx++ )
2273             sum[dx] = (WT)0;
2274
2275         for( j = j_start; j < j_end; j++ )
2276         {
2277             WT beta = ytab[j].alpha;
2278             int dy = ytab[j].di;
2279             int sy = ytab[j].si;
2280
2281             {
2282                 const T* S = src->template ptr<T>(sy);
2283                 for( dx = 0; dx < dsize.width; dx++ )
2284                     buf[dx] = (WT)0;
2285
2286                 if( cn == 1 )
2287                     for( k = 0; k < xtab_size; k++ )
2288                     {
2289                         int dxn = xtab[k].di;
2290                         WT alpha = xtab[k].alpha;
2291                         buf[dxn] += S[xtab[k].si]*alpha;
2292                     }
2293                 else if( cn == 2 )
2294                     for( k = 0; k < xtab_size; k++ )
2295                     {
2296                         int sxn = xtab[k].si;
2297                         int dxn = xtab[k].di;
2298                         WT alpha = xtab[k].alpha;
2299                         WT t0 = buf[dxn] + S[sxn]*alpha;
2300                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2301                         buf[dxn] = t0; buf[dxn+1] = t1;
2302                     }
2303                 else if( cn == 3 )
2304                     for( k = 0; k < xtab_size; k++ )
2305                     {
2306                         int sxn = xtab[k].si;
2307                         int dxn = xtab[k].di;
2308                         WT alpha = xtab[k].alpha;
2309                         WT t0 = buf[dxn] + S[sxn]*alpha;
2310                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2311                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
2312                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
2313                     }
2314                 else if( cn == 4 )
2315                 {
2316                     for( k = 0; k < xtab_size; k++ )
2317                     {
2318                         int sxn = xtab[k].si;
2319                         int dxn = xtab[k].di;
2320                         WT alpha = xtab[k].alpha;
2321                         WT t0 = buf[dxn] + S[sxn]*alpha;
2322                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2323                         buf[dxn] = t0; buf[dxn+1] = t1;
2324                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
2325                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
2326                         buf[dxn+2] = t0; buf[dxn+3] = t1;
2327                     }
2328                 }
2329                 else
2330                 {
2331                     for( k = 0; k < xtab_size; k++ )
2332                     {
2333                         int sxn = xtab[k].si;
2334                         int dxn = xtab[k].di;
2335                         WT alpha = xtab[k].alpha;
2336                         for( int c = 0; c < cn; c++ )
2337                             buf[dxn + c] += S[sxn + c]*alpha;
2338                     }
2339                 }
2340             }
2341
2342             if( dy != prev_dy )
2343             {
2344                 T* D = dst->template ptr<T>(prev_dy);
2345
2346                 for( dx = 0; dx < dsize.width; dx++ )
2347                 {
2348                     D[dx] = saturate_cast<T>(sum[dx]);
2349                     sum[dx] = beta*buf[dx];
2350                 }
2351                 prev_dy = dy;
2352             }
2353             else
2354             {
2355                 for( dx = 0; dx < dsize.width; dx++ )
2356                     sum[dx] += beta*buf[dx];
2357             }
2358         }
2359
2360         {
2361         T* D = dst->template ptr<T>(prev_dy);
2362         for( dx = 0; dx < dsize.width; dx++ )
2363             D[dx] = saturate_cast<T>(sum[dx]);
2364         }
2365     }
2366
2367 private:
2368     const Mat* src;
2369     Mat* dst;
2370     const DecimateAlpha* xtab0;
2371     const DecimateAlpha* ytab;
2372     int xtab_size0, ytab_size;
2373     const int* tabofs;
2374 };
2375
2376
2377 template <typename T, typename WT>
2378 static void resizeArea_( const Mat& src, Mat& dst,
2379                          const DecimateAlpha* xtab, int xtab_size,
2380                          const DecimateAlpha* ytab, int ytab_size,
2381                          const int* tabofs )
2382 {
2383     parallel_for_(Range(0, dst.rows),
2384                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
2385                  dst.total()/((double)(1 << 16)));
2386 }
2387
2388
2389 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
2390                             const int* xofs, const void* alpha,
2391                             const int* yofs, const void* beta,
2392                             int xmin, int xmax, int ksize );
2393
2394 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
2395                                     const int* ofs, const int *xofs,
2396                                     int scale_x, int scale_y );
2397
2398 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
2399                                 const DecimateAlpha* xtab, int xtab_size,
2400                                 const DecimateAlpha* ytab, int ytab_size,
2401                                 const int* yofs);
2402
2403
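// Builds the 1-D DecimateAlpha table: each destination pixel covers `scale`
// source pixels, and partially covered border pixels are weighted by their
// overlap. Illustrative example for ssize = 5, dsize = 2 (scale = 2.5): dx == 0
// covers source pixels 0 and 1 fully and pixel 2 by half, giving the entries
// (si=0, 0.4), (si=1, 0.4), (si=2, 0.2) -- each weight is overlap/cellWidth,
// and the three sum to 1.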
2404 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
2405 {
2406     int k = 0;
2407     for(int dx = 0; dx < dsize; dx++ )
2408     {
2409         double fsx1 = dx * scale;
2410         double fsx2 = fsx1 + scale;
2411         double cellWidth = std::min(scale, ssize - fsx1);
2412
2413         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2414
2415         sx2 = std::min(sx2, ssize - 1);
2416         sx1 = std::min(sx1, sx2);
2417
2418         if( sx1 - fsx1 > 1e-3 )
2419         {
2420             assert( k < ssize*2 );
2421             tab[k].di = dx * cn;
2422             tab[k].si = (sx1 - 1) * cn;
2423             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
2424         }
2425
2426         for(int sx = sx1; sx < sx2; sx++ )
2427         {
2428             assert( k < ssize*2 );
2429             tab[k].di = dx * cn;
2430             tab[k].si = sx * cn;
2431             tab[k++].alpha = float(1.0 / cellWidth);
2432         }
2433
2434         if( fsx2 - sx2 > 1e-3 )
2435         {
2436             assert( k < ssize*2 );
2437             tab[k].di = dx * cn;
2438             tab[k].si = sx2 * cn;
2439             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2440         }
2441     }
2442     return k;
2443 }
2444
2445 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
2446
2447 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
2448     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2449     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2450     specBuf.allocate(specSize);\
2451     pSpec = (uchar*)specBuf;\
2452     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
2453
2454 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
2455     if (mode == (int)ippCubic) { *ok = false; return; } \
2456     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2457     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2458     specBuf.allocate(specSize);\
2459     pSpec = (uchar*)specBuf;\
2460     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
2461     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
2462     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
2463
2464 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
2465     func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
2466     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2467     specBuf.allocate(specSize);\
2468     pSpec = (uchar*)specBuf;\
2469     AutoBuffer<uchar> buf(initSize);\
2470     uchar* pInit = (uchar*)buf;\
2471     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
2472
2473 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
2474     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
2475     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
2476     else { *ok = false; return; } \
2477     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
2478     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
2479
2480 #if IPP_VERSION_X100 >= 701
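// Parallel wrapper over the IPP resize primitives. The constructor selects the
// type-specific function pointers and initializes the IPP spec structure; each
// worker then maps its destination row range onto an IPP tile, queries the
// matching source offset and scratch-buffer size, and runs the resize. Any IPP
// failure clears *ok so the caller can fall back to the plain C++ path.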
2481 class IPPresizeInvoker :
2482     public ParallelLoopBody
2483 {
2484 public:
2485     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
2486         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
2487         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
2488         func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
2489     {
2490         *ok = true;
2491         IppiSize srcSize, dstSize;
2492         int type = src.type(), specSize = 0, initSize = 0;
2493         srcSize.width  = src.cols;
2494         srcSize.height = src.rows;
2495         dstSize.width  = dst.cols;
2496         dstSize.height = dst.rows;
2497
2498         switch (type)
2499         {
2500 #if 0 // disabled since it breaks tests for CascadeClassifier
2501             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
2502             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
2503             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
2504 #endif
2505             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
2506             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
2507             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
2508             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
2509             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
2510             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
2511             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
2512             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
2513             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
2514             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
2515             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
2516             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
2517             default: { *ok = false; return; } break;
2518         }
2519     }
2520
2521     ~IPPresizeInvoker()
2522     {
2523     }
2524
2525     virtual void operator() (const Range& range) const
2526     {
2527         if (*ok == false)
2528             return;
2529
2530         int cn = src.channels();
2531         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
2532         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
2533         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
2534
2535         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
2536         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
2537         int bufsize = 0, itemSize = (int)src.elemSize1();
2538
2539         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
2540         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
2541
2542         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
2543         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
2544
2545         AutoBuffer<uchar> buf(bufsize + 64);
2546         uchar* bufptr = alignPtr((uchar*)buf, 32);
2547
2548         if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
2549             *ok = false;
2550         else
2551         {
2552             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2553         }
2554     }
2555 private:
2556     const Mat & src;
2557     Mat & dst;
2558     double inv_scale_x;
2559     double inv_scale_y;
2560     void *pSpec;
2561     AutoBuffer<uchar> specBuf;
2562     int mode;
2563     ippiResizeFunc func;
2564     ippiResizeGetBufferSize getBufferSizeFunc;
2565     ippiResizeGetSrcOffset getSrcOffsetFunc;
2566     bool *ok;
2567     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
2568 };
2569
2570 #endif
2571
2572 #ifdef HAVE_OPENCL
2573
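// OpenCL flavour of computeResizeAreaTab: produces the same weights, but as
// flat map/alpha arrays plus per-destination-pixel offsets in ofs_tab, ready to
// be uploaded as kernel arguments.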
2574 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
2575                                       float * const alpha_tab, int * const ofs_tab)
2576 {
2577     int k = 0, dx = 0;
2578     for ( ; dx < dsize; dx++)
2579     {
2580         ofs_tab[dx] = k;
2581
2582         double fsx1 = dx * scale;
2583         double fsx2 = fsx1 + scale;
2584         double cellWidth = std::min(scale, ssize - fsx1);
2585
2586         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2587
2588         sx2 = std::min(sx2, ssize - 1);
2589         sx1 = std::min(sx1, sx2);
2590
2591         if (sx1 - fsx1 > 1e-3)
2592         {
2593             map_tab[k] = sx1 - 1;
2594             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
2595         }
2596
2597         for (int sx = sx1; sx < sx2; sx++)
2598         {
2599             map_tab[k] = sx;
2600             alpha_tab[k++] = float(1.0 / cellWidth);
2601         }
2602
2603         if (fsx2 - sx2 > 1e-3)
2604         {
2605             map_tab[k] = sx2;
2606             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2607         }
2608     }
2609     ofs_tab[dx] = k;
2610 }
2611
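// OpenCL dispatcher for resize. INTER_LINEAR first tries a texture-sampler
// kernel when the device can alias the UMat as an image; otherwise a resizeLN,
// resizeNN or resizeAREA(_FAST) kernel is compiled with the element types and
// conversion routines baked into the build options.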
2612 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
2613                         double fx, double fy, int interpolation)
2614 {
2615     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2616
2617     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
2618     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
2619     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
2620     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
2621         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
2622
2623     // when scale_x and scale_y are both equal to 2,
2624     // INTER_AREA (fast) gives the same result as INTER_LINEAR
2625     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2626         /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
2627
2628     if( !(cn <= 4 &&
2629            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
2630             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
2631         return false;
2632
2633     UMat src = _src.getUMat();
2634     _dst.create(dsize, type);
2635     UMat dst = _dst.getUMat();
2636
2637     Size ssize = src.size();
2638     ocl::Kernel k;
2639     size_t globalsize[] = { dst.cols, dst.rows };
2640
2641     ocl::Image2D srcImage;
2642
2643     // See if this could be done with a sampler.  We stick with integer
2644     // datatypes because the observed error is low.
2645     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
2646                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
2647                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
2648                        src.offset==0);
2649     if (useSampler)
2650     {
2651         int wdepth = std::max(depth, CV_32S);
2652         char buf[2][32];
2653         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
2654                         "-D convertToDT=%s -D cn=%d",
2655                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2656                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2657                         cn);
2658         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
2659
2660         if (k.empty())
2661             useSampler = false;
2662         else
2663         {
2664             // Convert the input into an OpenCL image type, using normalized channel data types
2665             // and aliasing the UMat.
2666             srcImage = ocl::Image2D(src, true, true);
2667             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
2668                    (float)inv_fx, (float)inv_fy);
2669         }
2670     }
2671
2672     if (interpolation == INTER_LINEAR && !useSampler)
2673     {
2674         char buf[2][32];
2675
2676         // integer path is slower because of CPU part, so it's disabled
2677         if (depth == CV_8U && ((void)0, 0))
2678         {
2679             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
2680             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
2681             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
2682             float fxx, fyy;
2683             int sx, sy;
2684
2685             for (int dx = 0; dx < dsize.width; dx++)
2686             {
2687                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
2688                 sx = cvFloor(fxx);
2689                 fxx -= sx;
2690
2691                 if (sx < 0)
2692                     fxx = 0, sx = 0;
2693
2694                 if (sx >= ssize.width-1)
2695                     fxx = 0, sx = ssize.width-1;
2696
2697                 xofs[dx] = sx;
2698                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
2699                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
2700             }
2701
2702             for (int dy = 0; dy < dsize.height; dy++)
2703             {
2704                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
2705                 sy = cvFloor(fyy);
2706                 fyy -= sy;
2707
2708                 yofs[dy] = sy;
2709                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
2710                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
2711             }
2712
2713             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2714             UMat coeffs;
2715             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
2716
2717             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2718                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
2719                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2720                             "-D INTER_RESIZE_COEF_BITS=%d",
2721                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2722                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2723                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2724                             cn, INTER_RESIZE_COEF_BITS));
2725             if (k.empty())
2726                 return false;
2727
2728             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2729                    ocl::KernelArg::PtrReadOnly(coeffs));
2730         }
2731         else
2732         {
2733             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2734             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2735                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
2736                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2737                             "-D INTER_RESIZE_COEF_BITS=%d",
2738                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2739                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2740                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2741                             cn, INTER_RESIZE_COEF_BITS));
2742             if (k.empty())
2743                 return false;
2744
2745             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2746                    (float)inv_fx, (float)inv_fy);
2747         }
2748     }
2749     else if (interpolation == INTER_NEAREST)
2750     {
2751         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
2752                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
2753                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
2754         if (k.empty())
2755             return false;
2756
2757         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2758                (float)inv_fx, (float)inv_fy);
2759     }
2760     else if (interpolation == INTER_AREA)
2761     {
2762         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
2763         int wtype = CV_MAKE_TYPE(wdepth, cn);
2764
2765         char cvt[2][40];
2766         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
2767                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2768                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
2769
2770         UMat alphaOcl, tabofsOcl, mapOcl;
2771         UMat dmap, smap;
2772
2773         if (is_area_fast)
2774         {
2775             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
2776             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
2777                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
2778                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
2779                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
2780                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
2781
2782             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
2783             if (k.empty())
2784                 return false;
2785         }
2786         else
2787         {
2788             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
2789             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
2790             if (k.empty())
2791                 return false;
2792
2793             int xytab_size = (ssize.width + ssize.height) << 1;
2794             int tabofs_size = dsize.height + dsize.width + 2;
2795
2796             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
2797             AutoBuffer<float> _xyalpha_tab(xytab_size);
2798             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
2799             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
2800             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
2801
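            // per-axis tables for the generic AREA kernel: map holds the contributing
            // source indices, alpha their weights, and ofs the start of each
            // destination pixel's span within the map/alpha arrays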
2802             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
2803             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
2804
2805             // upload the precomputed tables to the GPU
2806             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
2807             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
2808             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
2809         }
2810
2811         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
2812
2813         if (is_area_fast)
2814             k.args(srcarg, dstarg);
2815         else
2816             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
2817                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
2818
2819         return k.run(2, globalsize, NULL, false);
2820     }
2821
2822     return k.run(2, globalsize, NULL, false);
2823 }
2824
2825 #endif
2826
2827 }
2828
2829 //////////////////////////////////////////////////////////////////////////////////////////
2830
2831 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
2832                  double inv_scale_x, double inv_scale_y, int interpolation )
2833 {
2834     static ResizeFunc linear_tab[] =
2835     {
2836         resizeGeneric_<
2837             HResizeLinear<uchar, int, short,
2838                 INTER_RESIZE_COEF_SCALE,
2839                 HResizeLinearVec_8u32s>,
2840             VResizeLinear<uchar, int, short,
2841                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2842                 VResizeLinearVec_32s8u> >,
2843         0,
2844         resizeGeneric_<
2845             HResizeLinear<ushort, float, float, 1,
2846                 HResizeLinearVec_16u32f>,
2847             VResizeLinear<ushort, float, float, Cast<float, ushort>,
2848                 VResizeLinearVec_32f16u> >,
2849         resizeGeneric_<
2850             HResizeLinear<short, float, float, 1,
2851                 HResizeLinearVec_16s32f>,
2852             VResizeLinear<short, float, float, Cast<float, short>,
2853                 VResizeLinearVec_32f16s> >,
2854         0,
2855         resizeGeneric_<
2856             HResizeLinear<float, float, float, 1,
2857                 HResizeLinearVec_32f>,
2858             VResizeLinear<float, float, float, Cast<float, float>,
2859                 VResizeLinearVec_32f> >,
2860         resizeGeneric_<
2861             HResizeLinear<double, double, float, 1,
2862                 HResizeNoVec>,
2863             VResizeLinear<double, double, float, Cast<double, double>,
2864                 VResizeNoVec> >,
2865         0
2866     };
2867
2868     static ResizeFunc cubic_tab[] =
2869     {
2870         resizeGeneric_<
2871             HResizeCubic<uchar, int, short>,
2872             VResizeCubic<uchar, int, short,
2873                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2874                 VResizeCubicVec_32s8u> >,
2875         0,
2876         resizeGeneric_<
2877             HResizeCubic<ushort, float, float>,
2878             VResizeCubic<ushort, float, float, Cast<float, ushort>,
2879             VResizeCubicVec_32f16u> >,
2880         resizeGeneric_<
2881             HResizeCubic<short, float, float>,
2882             VResizeCubic<short, float, float, Cast<float, short>,
2883             VResizeCubicVec_32f16s> >,
2884         0,
2885         resizeGeneric_<
2886             HResizeCubic<float, float, float>,
2887             VResizeCubic<float, float, float, Cast<float, float>,
2888             VResizeCubicVec_32f> >,
2889         resizeGeneric_<
2890             HResizeCubic<double, double, float>,
2891             VResizeCubic<double, double, float, Cast<double, double>,
2892             VResizeNoVec> >,
2893         0
2894     };
2895
2896     static ResizeFunc lanczos4_tab[] =
2897     {
2898         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
2899             VResizeLanczos4<uchar, int, short,
2900             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2901             VResizeNoVec> >,
2902         0,
2903         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
2904             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
2905             VResizeLanczos4Vec_32f16u> >,
2906         resizeGeneric_<HResizeLanczos4<short, float, float>,
2907             VResizeLanczos4<short, float, float, Cast<float, short>,
2908             VResizeLanczos4Vec_32f16s> >,
2909         0,
2910         resizeGeneric_<HResizeLanczos4<float, float, float>,
2911             VResizeLanczos4<float, float, float, Cast<float, float>,
2912             VResizeLanczos4Vec_32f> >,
2913         resizeGeneric_<HResizeLanczos4<double, double, float>,
2914             VResizeLanczos4<double, double, float, Cast<double, double>,
2915             VResizeNoVec> >,
2916         0
2917     };
2918
2919     static ResizeAreaFastFunc areafast_tab[] =
2920     {
2921         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
2922         0,
2923         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
2924         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
2925         0,
2926         resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
2927         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
2928         0
2929     };
2930
2931     static ResizeAreaFunc area_tab[] =
2932     {
2933         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
2934         resizeArea_<short, float>, 0, resizeArea_<float, float>,
2935         resizeArea_<double, double>, 0
2936     };
2937
2938     Size ssize = _src.size();
2939
2940     CV_Assert( ssize.area() > 0 );
2941     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
2942     if( dsize.area() == 0 )
2943     {
2944         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
2945                      saturate_cast<int>(ssize.height*inv_scale_y));
2946         CV_Assert( dsize.area() > 0 );
2947     }
2948     else
2949     {
2950         inv_scale_x = (double)dsize.width/ssize.width;
2951         inv_scale_y = (double)dsize.height/ssize.height;
2952     }
2953
2954     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
2955                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
2956
2957     Mat src = _src.getMat();
2958     _dst.create(dsize, src.type());
2959     Mat dst = _dst.getMat();
2960
2961 #ifdef HAVE_TEGRA_OPTIMIZATION
2962     if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
2963         return;
2964 #endif
2965
2966     int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2967     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
2968     int k, sx, sy, dx, dy;
2969
2970     int iscale_x = saturate_cast<int>(scale_x);
2971     int iscale_y = saturate_cast<int>(scale_y);
2972
2973     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
2974             std::abs(scale_y - iscale_y) < DBL_EPSILON;
2975
2976 #if IPP_VERSION_X100 >= 701
2977     CV_IPP_CHECK()
2978     {
2979 #define IPP_RESIZE_EPS 1e-10
2980
2981         double ex = fabs((double)dsize.width / src.cols  - inv_scale_x) / inv_scale_x;
2982         double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y;
2983
2984         if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
2985              (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2986              !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U))
2987         {
2988             int mode = -1;
2989             if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
2990                 mode = ippLinear;
2991             else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
2992                 mode = ippCubic;
2993
2994             if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) &&
2995                 (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
2996                 (depth == CV_64F && mode == ippLinear)))
2997             {
2998                 bool ok = true;
2999                 Range range(0, src.rows);
3000                 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
3001                 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3002                 if( ok )
3003                 {
3004                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
3005                     return;
3006                 }
3007                 setIppErrorStatus();
3008             }
3009         }
3010 #undef IPP_RESIZE_EPS
3011     }
3012 #endif
3013
3014     if( interpolation == INTER_NEAREST )
3015     {
3016         resizeNN( src, dst, inv_scale_x, inv_scale_y );
3017         return;
3018     }
3019
3020     {
3021         // when scale_x and scale_y are both equal to 2,
3022         // INTER_AREA (fast) produces the same result as INTER_LINEAR
3023         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
3024             interpolation = INTER_AREA;
3025
3026         // true "area" interpolation is only implemented for shrinking (scale_x >= 1 && scale_y >= 1).
3027         // In other cases it is emulated using some variant of bilinear interpolation.
3028         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
3029         {
3030             if( is_area_fast )
3031             {
3032                 int area = iscale_x*iscale_y;
3033                 size_t srcstep = src.step / src.elemSize1();
3034                 AutoBuffer<int> _ofs(area + dsize.width*cn);
3035                 int* ofs = _ofs;
3036                 int* xofs = ofs + area;
3037                 ResizeAreaFastFunc func = areafast_tab[depth];
3038                 CV_Assert( func != 0 );
3039
3040                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
3041                     for( sx = 0; sx < iscale_x; sx++ )
3042                         ofs[k++] = (int)(sy*srcstep + sx*cn);
3043
3044                 for( dx = 0; dx < dsize.width; dx++ )
3045                 {
3046                     int j = dx * cn;
3047                     sx = iscale_x * j;
3048                     for( k = 0; k < cn; k++ )
3049                         xofs[j + k] = sx + k;
3050                 }
3051
3052                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
3053                 return;
3054             }
3055
3056             ResizeAreaFunc func = area_tab[depth];
3057             CV_Assert( func != 0 && cn <= 4 );
3058
3059             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
3060             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
3061
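            // each DecimateAlpha entry (si, di, alpha) records that source row/column
            // si contributes the fraction alpha to destination index di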
3062             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
3063             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
3064
3065             AutoBuffer<int> _tabofs(dsize.height + 1);
3066             int* tabofs = _tabofs;
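            // tabofs[dy] = index of the first ytab entry contributing to destination
            // row dy; tabofs[dsize.height] = ytab_size closes the last span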
3067             for( k = 0, dy = 0; k < ytab_size; k++ )
3068             {
3069                 if( k == 0 || ytab[k].di != ytab[k-1].di )
3070                 {
3071                     assert( ytab[k].di == dy );
3072                     tabofs[dy++] = k;
3073                 }
3074             }
3075             tabofs[dy] = ytab_size;
3076
3077             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
3078             return;
3079         }
3080     }
3081
3082     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
3083     bool area_mode = interpolation == INTER_AREA;
3084     bool fixpt = depth == CV_8U;
3085     float fx, fy;
3086     ResizeFunc func=0;
3087     int ksize=0, ksize2;
3088     if( interpolation == INTER_CUBIC )
3089         ksize = 4, func = cubic_tab[depth];
3090     else if( interpolation == INTER_LANCZOS4 )
3091         ksize = 8, func = lanczos4_tab[depth];
3092     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
3093         ksize = 2, func = linear_tab[depth];
3094     else
3095         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
3096     ksize2 = ksize/2;
3097
3098     CV_Assert( func != 0 );
3099
3100     AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
3101     int* xofs = (int*)(uchar*)_buffer;
3102     int* yofs = xofs + width;
3103     float* alpha = (float*)(yofs + dsize.height);
3104     short* ialpha = (short*)alpha;
3105     float* beta = alpha + width*ksize;
3106     short* ibeta = ialpha + width*ksize;
3107     float cbuf[MAX_ESIZE];
3108
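    // _buffer layout: xofs[width], yofs[dsize.height], then cn*ksize horizontal weights
    // per destination column (alpha, or quantized ialpha when fixpt) and ksize vertical
    // weights per destination row (beta/ibeta); cbuf stages one kernel's float weights.
    // [xmin, xmax) marks the destination columns whose ksize-tap kernel fits entirely
    // inside the source row, so the inner loops can skip border handling there.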
3109     for( dx = 0; dx < dsize.width; dx++ )
3110     {
3111         if( !area_mode )
3112         {
3113             fx = (float)((dx+0.5)*scale_x - 0.5);
3114             sx = cvFloor(fx);
3115             fx -= sx;
3116         }
3117         else
3118         {
3119             sx = cvFloor(dx*scale_x);
3120             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
3121             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
3122         }
3123
3124         if( sx < ksize2-1 )
3125         {
3126             xmin = dx+1;
3127             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3128                 fx = 0, sx = 0;
3129         }
3130
3131         if( sx + ksize2 >= ssize.width )
3132         {
3133             xmax = std::min( xmax, dx );
3134             if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3135                 fx = 0, sx = ssize.width-1;
3136         }
3137
3138         for( k = 0, sx *= cn; k < cn; k++ )
3139             xofs[dx*cn + k] = sx + k;
3140
3141         if( interpolation == INTER_CUBIC )
3142             interpolateCubic( fx, cbuf );
3143         else if( interpolation == INTER_LANCZOS4 )
3144             interpolateLanczos4( fx, cbuf );
3145         else
3146         {
3147             cbuf[0] = 1.f - fx;
3148             cbuf[1] = fx;
3149         }
3150         if( fixpt )
3151         {
3152             for( k = 0; k < ksize; k++ )
3153                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3154             for( ; k < cn*ksize; k++ )
3155                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
3156         }
3157         else
3158         {
3159             for( k = 0; k < ksize; k++ )
3160                 alpha[dx*cn*ksize + k] = cbuf[k];
3161             for( ; k < cn*ksize; k++ )
3162                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
3163         }
3164     }
3165
3166     for( dy = 0; dy < dsize.height; dy++ )
3167     {
3168         if( !area_mode )
3169         {
3170             fy = (float)((dy+0.5)*scale_y - 0.5);
3171             sy = cvFloor(fy);
3172             fy -= sy;
3173         }
3174         else
3175         {
3176             sy = cvFloor(dy*scale_y);
3177             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
3178             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
3179         }
3180
3181         yofs[dy] = sy;
3182         if( interpolation == INTER_CUBIC )
3183             interpolateCubic( fy, cbuf );
3184         else if( interpolation == INTER_LANCZOS4 )
3185             interpolateLanczos4( fy, cbuf );
3186         else
3187         {
3188             cbuf[0] = 1.f - fy;
3189             cbuf[1] = fy;
3190         }
3191
3192         if( fixpt )
3193         {
3194             for( k = 0; k < ksize; k++ )
3195                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3196         }
3197         else
3198         {
3199             for( k = 0; k < ksize; k++ )
3200                 beta[dy*ksize + k] = cbuf[k];
3201         }
3202     }
3203
3204     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
3205           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
3206 }
3207
3208
3209 /****************************************************************************************\
3210 *                       General warping (affine, perspective, remap)                     *
3211 \****************************************************************************************/
3212
3213 namespace cv
3214 {
3215
3216 template<typename T>
3217 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
3218                           int borderType, const Scalar& _borderValue )
3219 {
3220     Size ssize = _src.size(), dsize = _dst.size();
3221     int cn = _src.channels();
3222     const T* S0 = _src.ptr<T>();
3223     size_t sstep = _src.step/sizeof(S0[0]);
3224     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3225         saturate_cast<T>(_borderValue[1]),
3226         saturate_cast<T>(_borderValue[2]),
3227         saturate_cast<T>(_borderValue[3]));
3228     int dx, dy;
3229
3230     unsigned width1 = ssize.width, height1 = ssize.height;
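    // keeping the bounds unsigned lets a single compare, (unsigned)sx < width1, test
    // 0 <= sx < width, since negative coordinates wrap around to huge unsigned values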
3231
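    // when both the destination and the map are continuous, process them as one long row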
3232     if( _dst.isContinuous() && _xy.isContinuous() )
3233     {
3234         dsize.width *= dsize.height;
3235         dsize.height = 1;
3236     }
3237
3238     for( dy = 0; dy < dsize.height; dy++ )
3239     {
3240         T* D = _dst.ptr<T>(dy);
3241         const short* XY = _xy.ptr<short>(dy);
3242
3243         if( cn == 1 )
3244         {
3245             for( dx = 0; dx < dsize.width; dx++ )
3246             {
3247                 int sx = XY[dx*2], sy = XY[dx*2+1];
3248                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3249                     D[dx] = S0[sy*sstep + sx];
3250                 else
3251                 {
3252                     if( borderType == BORDER_REPLICATE )
3253                     {
3254                         sx = clip(sx, 0, ssize.width);
3255                         sy = clip(sy, 0, ssize.height);
3256                         D[dx] = S0[sy*sstep + sx];
3257                     }
3258                     else if( borderType == BORDER_CONSTANT )
3259                         D[dx] = cval[0];
3260                     else if( borderType != BORDER_TRANSPARENT )
3261                     {
3262                         sx = borderInterpolate(sx, ssize.width, borderType);
3263                         sy = borderInterpolate(sy, ssize.height, borderType);
3264                         D[dx] = S0[sy*sstep + sx];
3265                     }
3266                 }
3267             }
3268         }
3269         else
3270         {
3271             for( dx = 0; dx < dsize.width; dx++, D += cn )
3272             {
3273                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3274                 const T *S;
3275                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3276                 {
3277                     if( cn == 3 )
3278                     {
3279                         S = S0 + sy*sstep + sx*3;
3280                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
3281                     }
3282                     else if( cn == 4 )
3283                     {
3284                         S = S0 + sy*sstep + sx*4;
3285                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
3286                     }
3287                     else
3288                     {
3289                         S = S0 + sy*sstep + sx*cn;
3290                         for( k = 0; k < cn; k++ )
3291                             D[k] = S[k];
3292                     }
3293                 }
3294                 else if( borderType != BORDER_TRANSPARENT )
3295                 {
3296                     if( borderType == BORDER_REPLICATE )
3297                     {
3298                         sx = clip(sx, 0, ssize.width);
3299                         sy = clip(sy, 0, ssize.height);
3300                         S = S0 + sy*sstep + sx*cn;
3301                     }
3302                     else if( borderType == BORDER_CONSTANT )
3303                         S = &cval[0];
3304                     else
3305                     {
3306                         sx = borderInterpolate(sx, ssize.width, borderType);
3307                         sy = borderInterpolate(sy, ssize.height, borderType);
3308                         S = S0 + sy*sstep + sx*cn;
3309                     }
3310                     for( k = 0; k < cn; k++ )
3311                         D[k] = S[k];
3312                 }
3313             }
3314         }
3315     }
3316 }
3317
3318
3319 struct RemapNoVec
3320 {
3321     int operator()( const Mat&, void*, const short*, const ushort*,
3322                     const void*, int ) const { return 0; }
3323 };
3324
3325 #if CV_SSE2
3326
3327 struct RemapVec_8u
3328 {
3329     int operator()( const Mat& _src, void* _dst, const short* XY,
3330                     const ushort* FXY, const void* _wtab, int width ) const
3331     {
3332         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
3333
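        // this SSE2 path packs the byte offset x*cn + y*sstep with a 16-bit
        // multiply-add (see xy2ofs below), so it bails out to the scalar path when the
        // row step does not fit in a signed 16-bit value or the channel count is
        // unsupported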
3334         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
3335             sstep > 0x8000 )
3336             return 0;
3337
3338         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
3339         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
3340         uchar* D = (uchar*)_dst;
3341         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
3342         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
3343         __m128i z = _mm_setzero_si128();
3344         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
3345
3346         if( cn == 1 )
3347         {
3348             for( ; x <= width - 8; x += 8 )
3349             {
3350                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3351                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
3352                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
3353                 unsigned i0, i1;
3354
3355                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3356                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
3357                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3358                 _mm_store_si128( (__m128i*)iofs1, xy1 );
3359
3360                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
3361                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
3362                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3363                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
3364                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
3365                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3366                 v0 = _mm_unpacklo_epi8(v0, z);
3367                 v1 = _mm_unpacklo_epi8(v1, z);
3368
3369                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
3370                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
3371                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
3372                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
3373                 b0 = _mm_unpacklo_epi64(a0, a1);
3374                 b1 = _mm_unpackhi_epi64(a0, a1);
3375                 v0 = _mm_madd_epi16(v0, b0);
3376                 v1 = _mm_madd_epi16(v1, b1);
3377                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
3378
3379                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
3380                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
3381                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3382                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
3383                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
3384                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3385                 v2 = _mm_unpacklo_epi8(v2, z);
3386                 v3 = _mm_unpacklo_epi8(v3, z);
3387
3388                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
3389                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
3390                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
3391                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
3392                 b0 = _mm_unpacklo_epi64(a0, a1);
3393                 b1 = _mm_unpackhi_epi64(a0, a1);
3394                 v2 = _mm_madd_epi16(v2, b0);
3395                 v3 = _mm_madd_epi16(v3, b1);
3396                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
3397
3398                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
3399                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
3400                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
3401                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
3402             }
3403         }
3404         else if( cn == 3 )
3405         {
3406             for( ; x <= width - 5; x += 4, D += 12 )
3407             {
3408                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3409                 __m128i u0, v0, u1, v1;
3410
3411                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3412                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3413                 const __m128i *w0, *w1;
3414                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3415                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3416
3417                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3418                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
3419                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3420                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
3421                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3422                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
3423                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3424                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
3425                 u0 = _mm_unpacklo_epi8(u0, z);
3426                 v0 = _mm_unpacklo_epi8(v0, z);
3427                 u1 = _mm_unpacklo_epi8(u1, z);
3428                 v1 = _mm_unpacklo_epi8(v1, z);
3429                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3430                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3431                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3432                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3433                 u0 = _mm_slli_si128(u0, 4);
3434                 u0 = _mm_packs_epi32(u0, u1);
3435                 u0 = _mm_packus_epi16(u0, u0);
3436                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
3437
3438                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3439                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3440
3441                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3442                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
3443                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3444                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
3445                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3446                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
3447                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3448                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
3449                 u0 = _mm_unpacklo_epi8(u0, z);
3450                 v0 = _mm_unpacklo_epi8(v0, z);
3451                 u1 = _mm_unpacklo_epi8(u1, z);
3452                 v1 = _mm_unpacklo_epi8(v1, z);
3453                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3454                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3455                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3456                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3457                 u0 = _mm_slli_si128(u0, 4);
3458                 u0 = _mm_packs_epi32(u0, u1);
3459                 u0 = _mm_packus_epi16(u0, u0);
3460                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
3461             }
3462         }
3463         else if( cn == 4 )
3464         {
3465             for( ; x <= width - 4; x += 4, D += 16 )
3466             {
3467                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3468                 __m128i u0, v0, u1, v1;
3469
3470                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3471                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3472                 const __m128i *w0, *w1;
3473                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3474                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3475
3476                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3477                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
3478                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3479                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
3480                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3481                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
3482                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3483                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
3484                 u0 = _mm_unpacklo_epi8(u0, z);
3485                 v0 = _mm_unpacklo_epi8(v0, z);
3486                 u1 = _mm_unpacklo_epi8(u1, z);
3487                 v1 = _mm_unpacklo_epi8(v1, z);
3488                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3489                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3490                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3491                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3492                 u0 = _mm_packs_epi32(u0, u1);
3493                 u0 = _mm_packus_epi16(u0, u0);
3494                 _mm_storel_epi64((__m128i*)D, u0);
3495
3496                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3497                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3498
3499                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3500                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
3501                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3502                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
3503                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3504                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
3505                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3506                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
3507                 u0 = _mm_unpacklo_epi8(u0, z);
3508                 v0 = _mm_unpacklo_epi8(v0, z);
3509                 u1 = _mm_unpacklo_epi8(u1, z);
3510                 v1 = _mm_unpacklo_epi8(v1, z);
3511                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3512                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3513                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3514                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3515                 u0 = _mm_packs_epi32(u0, u1);
3516                 u0 = _mm_packus_epi16(u0, u0);
3517                 _mm_storel_epi64((__m128i*)(D + 8), u0);
3518             }
3519         }
3520
3521         return x;
3522     }
3523 };
3524
3525 #else
3526
3527 typedef RemapNoVec RemapVec_8u;
3528
3529 #endif
3530
3531
3532 template<class CastOp, class VecOp, typename AT>
3533 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3534                            const Mat& _fxy, const void* _wtab,
3535                            int borderType, const Scalar& _borderValue )
3536 {
3537     typedef typename CastOp::rtype T;
3538     typedef typename CastOp::type1 WT;
3539     Size ssize = _src.size(), dsize = _dst.size();
3540     int cn = _src.channels();
3541     const AT* wtab = (const AT*)_wtab;
3542     const T* S0 = _src.ptr<T>();
3543     size_t sstep = _src.step/sizeof(S0[0]);
3544     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3545         saturate_cast<T>(_borderValue[1]),
3546         saturate_cast<T>(_borderValue[2]),
3547         saturate_cast<T>(_borderValue[3]));
3548     int dx, dy;
3549     CastOp castOp;
3550     VecOp vecOp;
3551
3552     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3553     CV_Assert( cn <= 4 && ssize.area() > 0 );
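    // the SSE2 3-channel kernel in RemapVec_8u loads 4 bytes per 3-byte pixel, so
    // pixels that map to the last source column are treated as outliers and handled
    // by the scalar border path instead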
3554 #if CV_SSE2
3555     if( _src.type() == CV_8UC3 )
3556         width1 = std::max(ssize.width-2, 0);
3557 #endif
3558
3559     for( dy = 0; dy < dsize.height; dy++ )
3560     {
3561         T* D = _dst.ptr<T>(dy);
3562         const short* XY = _xy.ptr<short>(dy);
3563         const ushort* FXY = _fxy.ptr<ushort>(dy);
3564         int X0 = 0;
3565         bool prevInlier = false;
3566
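            // scan the row once, splitting it into maximal runs of in-bounds
            // ("inlier") and out-of-bounds destination pixels; dx runs one past the
            // end so that the final run is flushed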
3567         for( dx = 0; dx <= dsize.width; dx++ )
3568         {
3569             bool curInlier = dx < dsize.width ?
3570                 (unsigned)XY[dx*2] < width1 &&
3571                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3572             if( curInlier == prevInlier )
3573                 continue;
3574
3575             int X1 = dx;
3576             dx = X0;
3577             X0 = X1;
3578             prevInlier = curInlier;
3579
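            // the state flip at dx closed the half-open run [X0, dx); after the swap
            // above, dx is the run's start and X1 its end, and !curInlier means the
            // closed run was entirely in-bounds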
3580             if( !curInlier )
3581             {
3582                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
3583                 D += len*cn;
3584                 dx += len;
3585
3586                 if( cn == 1 )
3587                 {
3588                     for( ; dx < X1; dx++, D++ )
3589                     {
3590                         int sx = XY[dx*2], sy = XY[dx*2+1];
3591                         const AT* w = wtab + FXY[dx]*4;
3592                         const T* S = S0 + sy*sstep + sx;
3593                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
3594                     }
3595                 }
3596                 else if( cn == 2 )
3597                     for( ; dx < X1; dx++, D += 2 )
3598                     {
3599                         int sx = XY[dx*2], sy = XY[dx*2+1];
3600                         const AT* w = wtab + FXY[dx]*4;
3601                         const T* S = S0 + sy*sstep + sx*2;
3602                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3603                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3604                         D[0] = castOp(t0); D[1] = castOp(t1);
3605                     }
3606                 else if( cn == 3 )
3607                     for( ; dx < X1; dx++, D += 3 )
3608                     {
3609                         int sx = XY[dx*2], sy = XY[dx*2+1];
3610                         const AT* w = wtab + FXY[dx]*4;
3611                         const T* S = S0 + sy*sstep + sx*3;
3612                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3613                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3614                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3615                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
3616                     }
3617                 else
3618                     for( ; dx < X1; dx++, D += 4 )
3619                     {
3620                         int sx = XY[dx*2], sy = XY[dx*2+1];
3621                         const AT* w = wtab + FXY[dx]*4;
3622                         const T* S = S0 + sy*sstep + sx*4;
3623                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3624                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3625                         D[0] = castOp(t0); D[1] = castOp(t1);
3626                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3627                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3628                         D[2] = castOp(t0); D[3] = castOp(t1);
3629                     }
3630             }
3631             else
3632             {
3633                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
3634                 {
3635                     D += (X1 - dx)*cn;
3636                     dx = X1;
3637                     continue;
3638                 }
3639
3640                 if( cn == 1 )
3641                     for( ; dx < X1; dx++, D++ )
3642                     {
3643                         int sx = XY[dx*2], sy = XY[dx*2+1];
3644                         if( borderType == BORDER_CONSTANT &&
3645                             (sx >= ssize.width || sx+1 < 0 ||
3646                              sy >= ssize.height || sy+1 < 0) )
3647                         {
3648                             D[0] = cval[0];
3649                         }
3650                         else
3651                         {
3652                             int sx0, sx1, sy0, sy1;
3653                             T v0, v1, v2, v3;
3654                             const AT* w = wtab + FXY[dx]*4;
3655                             if( borderType == BORDER_REPLICATE )
3656                             {
3657                                 sx0 = clip(sx, 0, ssize.width);
3658                                 sx1 = clip(sx+1, 0, ssize.width);
3659                                 sy0 = clip(sy, 0, ssize.height);
3660                                 sy1 = clip(sy+1, 0, ssize.height);
3661                                 v0 = S0[sy0*sstep + sx0];
3662                                 v1 = S0[sy0*sstep + sx1];
3663                                 v2 = S0[sy1*sstep + sx0];
3664                                 v3 = S0[sy1*sstep + sx1];
3665                             }
3666                             else
3667                             {
3668                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
3669                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3670                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
3671                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3672                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3673                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3674                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
3675                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
3676                             }
3677                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
3678                         }
3679                     }
3680                 else
3681                     for( ; dx < X1; dx++, D += cn )
3682                     {
3683                         int sx = XY[dx*2], sy = XY[dx*2+1], k;
3684                         if( borderType == BORDER_CONSTANT &&
3685                             (sx >= ssize.width || sx+1 < 0 ||
3686                              sy >= ssize.height || sy+1 < 0) )
3687                         {
3688                             for( k = 0; k < cn; k++ )
3689                                 D[k] = cval[k];
3690                         }
3691                         else
3692                         {
3693                             int sx0, sx1, sy0, sy1;
3694                             const T *v0, *v1, *v2, *v3;
3695                             const AT* w = wtab + FXY[dx]*4;
3696                             if( borderType == BORDER_REPLICATE )
3697                             {
3698                                 sx0 = clip(sx, 0, ssize.width);
3699                                 sx1 = clip(sx+1, 0, ssize.width);
3700                                 sy0 = clip(sy, 0, ssize.height);
3701                                 sy1 = clip(sy+1, 0, ssize.height);
3702                                 v0 = S0 + sy0*sstep + sx0*cn;
3703                                 v1 = S0 + sy0*sstep + sx1*cn;
3704                                 v2 = S0 + sy1*sstep + sx0*cn;
3705                                 v3 = S0 + sy1*sstep + sx1*cn;
3706                             }
3707                             else if( borderType == BORDER_TRANSPARENT &&
3708                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
3709                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
3710                                 continue;
3711                             else
3712                             {
3713                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
3714                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3715                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
3716                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3717                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
3718                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
3719                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
3720                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
3721                             }
3722                             for( k = 0; k < cn; k++ )
3723                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
3724                         }
3725                     }
3726             }
3727         }
3728     }
3729 }
3730
3731
3732 template<class CastOp, typename AT, int ONE>
3733 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
3734                           const Mat& _fxy, const void* _wtab,
3735                           int borderType, const Scalar& _borderValue )
3736 {
3737     typedef typename CastOp::rtype T;
3738     typedef typename CastOp::type1 WT;
3739     Size ssize = _src.size(), dsize = _dst.size();
3740     int cn = _src.channels();
3741     const AT* wtab = (const AT*)_wtab;
3742     const T* S0 = _src.ptr<T>();
3743     size_t sstep = _src.step/sizeof(S0[0]);
3744     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3745         saturate_cast<T>(_borderValue[1]),
3746         saturate_cast<T>(_borderValue[2]),
3747         saturate_cast<T>(_borderValue[3]));
3748     int dx, dy;
3749     CastOp castOp;
3750     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
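    // with BORDER_TRANSPARENT, pixels whose source position is outside the image are
    // skipped entirely; window taps that merely poke past the border are filled using
    // BORDER_REFLECT_101 via borderType1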
3751
3752     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
3753
3754     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3755     {
3756         dsize.width *= dsize.height;
3757         dsize.height = 1;
3758     }
3759
3760     for( dy = 0; dy < dsize.height; dy++ )
3761     {
3762         T* D = _dst.ptr<T>(dy);
3763         const short* XY = _xy.ptr<short>(dy);
3764         const ushort* FXY = _fxy.ptr<ushort>(dy);
3765
3766         for( dx = 0; dx < dsize.width; dx++, D += cn )
3767         {
3768             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
3769             const AT* w = wtab + FXY[dx]*16;
3770             int i, k;
3771             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3772             {
3773                 const T* S = S0 + sy*sstep + sx*cn;
3774                 for( k = 0; k < cn; k++ )
3775                 {
3776                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
3777                     S += sstep;
3778                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
3779                     S += sstep;
3780                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
3781                     S += sstep;
3782                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
3783                     S += 1 - sstep*3;
3784                     D[k] = castOp(sum);
3785                 }
3786             }
3787             else
3788             {
3789                 int x[4], y[4];
3790                 if( borderType == BORDER_TRANSPARENT &&
3791                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
3792                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
3793                     continue;
3794
3795                 if( borderType1 == BORDER_CONSTANT &&
3796                     (sx >= ssize.width || sx+4 <= 0 ||
3797                     sy >= ssize.height || sy+4 <= 0))
3798                 {
3799                     for( k = 0; k < cn; k++ )
3800                         D[k] = cval[k];
3801                     continue;
3802                 }
3803
3804                 for( i = 0; i < 4; i++ )
3805                 {
3806                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3807                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
3808                 }
3809
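                // out-of-range taps (index -1 under BORDER_CONSTANT) must contribute
                // the border value cv: starting the sum at cv*ONE (the 16 weights sum
                // to ONE) and adding (S - cv)*w only for valid taps achieves this
                // without special-casing every tap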
3810                 for( k = 0; k < cn; k++, S0++, w -= 16 )
3811                 {
3812                     WT cv = cval[k], sum = cv*ONE;
3813                     for( i = 0; i < 4; i++, w += 4 )
3814                     {
3815                         int yi = y[i];
3816                         const T* S = S0 + yi*sstep;
3817                         if( yi < 0 )
3818                             continue;
3819                         if( x[0] >= 0 )
3820                             sum += (S[x[0]] - cv)*w[0];
3821                         if( x[1] >= 0 )
3822                             sum += (S[x[1]] - cv)*w[1];
3823                         if( x[2] >= 0 )
3824                             sum += (S[x[2]] - cv)*w[2];
3825                         if( x[3] >= 0 )
3826                             sum += (S[x[3]] - cv)*w[3];
3827                     }
3828                     D[k] = castOp(sum);
3829                 }
3830                 S0 -= cn;
3831             }
3832         }
3833     }
3834 }
3835
3836
3837 template<class CastOp, typename AT, int ONE>
3838 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
3839                            const Mat& _fxy, const void* _wtab,
3840                            int borderType, const Scalar& _borderValue )
3841 {
3842     typedef typename CastOp::rtype T;
3843     typedef typename CastOp::type1 WT;
3844     Size ssize = _src.size(), dsize = _dst.size();
3845     int cn = _src.channels();
3846     const AT* wtab = (const AT*)_wtab;
3847     const T* S0 = _src.ptr<T>();
3848     size_t sstep = _src.step/sizeof(S0[0]);
3849     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3850         saturate_cast<T>(_borderValue[1]),
3851         saturate_cast<T>(_borderValue[2]),
3852         saturate_cast<T>(_borderValue[3]));
3853     int dx, dy;
3854     CastOp castOp;
3855     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
3856
3857     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
3858
3859     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3860     {
3861         dsize.width *= dsize.height;
3862         dsize.height = 1;
3863     }
3864
3865     for( dy = 0; dy < dsize.height; dy++ )
3866     {
3867         T* D = _dst.ptr<T>(dy);
3868         const short* XY = _xy.ptr<short>(dy);
3869         const ushort* FXY = _fxy.ptr<ushort>(dy);
3870
3871         for( dx = 0; dx < dsize.width; dx++, D += cn )
3872         {
3873             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
3874             const AT* w = wtab + FXY[dx]*64;
3875             const T* S = S0 + sy*sstep + sx*cn;
3876             int i, k;
3877             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3878             {
3879                 for( k = 0; k < cn; k++ )
3880                 {
3881                     WT sum = 0;
3882                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
3883                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
3884                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
3885                     w -= 64;
3886                     S -= sstep*8 - 1;
3887                     D[k] = castOp(sum);
3888                 }
3889             }
3890             else
3891             {
3892                 int x[8], y[8];
3893                 if( borderType == BORDER_TRANSPARENT &&
3894                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
3895                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
3896                     continue;
3897
3898                 if( borderType1 == BORDER_CONSTANT &&
3899                     (sx >= ssize.width || sx+8 <= 0 ||
3900                     sy >= ssize.height || sy+8 <= 0))
3901                 {
3902                     for( k = 0; k < cn; k++ )
3903                         D[k] = cval[k];
3904                     continue;
3905                 }
3906
3907                 for( i = 0; i < 8; i++ )
3908                 {
3909                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3910                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
3911                 }
3912
3913                 for( k = 0; k < cn; k++, S0++, w -= 64 )
3914                 {
3915                     WT cv = cval[k], sum = cv*ONE;
3916                     for( i = 0; i < 8; i++, w += 8 )
3917                     {
3918                         int yi = y[i];
3919                         const T* S1 = S0 + yi*sstep;
3920                         if( yi < 0 )
3921                             continue;
3922                         if( x[0] >= 0 )
3923                             sum += (S1[x[0]] - cv)*w[0];
3924                         if( x[1] >= 0 )
3925                             sum += (S1[x[1]] - cv)*w[1];
3926                         if( x[2] >= 0 )
3927                             sum += (S1[x[2]] - cv)*w[2];
3928                         if( x[3] >= 0 )
3929                             sum += (S1[x[3]] - cv)*w[3];
3930                         if( x[4] >= 0 )
3931                             sum += (S1[x[4]] - cv)*w[4];
3932                         if( x[5] >= 0 )
3933                             sum += (S1[x[5]] - cv)*w[5];
3934                         if( x[6] >= 0 )
3935                             sum += (S1[x[6]] - cv)*w[6];
3936                         if( x[7] >= 0 )
3937                             sum += (S1[x[7]] - cv)*w[7];
3938                     }
3939                     D[k] = castOp(sum);
3940                 }
3941                 S0 -= cn;
3942             }
3943         }
3944     }
3945 }
3946
3947
3948 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3949                             int borderType, const Scalar& _borderValue );
3950
3951 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3952                           const Mat& _fxy, const void* _wtab,
3953                           int borderType, const Scalar& _borderValue);
3954
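// Parallel driver for remap: each worker processes horizontal bands of the
// destination in tiles of at most 1<<14 pixels, converting the user-supplied
// map(s) for each tile into the packed CV_16SC2 coordinates (plus a CV_16UC1
// table index for interpolating modes) expected by nnfunc/ifunc.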
3955 class RemapInvoker :
3956     public ParallelLoopBody
3957 {
3958 public:
3959     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
3960                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
3961                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
3962         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
3963         borderType(_borderType), borderValue(_borderValue),
3964         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
3965     {
3966     }
3967
3968     virtual void operator() (const Range& range) const
3969     {
3970         int x, y, x1, y1;
3971         const int buf_size = 1 << 14;
3972         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
3973         int bcols0 = std::min(buf_size/brows0, dst->cols);
3974         brows0 = std::min(buf_size/bcols0, dst->rows);
3975     #if CV_SSE2
3976         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
3977     #endif
3978
3979         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
3980         if( !nnfunc )
3981             _bufa.create(brows0, bcols0, CV_16UC1);
3982
3983         for( y = range.start; y < range.end; y += brows0 )
3984         {
3985             for( x = 0; x < dst->cols; x += bcols0 )
3986             {
3987                 int brows = std::min(brows0, range.end - y);
3988                 int bcols = std::min(bcols0, dst->cols - x);
3989                 Mat dpart(*dst, Rect(x, y, bcols, brows));
3990                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
3991
3992                 if( nnfunc )
3993                 {
3994                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
3995                         bufxy = (*m1)(Rect(x, y, bcols, brows));
3996                     else if( map_depth != CV_32F )
3997                     {
3998                         for( y1 = 0; y1 < brows; y1++ )
3999                         {
4000                             short* XY = bufxy.ptr<short>(y1);
4001                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
4002                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4003
4004                             for( x1 = 0; x1 < bcols; x1++ )
4005                             {
4006                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
4007                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
4008                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
4009                             }
4010                         }
4011                     }
4012                     else if( !planar_input )
4013                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
4014                     else
4015                     {
4016                         for( y1 = 0; y1 < brows; y1++ )
4017                         {
4018                             short* XY = bufxy.ptr<short>(y1);
4019                             const float* sX = m1->ptr<float>(y+y1) + x;
4020                             const float* sY = m2->ptr<float>(y+y1) + x;
4021                             x1 = 0;
4022
4023                         #if CV_SSE2
4024                             if( useSIMD )
4025                             {
4026                                 for( ; x1 <= bcols - 8; x1 += 8 )
4027                                 {
4028                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
4029                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4030                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
4031                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4032                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
4033                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
4034                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
4035                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
4036                                     ix0 = _mm_packs_epi32(ix0, ix1);
4037                                     iy0 = _mm_packs_epi32(iy0, iy1);
4038                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
4039                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
4040                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4041                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4042                                 }
4043                             }
4044                         #endif
4045
4046                             for( ; x1 < bcols; x1++ )
4047                             {
4048                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
4049                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
4050                             }
4051                         }
4052                     }
4053                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
4054                     continue;
4055                 }
4056
4057                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
4058                 for( y1 = 0; y1 < brows; y1++ )
4059                 {
4060                     short* XY = bufxy.ptr<short>(y1);
4061                     ushort* A = bufa.ptr<ushort>(y1);
4062
4063                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
4064                     {
4065                         bufxy = (*m1)(Rect(x, y, bcols, brows));
4066
4067                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4068                         x1 = 0;
4069
4070                     #if CV_NEON
4071                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
4072                         for ( ; x1 <= bcols - 8; x1 += 8)
4073                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
4074                     #endif
4075
4076                         for( ; x1 < bcols; x1++ )
4077                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
4078                     }
4079                     else if( planar_input )
4080                     {
4081                         const float* sX = m1->ptr<float>(y+y1) + x;
4082                         const float* sY = m2->ptr<float>(y+y1) + x;
4083
4084                         x1 = 0;
4085                     #if CV_SSE2
4086                         if( useSIMD )
4087                         {
4088                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
4089                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
4090                             for( ; x1 <= bcols - 8; x1 += 8 )
4091                             {
4092                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
4093                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4094                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
4095                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4096                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
4097                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
4098                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
4099                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
4100                                 __m128i mx0 = _mm_and_si128(ix0, mask);
4101                                 __m128i mx1 = _mm_and_si128(ix1, mask);
4102                                 __m128i my0 = _mm_and_si128(iy0, mask);
4103                                 __m128i my1 = _mm_and_si128(iy1, mask);
4104                                 mx0 = _mm_packs_epi32(mx0, mx1);
4105                                 my0 = _mm_packs_epi32(my0, my1);
4106                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
4107                                 mx0 = _mm_or_si128(mx0, my0);
4108                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
4109                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
4110                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
4111                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
4112                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
4113                                 ix0 = _mm_packs_epi32(ix0, ix1);
4114                                 iy0 = _mm_packs_epi32(iy0, iy1);
4115                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4116                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4117                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4118                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4119                             }
4120                         }
4121                     #elif CV_NEON
4122                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
4123                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4124
4125                         for( ; x1 <= bcols - 4; x1 += 4 )
4126                         {
4127                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
4128                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
4129                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4130                                                       vandq_s32(v_sy, v_scale2));
4131                             vst1_u16(A + x1, vqmovun_s32(v_v));
4132
4133                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4134                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4135                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4136                         }
4137                     #endif
4138
4139                         for( ; x1 < bcols; x1++ )
4140                         {
4141                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
4142                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
4143                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4144                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4145                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4146                             A[x1] = (ushort)v;
4147                         }
4148                     }
4149                     else
4150                     {
4151                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
4152                         x1 = 0;
4153
4154                     #if CV_NEON
4155                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
4156                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4157
4158                         for( ; x1 <= bcols - 4; x1 += 4 )
4159                         {
4160                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
4161                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
4162                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
4163                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4164                                                       vandq_s32(v_sy, v_scale2));
4165                             vst1_u16(A + x1, vqmovun_s32(v_v));
4166
4167                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4168                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4169                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4170                         }
4171                     #endif
4172
4173                         for( ; x1 < bcols; x1++ )
4174                         {
4175                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
4176                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
4177                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4178                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4179                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4180                             A[x1] = (ushort)v;
4181                         }
4182                     }
4183                 }
4184                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
4185             }
4186         }
4187     }
4188
4189 private:
4190     const Mat* src;
4191     Mat* dst;
4192     const Mat *m1, *m2;
4193     int borderType;
4194     Scalar borderValue;
4195     int planar_input;
4196     RemapNNFunc nnfunc;
4197     RemapFunc ifunc;
4198     const void *ctab;
4199 };
4200
4201 #ifdef HAVE_OPENCL
4202
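// OpenCL implementation of remap. It bails out (returning false, so the CPU
// path runs instead) for BORDER_TRANSPARENT, for interpolation modes other
// than nearest/linear, and for CV_16SC1 fractional maps.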
4203 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
4204                       int interpolation, int borderType, const Scalar& borderValue)
4205 {
4206     const ocl::Device & dev = ocl::Device::getDefault();
4207     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
4208             rowsPerWI = dev.isIntel() ? 4 : 1;
4209
4210     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
4211             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
4212         return false;
4213
4214     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
4215
4216     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
4217         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
4218     {
4219         if (map1.type() != CV_16SC2)
4220             std::swap(map1, map2);
4221     }
4222     else
4223         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4224
4225     _dst.create(map1.size(), type);
4226     UMat dst = _dst.getUMat();
4227
4228     String kernelName = "remap";
4229     if (map1.type() == CV_32FC2 && map2.empty())
4230         kernelName += "_32FC2";
4231     else if (map1.type() == CV_16SC2)
4232     {
4233         kernelName += "_16SC2";
4234         if (!map2.empty())
4235             kernelName += "_16UC1";
4236     }
4237     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
4238         kernelName += "_2_32FC1";
4239     else
4240         CV_Error(Error::StsBadArg, "Unsupported map types");
4241
4242     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4243     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4244                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4245     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4246                                  interMap[interpolation], borderMap[borderType],
4247                                  ocl::typeToStr(type), rowsPerWI);
4248
4249     if (interpolation != INTER_NEAREST)
4250     {
4251         char cvt[3][40];
4252         int wdepth = std::max(CV_32F, depth);
4253         buildOptions = buildOptions
4254                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4255                                " -D convertToWT2=%s -D WT2=%s",
4256                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4257                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4258                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4259                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4260                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
4261     }
4262     int scalarcn = cn == 3 ? 4 : cn;
4263     int sctype = CV_MAKETYPE(depth, scalarcn);
4264     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4265                            ocl::typeToStr(type), ocl::typeToStr(depth),
4266                            cn, ocl::typeToStr(sctype), depth);
4267
4268     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
4269
4270     Mat scalar(1, 1, sctype, borderValue);
4271     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4272             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4273             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
4274
4275     if (map2.empty())
4276         k.args(srcarg, dstarg, map1arg, scalararg);
4277     else
4278         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
4279
4280     size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4281     return k.run(2, globalThreads, NULL, false);
4282 }
4283
4284 #endif
4285
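// Note: the trailing "&& 0" force-disables this IPP remap path in this
// revision; the code is kept for reference.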
4286 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4287
4288 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4289                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4290                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
4291
4292 class IPPRemapInvoker :
4293         public ParallelLoopBody
4294 {
4295 public:
4296     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4297                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4298         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4299         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
4300     {
4301         *ok = true;
4302     }
4303
4304     virtual void operator() (const Range & range) const
4305     {
4306         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4307         Mat dstRoi = dst.rowRange(range);
4308         IppiSize dstRoiSize = ippiSize(dstRoi.size());
4309         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4310
4311         if (borderType == BORDER_CONSTANT &&
4312                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
4313         {
4314             *ok = false;
4315             return;
4316         }
4317
4318         if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4319                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4320                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
4321             *ok = false;
4322         else
4323         {
4324             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4325         }
4326     }
4327
4328 private:
4329     Mat & src, & dst, & map1, & map2;
4330     ippiRemap ippFunc;
4331     int ippInterpolation, borderType;
4332     Scalar borderValue;
4333     bool * ok;
4334 };
4335
4336 #endif
4337
4338 }
4339
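/* The *_tab arrays below are indexed by CV_MAT_DEPTH(src.type()); null
   entries mark unsupported depths.

   Illustrative usage sketch (not part of this file) -- a horizontal flip
   expressed through float maps:

       cv::Mat src = cv::imread("in.png"), dst;
       cv::Mat map_x(src.size(), CV_32FC1), map_y(src.size(), CV_32FC1);
       for( int y = 0; y < src.rows; y++ )
           for( int x = 0; x < src.cols; x++ )
           {
               map_x.at<float>(y, x) = (float)(src.cols - 1 - x);
               map_y.at<float>(y, x) = (float)y;
           }
       cv::remap(src, dst, map_x, map_y, cv::INTER_LINEAR);
*/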
4340 void cv::remap( InputArray _src, OutputArray _dst,
4341                 InputArray _map1, InputArray _map2,
4342                 int interpolation, int borderType, const Scalar& borderValue )
4343 {
4344     static RemapNNFunc nn_tab[] =
4345     {
4346         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4347         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4348     };
4349
4350     static RemapFunc linear_tab[] =
4351     {
4352         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4353         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4354         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4355         remapBilinear<Cast<float, float>, RemapNoVec, float>,
4356         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4357     };
4358
4359     static RemapFunc cubic_tab[] =
4360     {
4361         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4362         remapBicubic<Cast<float, ushort>, float, 1>,
4363         remapBicubic<Cast<float, short>, float, 1>, 0,
4364         remapBicubic<Cast<float, float>, float, 1>,
4365         remapBicubic<Cast<double, double>, float, 1>, 0
4366     };
4367
4368     static RemapFunc lanczos4_tab[] =
4369     {
4370         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4371         remapLanczos4<Cast<float, ushort>, float, 1>,
4372         remapLanczos4<Cast<float, short>, float, 1>, 0,
4373         remapLanczos4<Cast<float, float>, float, 1>,
4374         remapLanczos4<Cast<double, double>, float, 1>, 0
4375     };
4376
4377     CV_Assert( _map1.size().area() > 0 );
4378     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
4379
4380     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4381                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4382
4383     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
4384     _dst.create( map1.size(), src.type() );
4385     Mat dst = _dst.getMat();
4386     if( dst.data == src.data )
4387         src = src.clone();
4388
4389     if( interpolation == INTER_AREA )
4390         interpolation = INTER_LINEAR;
4391
4392     int type = src.type(), depth = CV_MAT_DEPTH(type);
4393
4394 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4395     CV_IPP_CHECK()
4396     {
4397         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4398                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4399                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4400         {
4401             int ippInterpolation =
4402                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4403                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
4404
4405             ippiRemap ippFunc =
4406                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4407                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4408                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4409                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4410                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4411                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4412                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4413                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4414                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4415
4416             if (ippFunc)
4417             {
4418                 bool ok;
4419                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4420                                         borderType, borderValue, &ok);
4421                 Range range(0, dst.rows);
4422                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4423
4424                 if (ok)
4425                 {
4426                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4427                     return;
4428                 }
4429                 setIppErrorStatus();
4430             }
4431         }
4432     }
4433 #endif
4434
4435     RemapNNFunc nnfunc = 0;
4436     RemapFunc ifunc = 0;
4437     const void* ctab = 0;
4438     bool fixpt = depth == CV_8U;
4439     bool planar_input = false;
4440
4441     if( interpolation == INTER_NEAREST )
4442     {
4443         nnfunc = nn_tab[depth];
4444         CV_Assert( nnfunc != 0 );
4445     }
4446     else
4447     {
4448         if( interpolation == INTER_LINEAR )
4449             ifunc = linear_tab[depth];
4450         else if( interpolation == INTER_CUBIC )
4451             ifunc = cubic_tab[depth];
4452         else if( interpolation == INTER_LANCZOS4 )
4453             ifunc = lanczos4_tab[depth];
4454         else
4455             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4456         CV_Assert( ifunc != 0 );
4457         ctab = initInterTab2D( interpolation, fixpt );
4458     }
4459
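    // Normalize the two map arguments: after this block m1 always points to
    // the CV_16SC2 coordinate map (with an optional CV_16UC1/CV_16SC1
    // fractional table in m2), or planar_input is set for CV_32FC1 pairs
    // that are converted on the fly inside RemapInvoker.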
4460     const Mat *m1 = &map1, *m2 = &map2;
4461
4462     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4463         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4464     {
4465         if( map1.type() != CV_16SC2 )
4466             std::swap(m1, m2);
4467     }
4468     else
4469     {
4470         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4471             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4472         planar_input = map1.channels() == 1;
4473     }
4474
4475     RemapInvoker invoker(src, dst, m1, m2,
4476                          borderType, borderValue, planar_input, nnfunc, ifunc,
4477                          ctab);
4478     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
4479 }
4480
4481
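/* convertMaps translates between the floating-point map representation and
   the fixed-point one used internally: the integer parts go to a CV_16SC2
   map and the fractional parts to a CV_16UC1 index
   (iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)),
   where ix = round(x*INTER_TAB_SIZE).

   Illustrative sketch (not part of this file) -- convert once, then remap
   many times with the faster fixed-point maps:

       cv::Mat m1, m2;
       cv::convertMaps(map_x, map_y, m1, m2, CV_16SC2);  // m1: CV_16SC2, m2: CV_16UC1
       cv::remap(src, dst, m1, m2, cv::INTER_LINEAR);
*/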
4482 void cv::convertMaps( InputArray _map1, InputArray _map2,
4483                       OutputArray _dstmap1, OutputArray _dstmap2,
4484                       int dstm1type, bool nninterpolate )
4485 {
4486     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4487     Size size = map1.size();
4488     const Mat *m1 = &map1, *m2 = &map2;
4489     int m1type = m1->type(), m2type = m2->type();
4490
4491     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4492                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4493                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4494                (m1type == CV_32FC2 && m2->empty()) );
4495
4496     if( m2type == CV_16SC2 )
4497     {
4498         std::swap( m1, m2 );
4499         std::swap( m1type, m2type );
4500     }
4501
4502     if( dstm1type <= 0 )
4503         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4504     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4505     _dstmap1.create( size, dstm1type );
4506     dstmap1 = _dstmap1.getMat();
4507
4508     if( !nninterpolate && dstm1type != CV_32FC2 )
4509     {
4510         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4511         dstmap2 = _dstmap2.getMat();
4512     }
4513     else
4514         _dstmap2.release();
4515
4516     if( m1type == dstm1type || (nninterpolate &&
4517         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4518         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4519     {
4520         m1->convertTo( dstmap1, dstmap1.type() );
4521         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4522             m2->copyTo( dstmap2 );
4523         return;
4524     }
4525
4526     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
4527     {
4528         Mat vdata[] = { *m1, *m2 };
4529         merge( vdata, 2, dstmap1 );
4530         return;
4531     }
4532
4533     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
4534     {
4535         Mat mv[] = { dstmap1, dstmap2 };
4536         split( *m1, mv );
4537         return;
4538     }
4539
4540     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
4541         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
4542     {
4543         size.width *= size.height;
4544         size.height = 1;
4545     }
4546
4547     const float scale = 1.f/INTER_TAB_SIZE;
4548     int x, y;
4549     for( y = 0; y < size.height; y++ )
4550     {
4551         const float* src1f = m1->ptr<float>(y);
4552         const float* src2f = m2->ptr<float>(y);
4553         const short* src1 = (const short*)src1f;
4554         const ushort* src2 = (const ushort*)src2f;
4555
4556         float* dst1f = dstmap1.ptr<float>(y);
4557         float* dst2f = dstmap2.ptr<float>(y);
4558         short* dst1 = (short*)dst1f;
4559         ushort* dst2 = (ushort*)dst2f;
4560
4561         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
4562         {
4563             if( nninterpolate )
4564                 for( x = 0; x < size.width; x++ )
4565                 {
4566                     dst1[x*2] = saturate_cast<short>(src1f[x]);
4567                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
4568                 }
4569             else
4570                 for( x = 0; x < size.width; x++ )
4571                 {
4572                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
4573                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
4574                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4575                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4576                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
4577                 }
4578         }
4579         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
4580         {
4581             if( nninterpolate )
4582                 for( x = 0; x < size.width; x++ )
4583                 {
4584                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
4585                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
4586                 }
4587             else
4588                 for( x = 0; x < size.width; x++ )
4589                 {
4590                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
4591                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
4592                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4593                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4594                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
4595                 }
4596         }
4597         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
4598         {
4599             for( x = 0; x < size.width; x++ )
4600             {
4601                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
4602                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4603                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4604             }
4605         }
4606         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
4607         {
4608             for( x = 0; x < size.width; x++ )
4609             {
4610                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
4611                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4612                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4613             }
4614         }
4615         else
4616             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
4617     }
4618 }
4619
4620
4621 namespace cv
4622 {
4623
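// WarpAffineInvoker evaluates the inverse affine map in AB_BITS fixed point:
// X = (M[0]*x + M[1]*y + M[2])*AB_SCALE is split into a per-row base
// computed from y (X0/Y0) plus the precomputed per-column tables
// adelta/bdelta, so the inner loop needs only adds and shifts per pixel.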
4624 class WarpAffineInvoker :
4625     public ParallelLoopBody
4626 {
4627 public:
4628     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
4629                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
4630         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
4631         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
4632         M(_M)
4633     {
4634     }
4635
4636     virtual void operator() (const Range& range) const
4637     {
4638         const int BLOCK_SZ = 64;
4639         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
4640         const int AB_BITS = MAX(10, (int)INTER_BITS);
4641         const int AB_SCALE = 1 << AB_BITS;
4642         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
4643     #if CV_SSE2
4644         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
4645     #endif
4646
4647         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
4648         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
4649         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
4650
4651         for( y = range.start; y < range.end; y += bh0 )
4652         {
4653             for( x = 0; x < dst.cols; x += bw0 )
4654             {
4655                 int bw = std::min( bw0, dst.cols - x);
4656                 int bh = std::min( bh0, range.end - y);
4657
4658                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4659                 Mat dpart(dst, Rect(x, y, bw, bh));
4660
4661                 for( y1 = 0; y1 < bh; y1++ )
4662                 {
4663                     short* xy = XY + y1*bw*2;
4664                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
4665                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
4666
4667                     if( interpolation == INTER_NEAREST )
4668                     {
4669                         x1 = 0;
4670                         #if CV_NEON
4671                         int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
4672                         for( ; x1 <= bw - 4; x1 += 4 )
4673                         {
4674                             int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
4675                             int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
4676
4677                             vst1q_s16(xy + (x1 << 1), vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS)),
4678                                                                    vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS))));
4679                         }
4680                         #endif
4681                         for( ; x1 < bw; x1++ )
4682                         {
4683                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
4684                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
4685                             xy[x1*2] = saturate_cast<short>(X);
4686                             xy[x1*2+1] = saturate_cast<short>(Y);
4687                         }
4688                     }
4689                     else
4690                     {
4691                         short* alpha = A + y1*bw;
4692                         x1 = 0;
4693                     #if CV_SSE2
4694                         if( useSIMD )
4695                         {
4696                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
4697                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
4698                             for( ; x1 <= bw - 8; x1 += 8 )
4699                             {
4700                                 __m128i tx0, tx1, ty0, ty1;
4701                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
4702                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
4703                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
4704                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
4705
4706                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
4707                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
4708                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
4709                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
4710
4711                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
4712                                                             _mm_and_si128(tx1, fxy_mask));
4713                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
4714                                                             _mm_and_si128(ty1, fxy_mask));
4715                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
4716                                                             _mm_srai_epi32(tx1, INTER_BITS));
4717                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
4718                                                     _mm_srai_epi32(ty1, INTER_BITS));
4719                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
4720
4721                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
4722                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
4723                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
4724                             }
4725                         }
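                    // A NEON counterpart of the SSE2 block above exists but
                    // is commented out below (disabled in this revision).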
4726                     // #elif CV_NEON
4727                     //     int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
4728                     //     for( ; x1 <= bw - 4; x1 += 4 )
4729                     //     {
4730                     //         int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
4731                     //         int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
4732
4733                     //         vst1q_s16(xy + (x1 << 1), vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS)),
4734                     //                                                vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS))));
4735                     //         vst1_s16(alpha + x1, vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y, v_mask), INTER_BITS),
4736                     //                                                  vandq_s32(v_X, v_mask))));
4737                     //     }
4738                     #endif
4739                         for( ; x1 < bw; x1++ )
4740                         {
4741                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
4742                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
4743                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4744                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4745                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4746                                     (X & (INTER_TAB_SIZE-1)));
4747                         }
4748                     }
4749                 }
4750
4751                 if( interpolation == INTER_NEAREST )
4752                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
4753                 else
4754                 {
4755                     Mat _matA(bh, bw, CV_16U, A);
4756                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
4757                 }
4758             }
4759         }
4760     }
4761
4762 private:
4763     Mat src;
4764     Mat dst;
4765     int interpolation, borderType;
4766     Scalar borderValue;
4767     int *adelta, *bdelta;
4768     double *M;
4769 };
4770
4771
4772 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4773 class IPPWarpAffineInvoker :
4774     public ParallelLoopBody
4775 {
4776 public:
4777     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
4778                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
4779         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4780         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
4781     {
4782         *ok = true;
4783     }
4784
4785     virtual void operator() (const Range& range) const
4786     {
4787         IppiSize srcsize = { src.cols, src.rows };
4788         IppiRect srcroi = { 0, 0, src.cols, src.rows };
4789         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
4790         int cnn = src.channels();
4791         if( borderType == BORDER_CONSTANT )
4792         {
4793             IppiSize setSize = { dst.cols, range.end - range.start };
4794             void *dataPointer = dst.ptr(range.start);
4795             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
4796             {
4797                 *ok = false;
4798                 return;
4799             }
4800         }
4801
4802         // Aug 2013: problem in IPP 7.1 and 8.0: the function sometimes returns ippStsCoeffErr
4803         IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
4804                                 (int)dst.step[0], dstroi, coeffs, mode );
4805         if( status < 0)
4806             *ok = false;
4807         else
4808         {
4809             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4810         }
4811     }
4812 private:
4813     Mat &src;
4814     Mat &dst;
4815     int mode;
4816     double (&coeffs)[2][3];
4817     int borderType;
4818     Scalar borderValue;
4819     ippiWarpAffineBackFunc func;
4820     bool *ok;
4821     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
4822 };
4823 #endif
4824
4825 #ifdef HAVE_OPENCL
4826
4827 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
4828
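// Shared OpenCL host code for warpAffine/warpPerspective: builds the kernel
// for the requested interpolation/border mode, inverts the matrix on the
// host when WARP_INVERSE_MAP is absent, and uploads it as CV_32F or CV_64F
// depending on device double support.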
4829 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
4830                               Size dsize, int flags, int borderType, const Scalar& borderValue,
4831                               int op_type)
4832 {
4833     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
4834     const ocl::Device & dev = ocl::Device::getDefault();
4835
4836     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4837     bool doubleSupport = dev.doubleFPConfig() > 0;
4838
4839     int interpolation = flags & INTER_MAX;
4840     if( interpolation == INTER_AREA )
4841         interpolation = INTER_LINEAR;
4842     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
4843
4844     if ( !(borderType == cv::BORDER_CONSTANT &&
4845            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
4846          (!doubleSupport && depth == CV_64F) || cn > 4)
4847         return false;
4848
4849     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
4850     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
4851                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
4852     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
4853
4854     int scalarcn = cn == 3 ? 4 : cn;
4855     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
4856     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
4857     int sctype = CV_MAKETYPE(wdepth, scalarcn);
4858
4859     ocl::Kernel k;
4860     String opts;
4861     if (interpolation == INTER_NEAREST)
4862     {
4863         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
4864                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
4865                       ocl::typeToStr(CV_MAT_DEPTH(type)),
4866                       ocl::typeToStr(sctype), cn, rowsPerWI);
4867     }
4868     else
4869     {
4870         char cvt[2][50];
4871         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
4872                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
4873                       interpolationMap[interpolation], ocl::typeToStr(type),
4874                       ocl::typeToStr(CV_MAT_DEPTH(type)),
4875                       ocl::typeToStr(sctype),
4876                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
4877                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
4878                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
4879                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
4880     }
4881
4882     k.create(kernelName, program, opts);
4883     if (k.empty())
4884         return false;
4885
4886     double borderBuf[] = { 0, 0, 0, 0 };
4887     scalarToRawData(borderValue, borderBuf, sctype);
4888
4889     UMat src = _src.getUMat(), M0;
4890     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4891     UMat dst = _dst.getUMat();
4892
4893     double M[9];
4894     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
4895     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
4896     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
4897                M1.rows == matRows && M1.cols == 3 );
4898     M1.convertTo(matM, matM.type());
4899
4900     if( !(flags & WARP_INVERSE_MAP) )
4901     {
4902         if (op_type == OCL_OP_PERSPECTIVE)
4903             invert(matM, matM);
4904         else
4905         {
4906             double D = M[0]*M[4] - M[1]*M[3];
4907             D = D != 0 ? 1./D : 0;
4908             double A11 = M[4]*D, A22=M[0]*D;
4909             M[0] = A11; M[1] *= -D;
4910             M[3] *= -D; M[4] = A22;
4911             double b1 = -M[0]*M[2] - M[1]*M[5];
4912             double b2 = -M[3]*M[2] - M[4]*M[5];
4913             M[2] = b1; M[5] = b2;
4914         }
4915     }
4916     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
4917
4918     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
4919            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
4920
4921     size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4922     return k.run(2, globalThreads, NULL, false);
4923 }
4924
4925 #endif
4926
4927 }
4928
4929
4930 void cv::warpAffine( InputArray _src, OutputArray _dst,
4931                      InputArray _M0, Size dsize,
4932                      int flags, int borderType, const Scalar& borderValue )
4933 {
4934     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4935                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
4936                                  borderValue, OCL_OP_AFFINE))
4937
4938     Mat src = _src.getMat(), M0 = _M0.getMat();
4939     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4940     Mat dst = _dst.getMat();
4941     CV_Assert( src.cols > 0 && src.rows > 0 );
4942     if( dst.data == src.data )
4943         src = src.clone();
4944
4945     double M[6];
4946     Mat matM(2, 3, CV_64F, M);
4947     int interpolation = flags & INTER_MAX;
4948     if( interpolation == INTER_AREA )
4949         interpolation = INTER_LINEAR;
4950
4951     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
4952     M0.convertTo(matM, matM.type());
4953
4954 #ifdef HAVE_TEGRA_OPTIMIZATION
4955     if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
4956         return;
4957 #endif
4958
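    // remap-based warping uses the inverse (dst->src) transform; unless the
    // caller passed WARP_INVERSE_MAP, invert the 2x3 matrix in closed form:
    // the 2x2 part via adj(A)/det(A), the translation via b' = -A^(-1)*b.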
4959     if( !(flags & WARP_INVERSE_MAP) )
4960     {
4961         double D = M[0]*M[4] - M[1]*M[3];
4962         D = D != 0 ? 1./D : 0;
4963         double A11 = M[4]*D, A22=M[0]*D;
4964         M[0] = A11; M[1] *= -D;
4965         M[3] *= -D; M[4] = A22;
4966         double b1 = -M[0]*M[2] - M[1]*M[5];
4967         double b2 = -M[3]*M[2] - M[4]*M[5];
4968         M[2] = b1; M[5] = b2;
4969     }
4970
4971     int x;
4972     AutoBuffer<int> _abdelta(dst.cols*2);
4973     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
4974     const int AB_BITS = MAX(10, (int)INTER_BITS);
4975     const int AB_SCALE = 1 << AB_BITS;
4976
4977 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4978     CV_IPP_CHECK()
4979     {
4980         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4981         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
4982            ( cn == 1 || cn == 3 || cn == 4 ) &&
4983            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
4984            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
4985         {
4986             ippiWarpAffineBackFunc ippFunc = 0;
4987             if ((flags & WARP_INVERSE_MAP) != 0)
4988             {
4989                 ippFunc =
4990                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
4991                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
4992                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
4993                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
4994                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
4995                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
4996                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
4997                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
4998                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
4999                 0;
5000             }
5001             else
5002             {
5003                 ippFunc =
5004                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
5005                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
5006                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
5007                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
5008                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
5009                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
5010                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
5011                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
5012                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
5013                 0;
5014             }
5015             int mode =
5016             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5017             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5018             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
5019             0;
5020             CV_Assert(mode && ippFunc);
5021
5022             double coeffs[2][3];
5023             for( int i = 0; i < 2; i++ )
5024                 for( int j = 0; j < 3; j++ )
5025                     coeffs[i][j] = matM.at<double>(i, j);
5026
5027             bool ok;
5028             Range range(0, dst.rows);
5029             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5030             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5031             if( ok )
5032             {
5033                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5034                 return;
5035             }
5036             setIppErrorStatus();
5037         }
5038     }
5039 #endif
5040
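    // Precompute the per-column contributions M[0]*x and M[3]*x in AB_BITS
    // fixed point; WarpAffineInvoker adds the per-row terms X0/Y0.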
5041     for( x = 0; x < dst.cols; x++ )
5042     {
5043         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
5044         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
5045     }
5046
5047     Range range(0, dst.rows);
5048     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
5049                               borderValue, adelta, bdelta, M);
5050     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5051 }
5052
5053
5054 namespace cv
5055 {
5056
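// WarpPerspectiveInvoker evaluates the full projective map per pixel:
// X = (M[0]*x + M[1]*y + M[2])/W, Y = (M[3]*x + M[4]*y + M[5])/W with
// W = M[6]*x + M[7]*y + M[8]. The per-pixel division rules out the purely
// incremental fixed-point scheme of the affine case, so each row keeps
// double-precision running terms and clamps the results to the int range.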
5057 class WarpPerspectiveInvoker :
5058     public ParallelLoopBody
5059 {
5060 public:
5061     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
5062                            int _borderType, const Scalar &_borderValue) :
5063         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
5064         borderType(_borderType), borderValue(_borderValue)
5065     {
5066     }
5067
5068     virtual void operator() (const Range& range) const
5069     {
5070         const int BLOCK_SZ = 32;
5071         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5072         int x, y, x1, y1, width = dst.cols, height = dst.rows;
5073
5074         int bh0 = std::min(BLOCK_SZ/2, height);
5075         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
5076         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
5077
5078         for( y = range.start; y < range.end; y += bh0 )
5079         {
5080             for( x = 0; x < width; x += bw0 )
5081             {
5082                 int bw = std::min( bw0, width - x);
5083                 int bh = std::min( bh0, range.end - y); // block height, clipped at the end of the assigned row range
5084
5085                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
5086                 Mat dpart(dst, Rect(x, y, bw, bh));
5087
5088                 for( y1 = 0; y1 < bh; y1++ )
5089                 {
5090                     short* xy = XY + y1*bw*2;
5091                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
5092                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
5093                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
5094
5095                     if( interpolation == INTER_NEAREST )
5096                         for( x1 = 0; x1 < bw; x1++ )
5097                         {
5098                             double W = W0 + M[6]*x1;
5099                             W = W ? 1./W : 0;
5100                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
5101                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
5102                             int X = saturate_cast<int>(fX);
5103                             int Y = saturate_cast<int>(fY);
5104
5105                             xy[x1*2] = saturate_cast<short>(X);
5106                             xy[x1*2+1] = saturate_cast<short>(Y);
5107                         }
5108                     else
5109                     {
5110                         short* alpha = A + y1*bw;
5111                         for( x1 = 0; x1 < bw; x1++ )
5112                         {
5113                             double W = W0 + M[6]*x1;
5114                             W = W ? INTER_TAB_SIZE/W : 0;
5115                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
5116                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
5117                             int X = saturate_cast<int>(fX);
5118                             int Y = saturate_cast<int>(fY);
5119
5120                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
5121                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
5122                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
5123                                                 (X & (INTER_TAB_SIZE-1)));
5124                         }
5125                     }
5126                 }
5127
5128                 if( interpolation == INTER_NEAREST )
5129                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
5130                 else
5131                 {
5132                     Mat _matA(bh, bw, CV_16U, A);
5133                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
5134                 }
5135             }
5136         }
5137     }
5138
5139 private:
5140     Mat src;
5141     Mat dst;
5142     double* M;
5143     int interpolation, borderType;
5144     Scalar borderValue;
5145 };
5146
5147
5148 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
5149 class IPPWarpPerspectiveInvoker :
5150     public ParallelLoopBody
5151 {
5152 public:
5153     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
5154                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
5155         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
5156         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
5157     {
5158         *ok = true;
5159     }
5160
5161     virtual void operator() (const Range& range) const
5162     {
5163         IppiSize srcsize = {src.cols, src.rows};
5164         IppiRect srcroi = {0, 0, src.cols, src.rows};
5165         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
5166         int cnn = src.channels();
5167
5168         if( borderType == BORDER_CONSTANT )
5169         {
5170             IppiSize setSize = {dst.cols, range.end - range.start};
5171             void *dataPointer = dst.ptr(range.start);
5172             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
5173             {
5174                 *ok = false;
5175                 return;
5176             }
5177         }
5178
5179         IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
5180         if (status != ippStsNoErr)
5181             *ok = false;
5182         else
5183         {
5184             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5185         }
5186     }
5187 private:
5188     Mat &src;
5189     Mat &dst;
5190     int mode;
5191     double (&coeffs)[3][3];
5192     int borderType;
5193     const Scalar borderValue;
5194     ippiWarpPerspectiveFunc func;
5195     bool *ok;
5196
5197     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
5198 };
5199 #endif
5200 }
5201
5202 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
5203                           Size dsize, int flags, int borderType, const Scalar& borderValue )
5204 {
5205     CV_Assert( _src.total() > 0 );
5206
5207     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
5208                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
5209                               OCL_OP_PERSPECTIVE))
5210
5211     Mat src = _src.getMat(), M0 = _M0.getMat();
5212     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5213     Mat dst = _dst.getMat();
5214
5215     if( dst.data == src.data )
5216         src = src.clone();
5217
5218     double M[9];
5219     Mat matM(3, 3, CV_64F, M);
5220     int interpolation = flags & INTER_MAX;
5221     if( interpolation == INTER_AREA )
5222         interpolation = INTER_LINEAR;
5223
5224     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
5225     M0.convertTo(matM, matM.type());
5226
5227 #ifdef HAVE_TEGRA_OPTIMIZATION
5228     if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
5229         return;
5230 #endif
5231
5232
5233 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
5234     CV_IPP_CHECK()
5235     {
5236         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5237         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
5238            (cn == 1 || cn == 3 || cn == 4) &&
5239            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
5240            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
5241         {
5242             ippiWarpPerspectiveFunc ippFunc = 0;
5243             if ((flags & WARP_INVERSE_MAP) != 0)
5244             {
5245                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
5246                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
5247                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
5248                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
5249                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
5250                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
5251                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
5252                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
5253                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
5254             }
5255             else
5256             {
5257                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
5258                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
5259                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
5260                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
5261                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
5262                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
5263                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
5264                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
5265                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
5266             }
5267             int mode =
5268             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5269             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5270             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
5271             CV_Assert(mode && ippFunc);
5272
5273             double coeffs[3][3];
5274             for( int i = 0; i < 3; i++ )
5275                 for( int j = 0; j < 3; j++ )
5276                     coeffs[i][j] = matM.at<double>(i, j);
5277
5278             bool ok;
5279             Range range(0, dst.rows);
5280             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5281             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5282             if( ok )
5283             {
5284                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5285                 return;
5286             }
5287             setIppErrorStatus();
5288         }
5289     }
5290 #endif
5291
5292     if( !(flags & WARP_INVERSE_MAP) )
5293         invert(matM, matM);
5294
5295     Range range(0, dst.rows);
5296     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
5297     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5298 }
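/* Minimal usage sketch (illustrative; the file name and corner coordinates are
   placeholders). Note that without WARP_INVERSE_MAP the matrix maps source to
   destination and is inverted internally above:

       cv::Mat src = cv::imread("input.png"), dst;
       cv::Point2f from[4] = { cv::Point2f(0.f, 0.f),
                               cv::Point2f((float)src.cols, 0.f),
                               cv::Point2f((float)src.cols, (float)src.rows),
                               cv::Point2f(0.f, (float)src.rows) };
       cv::Point2f to[4] = { cv::Point2f(30.f, 50.f),
                             cv::Point2f(src.cols - 50.f, 20.f),
                             cv::Point2f(src.cols - 20.f, src.rows - 40.f),
                             cv::Point2f(10.f, src.rows - 10.f) };
       cv::Mat H = cv::getPerspectiveTransform(from, to);
       cv::warpPerspective(src, dst, H, src.size(), cv::INTER_LINEAR,
                           cv::BORDER_CONSTANT, cv::Scalar());
*/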
5299
5300
5301 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
5302 {
5303     angle *= CV_PI/180;
5304     double alpha = cos(angle)*scale;
5305     double beta = sin(angle)*scale;
5306
5307     Mat M(2, 3, CV_64F);
5308     double* m = M.ptr<double>();
5309
5310     m[0] = alpha;
5311     m[1] = beta;
5312     m[2] = (1-alpha)*center.x - beta*center.y;
5313     m[3] = -beta;
5314     m[4] = alpha;
5315     m[5] = beta*center.x + (1-alpha)*center.y;
5316
5317     return M;
5318 }
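/* The matrix built above is

       [  alpha   beta   (1-alpha)*center.x - beta*center.y ]
       [ -beta    alpha   beta*center.x + (1-alpha)*center.y ]

   with alpha = scale*cos(angle), beta = scale*sin(angle); the translation terms
   are chosen so that the rotation center maps to itself. Illustrative use
   (angle in degrees, counter-clockwise; img/rotated are placeholders):

       cv::Mat R = cv::getRotationMatrix2D(cv::Point2f(img.cols/2.f, img.rows/2.f),
                                           30.0, 1.0);
       cv::warpAffine(img, rotated, R, img.size());
*/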
5319
5320 /* Calculates coefficients of perspective transformation
5321  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
5322  *
5323  *      c00*xi + c01*yi + c02
5324  * ui = ---------------------
5325  *      c20*xi + c21*yi + c22
5326  *
5327  *      c10*xi + c11*yi + c12
5328  * vi = ---------------------
5329  *      c20*xi + c21*yi + c22
5330  *
5331  * Coefficients are calculated by solving linear system:
5332  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
5333  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
5334  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
5335  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
5336  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
5337  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
5338  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
5339  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
5340  *
5341  * where:
5342  *   cij - matrix coefficients, c22 = 1
5343  */
5344 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
5345 {
5346     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
5347     double a[8][8], b[8];
5348     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
5349
5350     for( int i = 0; i < 4; ++i )
5351     {
5352         a[i][0] = a[i+4][3] = src[i].x;
5353         a[i][1] = a[i+4][4] = src[i].y;
5354         a[i][2] = a[i+4][5] = 1;
5355         a[i][3] = a[i][4] = a[i][5] =
5356         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
5357         a[i][6] = -src[i].x*dst[i].x;
5358         a[i][7] = -src[i].y*dst[i].x;
5359         a[i+4][6] = -src[i].x*dst[i].y;
5360         a[i+4][7] = -src[i].y*dst[i].y;
5361         b[i] = dst[i].x;
5362         b[i+4] = dst[i].y;
5363     }
5364
5365     solve( A, B, X, DECOMP_SVD );
5366     M.ptr<double>()[8] = 1.;
5367
5368     return M;
5369 }
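/* Quick sanity check of the solver above (illustrative): mapping the unit square
   to itself reproduces the identity homography up to floating-point error, since
   DECOMP_SVD yields the exact solution when the 8x8 system is well-posed:

       cv::Point2f q[4] = { cv::Point2f(0,0), cv::Point2f(1,0),
                            cv::Point2f(1,1), cv::Point2f(0,1) };
       cv::Mat I = cv::getPerspectiveTransform(q, q);  // ~ cv::Mat::eye(3,3,CV_64F)
*/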
5370
5371 /* Calculates coefficients of affine transformation
5372  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
5373  *
5374  * ui = c00*xi + c01*yi + c02
5375  *
5376  * vi = c10*xi + c11*yi + c12
5377  *
5378  * Coefficients are calculated by solving linear system:
5379  * / x0 y0  1  0  0  0 \ /c00\ /u0\
5380  * | x1 y1  1  0  0  0 | |c01| |u1|
5381  * | x2 y2  1  0  0  0 | |c02| |u2|
5382  * |  0  0  0 x0 y0  1 | |c10| |v0|
5383  * |  0  0  0 x1 y1  1 | |c11| |v1|
5384  * \  0  0  0 x2 y2  1 / \c12/ \v2/
5385  *
5386  * where:
5387  *   cij - matrix coefficients
5388  */
5389
5390 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
5391 {
5392     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
5393     double a[6*6], b[6];
5394     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
5395
5396     for( int i = 0; i < 3; i++ )
5397     {
5398         int j = i*12;
5399         int k = i*12+6;
5400         a[j] = a[k+3] = src[i].x;
5401         a[j+1] = a[k+4] = src[i].y;
5402         a[j+2] = a[k+5] = 1;
5403         a[j+3] = a[j+4] = a[j+5] = 0;
5404         a[k] = a[k+1] = a[k+2] = 0;
5405         b[i*2] = dst[i].x;
5406         b[i*2+1] = dst[i].y;
5407     }
5408
5409     solve( A, B, X );
5410     return M;
5411 }
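/* Illustrative usage (coordinates are placeholders): three non-collinear point
   pairs fully determine the 2x3 matrix solved for above:

       cv::Point2f s[3] = { cv::Point2f(0,0), cv::Point2f(1,0), cv::Point2f(0,1) };
       cv::Point2f d[3] = { cv::Point2f(10,10), cv::Point2f(30,12), cv::Point2f(8,40) };
       cv::Mat A = cv::getAffineTransform(s, d);   // 2x3, CV_64F
       cv::warpAffine(img, out, A, img.size());
*/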
5412
5413 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
5414 {
5415     Mat matM = _matM.getMat();
5416     CV_Assert(matM.rows == 2 && matM.cols == 3);
5417     __iM.create(2, 3, matM.type());
5418     Mat _iM = __iM.getMat();
5419
5420     if( matM.type() == CV_32F )
5421     {
5422         const float* M = matM.ptr<float>();
5423         float* iM = _iM.ptr<float>();
5424         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5425
5426         double D = M[0]*M[step+1] - M[1]*M[step];
5427         D = D != 0 ? 1./D : 0;
5428         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5429         double b1 = -A11*M[2] - A12*M[step+2];
5430         double b2 = -A21*M[2] - A22*M[step+2];
5431
5432         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
5433         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
5434     }
5435     else if( matM.type() == CV_64F )
5436     {
5437         const double* M = matM.ptr<double>();
5438         double* iM = _iM.ptr<double>();
5439         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5440
5441         double D = M[0]*M[step+1] - M[1]*M[step];
5442         D = D != 0 ? 1./D : 0;
5443         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5444         double b1 = -A11*M[2] - A12*M[step+2];
5445         double b2 = -A21*M[2] - A22*M[step+2];
5446
5447         iM[0] = A11; iM[1] = A12; iM[2] = b1;
5448         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
5449     }
5450     else
5451         CV_Error( CV_StsUnsupportedFormat, "The affine transformation matrix must be CV_32F or CV_64F" );
5452 }
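/* Both branches above compute the same thing in matrix form: for M = [A | b]
   with A = [a00 a01; a10 a11], the inverse transform is

       inv(M) = [ inv(A) | -inv(A)*b ],  inv(A) = (1/det A) * [ a11 -a01; -a10 a00 ]

   and a singular A (det == 0) yields the all-zero matrix instead of failing.
   Illustrative round trip:

       cv::Mat A = cv::getRotationMatrix2D(cv::Point2f(100.f, 100.f), 45.0, 2.0);
       cv::Mat Ainv;
       cv::invertAffineTransform(A, Ainv);   // warping by Ainv undoes A
*/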
5453
5454 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
5455 {
5456     Mat src = _src.getMat(), dst = _dst.getMat();
5457     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
5458     return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5459 }
5460
5461 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
5462 {
5463     Mat src = _src.getMat(), dst = _dst.getMat();
5464     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
5465     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5466 }
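/* The InputArray overloads accept any contiguous array of 4 (respectively 3)
   CV_32FC2 points; checkVector() enforces both the count and the element type.
   Illustrative call with a std::vector:

       std::vector<cv::Point2f> s(3), d(3);
       // fill s and d with corresponding points...
       cv::Mat A = cv::getAffineTransform(s, d);
*/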
5467
5468 CV_IMPL void
5469 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
5470 {
5471     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5472     CV_Assert( src.type() == dst.type() );
5473     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
5474         (double)dst.rows/src.rows, method );
5475 }
5476
5477
5478 CV_IMPL void
5479 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5480               int flags, CvScalar fillval )
5481 {
5482     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5483     cv::Mat matrix = cv::cvarrToMat(marr);
5484     CV_Assert( src.type() == dst.type() );
5485     cv::warpAffine( src, dst, matrix, dst.size(), flags,
5486         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5487         fillval );
5488 }
5489
5490 CV_IMPL void
5491 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5492                    int flags, CvScalar fillval )
5493 {
5494     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5495     cv::Mat matrix = cv::cvarrToMat(marr);
5496     CV_Assert( src.type() == dst.type() );
5497     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
5498         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5499         fillval );
5500 }
5501
5502 CV_IMPL void
5503 cvRemap( const CvArr* srcarr, CvArr* dstarr,
5504          const CvArr* _mapx, const CvArr* _mapy,
5505          int flags, CvScalar fillval )
5506 {
5507     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
5508     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
5509     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
5510     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
5511         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5512         fillval );
5513     CV_Assert( dst0.data == dst.data );
5514 }
5515
5516
5517 CV_IMPL CvMat*
5518 cv2DRotationMatrix( CvPoint2D32f center, double angle,
5519                     double scale, CvMat* matrix )
5520 {
5521     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
5522     CV_Assert( M.size() == M0.size() );
5523     M.convertTo(M0, M0.type());
5524     return matrix;
5525 }
5526
5527
5528 CV_IMPL CvMat*
5529 cvGetPerspectiveTransform( const CvPoint2D32f* src,
5530                           const CvPoint2D32f* dst,
5531                           CvMat* matrix )
5532 {
5533     cv::Mat M0 = cv::cvarrToMat(matrix),
5534         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5535     CV_Assert( M.size() == M0.size() );
5536     M.convertTo(M0, M0.type());
5537     return matrix;
5538 }
5539
5540
5541 CV_IMPL CvMat*
5542 cvGetAffineTransform( const CvPoint2D32f* src,
5543                           const CvPoint2D32f* dst,
5544                           CvMat* matrix )
5545 {
5546     cv::Mat M0 = cv::cvarrToMat(matrix),
5547         M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5548     CV_Assert( M.size() == M0.size() );
5549     M.convertTo(M0, M0.type());
5550     return matrix;
5551 }
5552
5553
5554 CV_IMPL void
5555 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
5556 {
5557     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
5558     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
5559
5560     if( arr2 )
5561         map2 = cv::cvarrToMat(arr2);
5562     if( dstarr2 )
5563     {
5564         dstmap2 = cv::cvarrToMat(dstarr2);
5565         if( dstmap2.type() == CV_16SC1 )
5566             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
5567     }
5568
5569     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
5570 }
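/* Note on the CV_16SC1 -> CV_16UC1 aliasing above: the fixed-point map format
   stores integer source coordinates in a CV_16SC2 map and interpolation-table
   indices (0 .. INTER_TAB_SIZE*INTER_TAB_SIZE - 1) in a CV_16UC1 companion map.
   Legacy callers often allocate that second map as CV_16SC1, so it is
   re-headered in place rather than copied. Equivalent C++ call (illustrative):

       cv::Mat mapxy, mapidx;
       cv::convertMaps(mapx, mapy, mapxy, mapidx, CV_16SC2, false);
       cv::remap(src, dst, mapxy, mapidx, cv::INTER_LINEAR);
*/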
5571
5572 /****************************************************************************************\
5573 *                                   Log-Polar Transform                                  *
5574 \****************************************************************************************/
5575
5576 /* For now this is done via remap(); a more correct implementation would use
5577    some super-sampling technique outside of the "fovea" circle */
5578 CV_IMPL void
5579 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
5580             CvPoint2D32f center, double M, int flags )
5581 {
5582     cv::Ptr<CvMat> mapx, mapy;
5583
5584     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
5585     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
5586     CvSize ssize, dsize;
5587
5588     if( !CV_ARE_TYPES_EQ( src, dst ))
5589         CV_Error( CV_StsUnmatchedFormats, "Source and destination must have the same data type" );
5590
5591     if( M <= 0 )
5592         CV_Error( CV_StsOutOfRange, "M should be >0" );
5593
5594     ssize = cvGetMatSize(src);
5595     dsize = cvGetMatSize(dst);
5596
5597     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5598     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5599
5600     if( !(flags & CV_WARP_INVERSE_MAP) )
5601     {
5602         int phi, rho;
5603         cv::AutoBuffer<double> _exp_tab(dsize.width);
5604         double* exp_tab = _exp_tab;
5605
5606         for( rho = 0; rho < dsize.width; rho++ )
5607             exp_tab[rho] = std::exp(rho/M);
5608
5609         for( phi = 0; phi < dsize.height; phi++ )
5610         {
5611             double cp = cos(phi*2*CV_PI/dsize.height);
5612             double sp = sin(phi*2*CV_PI/dsize.height);
5613             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5614             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5615
5616             for( rho = 0; rho < dsize.width; rho++ )
5617             {
5618                 double r = exp_tab[rho];
5619                 double x = r*cp + center.x;
5620                 double y = r*sp + center.y;
5621
5622                 mx[rho] = (float)x;
5623                 my[rho] = (float)y;
5624             }
5625         }
5626     }
5627     else
5628     {
5629         int x, y;
5630         CvMat bufx, bufy, bufp, bufa;
5631         double ascale = ssize.height/(2*CV_PI);
5632         cv::AutoBuffer<float> _buf(4*dsize.width);
5633         float* buf = _buf;
5634
5635         bufx = cvMat( 1, dsize.width, CV_32F, buf );
5636         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5637         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5638         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5639
5640         for( x = 0; x < dsize.width; x++ )
5641             bufx.data.fl[x] = (float)x - center.x;
5642
5643         for( y = 0; y < dsize.height; y++ )
5644         {
5645             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5646             float* my = (float*)(mapy->data.ptr + y*mapy->step);
5647
5648             for( x = 0; x < dsize.width; x++ )
5649                 bufy.data.fl[x] = (float)y - center.y;
5650
5651 #if 1
5652             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
5653
5654             for( x = 0; x < dsize.width; x++ )
5655                 bufp.data.fl[x] += 1.f;
5656
5657             cvLog( &bufp, &bufp );
5658
5659             for( x = 0; x < dsize.width; x++ )
5660             {
5661                 double rho = bufp.data.fl[x]*M;
5662                 double phi = bufa.data.fl[x]*ascale;
5663
5664                 mx[x] = (float)rho;
5665                 my[x] = (float)phi;
5666             }
5667 #else
5668             for( x = 0; x < dsize.width; x++ )
5669             {
5670                 double xx = bufx.data.fl[x];
5671                 double yy = bufy.data.fl[x];
5672
5673                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
5674                 double a = atan2(yy,xx);
5675                 if( a < 0 )
5676                     a = 2*CV_PI + a;
5677                 a *= ascale;
5678
5679                 mx[x] = (float)p;
5680                 my[x] = (float)a;
5681             }
5682 #endif
5683         }
5684     }
5685
5686     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5687 }
5688
5689 void cv::logPolar( InputArray _src, OutputArray _dst,
5690                    Point2f center, double M, int flags )
5691 {
5692     Mat src = _src.getMat();
5693     _dst.create( src.size(), src.type() );
5694     CvMat c_src = src, c_dst = _dst.getMat();
5695     cvLogPolar( &c_src, &c_dst, center, M, flags );
5696 }
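/* Forward mapping implemented above: destination pixel (rho, phi) samples the
   source at

       x = exp(rho/M)*cos(2*CV_PI*phi/H) + center.x
       y = exp(rho/M)*sin(2*CV_PI*phi/H) + center.y

   where H is the destination height. Illustrative use, with M chosen so the
   image radius fits the destination width (img and the value 40.0 are
   placeholders):

       cv::Mat polar;
       cv::logPolar(img, polar, cv::Point2f(img.cols/2.f, img.rows/2.f), 40.0,
                    cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS);
*/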
5697
5698 /****************************************************************************************
5699                                    Linear-Polar Transform
5700   J.L. Blanco, Apr 2009
5701  ****************************************************************************************/
5702 CV_IMPL
5703 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
5704             CvPoint2D32f center, double maxRadius, int flags )
5705 {
5706     cv::Ptr<CvMat> mapx, mapy;
5707
5708     CvMat srcstub, *src;
5709     CvMat dststub, *dst;
5710     CvSize ssize, dsize;
5711
5712     src = cvGetMat( srcarr, &srcstub );
5713     dst = cvGetMat( dstarr, &dststub );
5714
5715     if( !CV_ARE_TYPES_EQ( src, dst ))
5716         CV_Error( CV_StsUnmatchedFormats, "Source and destination must have the same data type" );
5717
5718     ssize.width = src->cols;
5719     ssize.height = src->rows;
5720     dsize.width = dst->cols;
5721     dsize.height = dst->rows;
5722
5723     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5724     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5725
5726     if( !(flags & CV_WARP_INVERSE_MAP) )
5727     {
5728         int phi, rho;
5729
5730         for( phi = 0; phi < dsize.height; phi++ )
5731         {
5732             double cp = cos(phi*2*CV_PI/dsize.height);
5733             double sp = sin(phi*2*CV_PI/dsize.height);
5734             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5735             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5736
5737             for( rho = 0; rho < dsize.width; rho++ )
5738             {
5739                 double r = maxRadius*(rho+1)/dsize.width;
5740                 double x = r*cp + center.x;
5741                 double y = r*sp + center.y;
5742
5743                 mx[rho] = (float)x;
5744                 my[rho] = (float)y;
5745             }
5746         }
5747     }
5748     else
5749     {
5750         int x, y;
5751         CvMat bufx, bufy, bufp, bufa;
5752         const double ascale = ssize.height/(2*CV_PI);
5753         const double pscale = ssize.width/maxRadius;
5754
5755         cv::AutoBuffer<float> _buf(4*dsize.width);
5756         float* buf = _buf;
5757
5758         bufx = cvMat( 1, dsize.width, CV_32F, buf );
5759         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5760         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5761         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5762
5763         for( x = 0; x < dsize.width; x++ )
5764             bufx.data.fl[x] = (float)x - center.x;
5765
5766         for( y = 0; y < dsize.height; y++ )
5767         {
5768             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5769             float* my = (float*)(mapy->data.ptr + y*mapy->step);
5770
5771             for( x = 0; x < dsize.width; x++ )
5772                 bufy.data.fl[x] = (float)y - center.y;
5773
5774             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
5775
5776             for( x = 0; x < dsize.width; x++ )
5777                 bufp.data.fl[x] += 1.f;
5778
5779             for( x = 0; x < dsize.width; x++ )
5780             {
5781                 double rho = bufp.data.fl[x]*pscale;
5782                 double phi = bufa.data.fl[x]*ascale;
5783                 mx[x] = (float)rho;
5784                 my[x] = (float)phi;
5785             }
5786         }
5787     }
5788
5789     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5790 }
5791
5792 void cv::linearPolar( InputArray _src, OutputArray _dst,
5793                       Point2f center, double maxRadius, int flags )
5794 {
5795     Mat src = _src.getMat();
5796     _dst.create( src.size(), src.type() );
5797     CvMat c_src = src, c_dst = _dst.getMat();
5798     cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
5799 }
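/* Forward mapping implemented above: destination pixel (rho, phi) samples the
   source at radius r = maxRadius*(rho+1)/W (W = destination width) and angle
   2*CV_PI*phi/H (H = destination height). Illustrative use (img is a
   placeholder):

       cv::Mat polar;
       cv::linearPolar(img, polar,
                       cv::Point2f(img.cols/2.f, img.rows/2.f),
                       std::min(img.cols, img.rows)/2.0,
                       cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS);
*/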
5800
5801 /* End of file. */