// more optimization for warpAffine and warpPerspective
// [platform/upstream/opencv.git] / modules / imgproc / src / imgwarp.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"

using namespace cv;

namespace cv
{
#if IPP_VERSION_X100 >= 710
    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
    typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
    typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
    typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

    template <int channels, typename Type>
    bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    {
        CV_INSTRUMENT_REGION_IPP()

        Type values[channels];
        for( int i = 0; i < channels; i++ )
            values[i] = saturate_cast<Type>(value[i]);
        return func(values, dataPointer, step, size) >= 0;
    }

    static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
    {
        CV_INSTRUMENT_REGION_IPP()

        if( channels == 1 )
        {
            switch( depth )
            {
            case CV_8U:
                return CV_INSTRUMENT_FUN_IPP(ippiSet_8u_C1R,(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size)) >= 0;
            case CV_16U:
                return CV_INSTRUMENT_FUN_IPP(ippiSet_16u_C1R,(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size)) >= 0;
            case CV_32F:
                return CV_INSTRUMENT_FUN_IPP(ippiSet_32f_C1R,(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size)) >= 0;
            }
        }
        else
        {
            if( channels == 3 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
                case CV_16U:
                    return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
                case CV_32F:
                    return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
                }
            }
            else if( channels == 4 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
                case CV_16U:
                    return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
                case CV_32F:
                    return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
                }
            }
        }
        return false;
    }
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
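
// The tables above are indexed by the fractional part of the mapped source
// coordinate, quantized into INTER_TAB_SIZE bins per axis (INTER_TAB_SIZE2 =
// INTER_TAB_SIZE*INTER_TAB_SIZE entries for a 2D offset; both constants come
// from the imgproc headers). Each entry caches the full kernel weight set for
// one sub-pixel offset, so the per-pixel work in remap/warp reduces to a
// table lookup plus a small dot product over 2x2, 4x4 or 8x8 taps.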

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
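
// Worked example for interpolateCubic (A = -0.75 parameterizes the bicubic
// convolution kernel; coeffs[i] is that kernel evaluated at the tap
// distances 1+x, x, 1-x, 2-x):
//   x = 0.0  ->  coeffs = { 0, 1, 0, 0 }   (exactly on a sample)
//   x = 0.5  ->  coeffs = { -0.09375, 0.59375, 0.59375, -0.09375 }
// The four weights always sum to 1, which the coeffs[3] line enforces
// explicitly instead of evaluating the kernel a fourth time.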

static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}
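
// interpolateLanczos4 evaluates the 8-tap Lanczos kernel
//     L(d) = sinc(d) * sinc(d/4),   d = x + 3 - i,   i = 0..7
// without calling sin() eight times. With y_i = -(x+3-i)*CV_PI/4, the
// angle-addition identity sin(y_i) = s0*cos(i*pi/4) + c0*sin(i*pi/4) folds
// everything into the single sin/cos pair at y0 plus the precomputed cs[][]
// table; the alternating signs in cs[][] absorb the sin(4*y_i) factor, which
// is the same for every tap up to sign. Any remaining constant factor cancels
// in the final pass that rescales the coefficients to sum to exactly 1.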

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}
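
// The fixed-point branch above nudges the rounded integer weights so every
// kernel sums to exactly INTER_REMAP_COEF_SCALE (1 << 15): the rounding
// residual `diff` is subtracted from the smallest central tap, or added to
// the largest one when the sum falls short. This guarantees that
// interpolating a constant image reproduces it bit-exactly. A minimal sanity
// check (sketch, assuming the linear table has been initialized):
//
//     const short* itab = (const short*)initInterTab2D(INTER_LINEAR, true);
//     for( int t = 0; t < INTER_TAB_SIZE2; t++ )
//     {
//         int s = 0;
//         for( int k = 0; k < 4; k++ )   // 2x2 taps for INTER_LINEAR
//             s += itab[t*4 + k];
//         CV_Assert( s == INTER_REMAP_COEF_SCALE );
//     }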

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif
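
// The dummy static initializer above forces all six interpolation tables to
// be built once at library load time rather than lazily on the first
// remap/resize call, where the unsynchronized `inittab[]` check could race
// between threads. On MinGW the eager initialization is skipped (presumably
// due to static-initializer issues with that toolchain) and the tables are
// built on demand inside initInterTab2D.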

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
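
// FixedPtCast is the usual round-to-nearest downshift: adding
// DELTA = 1 << (bits-1) before the shift rounds instead of truncating.
// E.g. with bits = 15 (INTER_REMAP_COEF_BITS) an accumulated value of
// 3*32768 + 20000 = 118304, i.e. ~3.61 in real terms, maps to
// saturate_cast<uchar>((118304 + 16384) >> 15) = 4 rather than the
// truncated 3.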

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
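
// resizeNN precomputes the source byte offset of every destination column in
// x_ofs[] (already scaled by the pixel size), so each output row needs just
// one cvFloor per row plus table-driven copies; the switch in the invoker
// picks the widest copy for the given pixel size, and the row range is
// parallelized with a ~64K-pixel granularity hint. Usage sketch via the
// public cv::resize entry point, which dispatches here for INTER_NEAREST:
//
//     cv::Mat src = cv::imread("input.png"), dst;
//     cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_NEAREST);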


struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};
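
// Fixed-point bookkeeping in the kernel above: the incoming int32 rows carry
// 11 fractional bits (INTER_RESIZE_COEF_BITS) from the horizontal pass and
// the short betas carry another 11, so a full product needs a 22-bit
// downshift. The kernel splits it as: >>4 while narrowing to int16
// (255 * 2^7 still fits a short), an implicit >>16 inside _mm_mulhi_epi16,
// and a final rounding (+2) >> 2 -- 4 + 16 + 2 = 22. The returned x tells
// the scalar fallback in the caller where to resume.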


template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};
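
// The linear kernels above (and the cubic 32s8u one below) dispatch on
// alignment once per call: when both source rows are 16-byte aligned they
// use _mm_load_si128/_mm_load_ps, otherwise the unaligned forms, with
// identical arithmetic in both branches. Stores always use the unaligned
// forms, since the destination row's alignment is not checked.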


struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};


template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;

#endif
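
// _mm_packus_epi32 (pack int32 -> uint16 with unsigned saturation) only
// exists from SSE4.1 on, so the 16u Lanczos kernel above falls back to
// VResizeNoVec on plain SSE2 builds. The 16s variant below needs only
// _mm_packs_epi32 and is compiled unconditionally inside this CV_SSE2
// section.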

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};


struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            _mm_storeu_ps(dst + x, v_dst);
        }

        return x;
    }
};


#elif CV_NEON

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};
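
// NEON counterpart of the SSE2 fixed-point trick above: vqdmulhq_s16 is a
// *doubling* high multiply, (a*b*2) >> 16, so each product gets one extra
// right shift (the vshrq_n_s16(..., 1)) to match the plain _mm_mulhi_epi16
// semantics before the shared (+2) >> 2 rounding step.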

struct VResizeLinearVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
        }

        return x;
    }
};

typedef VResizeNoVec VResizeCubicVec_32s8u;

struct VResizeCubicVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                       v_b1, vld1q_f32(S1 + x)),
                                                                       v_b2, vld1q_f32(S2 + x)),
                                                                       v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                          v_b1, vld1q_f32(S1 + x + 4)),
                                                                          v_b2, vld1q_f32(S2 + x + 4)),
                                                                          v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1351                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
1352                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
1353                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
1354
1355         for( ; x <= width - 4; x += 4 )
1356         {
1357             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1358                                                                          v_b1, vld1q_f32(S1 + x)),
1359                                                                          v_b2, vld1q_f32(S2 + x)),
1360                                                                          v_b3, vld1q_f32(S3 + x));
1361             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
1362                                                                          v_b5, vld1q_f32(S5 + x)),
1363                                                                          v_b6, vld1q_f32(S6 + x)),
1364                                                                          v_b7, vld1q_f32(S7 + x));
1365             vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
1366         }
1367
1368         return x;
1369     }
1370 };
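
// Note that the eight Lanczos taps are accumulated as two independent 4-tap
// chains (v_dst0 over S0..S3, v_dst1 over S4..S7) that are added once at the
// end: the chains carry no data dependency on each other, so they can overlap
// in the pipeline instead of forming one eight-deep multiply-accumulate
// sequence.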
1371
1372 #else
1373
1374 typedef VResizeNoVec VResizeLinearVec_32s8u;
1375 typedef VResizeNoVec VResizeLinearVec_32f16u;
1376 typedef VResizeNoVec VResizeLinearVec_32f16s;
1377 typedef VResizeNoVec VResizeLinearVec_32f;
1378
1379 typedef VResizeNoVec VResizeCubicVec_32s8u;
1380 typedef VResizeNoVec VResizeCubicVec_32f16u;
1381 typedef VResizeNoVec VResizeCubicVec_32f16s;
1382 typedef VResizeNoVec VResizeCubicVec_32f;
1383
1384 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
1385 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
1386 typedef VResizeNoVec VResizeLanczos4Vec_32f;
1387
1388 #endif
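
// All of the H/V vector functors share one contract: process as many leading
// elements as the SIMD width allows and return how many were handled; the
// caller's scalar loop then finishes the tail. Without a SIMD backend the
// typedefs above fall back to the no-op functors defined earlier in this
// file, whose shape is simply:
//
//     struct VResizeNoVec
//     {
//         int operator()(const uchar**, uchar*, const uchar*, int) const
//         { return 0; } // vectorized nothing; scalar code starts at x = 0
//     };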
1389
1390 typedef HResizeNoVec HResizeLinearVec_8u32s;
1391 typedef HResizeNoVec HResizeLinearVec_16u32f;
1392 typedef HResizeNoVec HResizeLinearVec_16s32f;
1393 typedef HResizeNoVec HResizeLinearVec_32f;
1394 typedef HResizeNoVec HResizeLinearVec_64f;
1395
1396
1397 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1398 struct HResizeLinear
1399 {
1400     typedef T value_type;
1401     typedef WT buf_type;
1402     typedef AT alpha_type;
1403
1404     void operator()(const T** src, WT** dst, int count,
1405                     const int* xofs, const AT* alpha,
1406                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1407     {
1408         int dx, k;
1409         VecOp vecOp;
1410
1411         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
1412             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
1413
1414         for( k = 0; k <= count - 2; k++ )
1415         {
1416             const T *S0 = src[k], *S1 = src[k+1];
1417             WT *D0 = dst[k], *D1 = dst[k+1];
1418             for( dx = dx0; dx < xmax; dx++ )
1419             {
1420                 int sx = xofs[dx];
1421                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1422                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1423                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1424                 D0[dx] = t0; D1[dx] = t1;
1425             }
1426
1427             for( ; dx < dwidth; dx++ )
1428             {
1429                 int sx = xofs[dx];
1430                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
1431             }
1432         }
1433
1434         for( ; k < count; k++ )
1435         {
1436             const T *S = src[k];
1437             WT *D = dst[k];
1438             for( dx = 0; dx < xmax; dx++ )
1439             {
1440                 int sx = xofs[dx];
1441                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1442             }
1443
1444             for( ; dx < dwidth; dx++ )
1445                 D[dx] = WT(S[xofs[dx]]*ONE);
1446         }
1447     }
1448 };
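
// For dx in [0, xmax) both taps S[sx] and S[sx + cn] fall inside the row; for
// dx in [xmax, dwidth) the right tap would read past the image, so the whole
// weight ONE goes to the single in-range pixel. The xofs/alpha tables are
// precomputed by resize(); a simplified, single-channel sketch of that setup
// (the helper and its names are illustrative; the real setup additionally
// clamps sx at the image borders and scales the offsets by cn):
static inline void sketchLinearResizeTabs( int swidth, int dwidth, int* xofs, float* alpha )
{
    double scale = (double)swidth/dwidth;
    for( int dx = 0; dx < dwidth; dx++ )
    {
        double fx = (dx + 0.5)*scale - 0.5; // source x for destination dx
        int sx = cvFloor(fx);
        fx -= sx;                           // fractional part, in [0, 1)
        xofs[dx] = sx;
        alpha[dx*2] = (float)(1. - fx);     // the two weights sum to ONE (1.f here)
        alpha[dx*2+1] = (float)fx;
    }
}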
1449
1450
1451 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1452 struct VResizeLinear
1453 {
1454     typedef T value_type;
1455     typedef WT buf_type;
1456     typedef AT alpha_type;
1457
1458     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1459     {
1460         WT b0 = beta[0], b1 = beta[1];
1461         const WT *S0 = src[0], *S1 = src[1];
1462         CastOp castOp;
1463         VecOp vecOp;
1464
1465         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1466         #if CV_ENABLE_UNROLLED
1467         for( ; x <= width - 4; x += 4 )
1468         {
1469             WT t0, t1;
1470             t0 = S0[x]*b0 + S1[x]*b1;
1471             t1 = S0[x+1]*b0 + S1[x+1]*b1;
1472             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1473             t0 = S0[x+2]*b0 + S1[x+2]*b1;
1474             t1 = S0[x+3]*b0 + S1[x+3]*b1;
1475             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1476         }
1477         #endif
1478         for( ; x < width; x++ )
1479             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
1480     }
1481 };
1482
1483 template<>
1484 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1485 {
1486     typedef uchar value_type;
1487     typedef int buf_type;
1488     typedef short alpha_type;
1489
1490     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1491     {
1492         alpha_type b0 = beta[0], b1 = beta[1];
1493         const buf_type *S0 = src[0], *S1 = src[1];
1494         VResizeLinearVec_32s8u vecOp;
1495
1496         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1497         #if CV_ENABLE_UNROLLED
1498         for( ; x <= width - 4; x += 4 )
1499         {
1500             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1501             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1502             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1503             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1504         }
1505         #endif
1506         for( ; x < width; x++ )
1507             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
1508     }
1509 };
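
// Fixed-point bookkeeping for the uchar specialization above, assuming the
// usual INTER_RESIZE_COEF_BITS == 11 (ONE == 1 << 11):
//   - after the horizontal pass S0/S1 hold values scaled by 2^11
//     (8-bit sample times an 11-bit alpha), i.e. below 2^19;
//   - S >> 4 drops the scale to 2^7 so the product with an 11-bit beta,
//     of scale 2^18, still fits a 32-bit int;
//   - >> 16 leaves scale 2^2, and (sum + 2) >> 2 rounds to the final
//     8-bit value.
// This matches what FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>
// computes, reordered so the intermediates agree with the SIMD path.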
1510
1511
1512 template<typename T, typename WT, typename AT>
1513 struct HResizeCubic
1514 {
1515     typedef T value_type;
1516     typedef WT buf_type;
1517     typedef AT alpha_type;
1518
1519     void operator()(const T** src, WT** dst, int count,
1520                     const int* xofs, const AT* alpha,
1521                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1522     {
1523         for( int k = 0; k < count; k++ )
1524         {
1525             const T *S = src[k];
1526             WT *D = dst[k];
1527             int dx = 0, limit = xmin;
1528             for(;;)
1529             {
1530                 for( ; dx < limit; dx++, alpha += 4 )
1531                 {
1532                     int j, sx = xofs[dx] - cn;
1533                     WT v = 0;
1534                     for( j = 0; j < 4; j++ )
1535                     {
1536                         int sxj = sx + j*cn;
1537                         if( (unsigned)sxj >= (unsigned)swidth )
1538                         {
1539                             while( sxj < 0 )
1540                                 sxj += cn;
1541                             while( sxj >= swidth )
1542                                 sxj -= cn;
1543                         }
1544                         v += S[sxj]*alpha[j];
1545                     }
1546                     D[dx] = v;
1547                 }
1548                 if( limit == dwidth )
1549                     break;
1550                 for( ; dx < xmax; dx++, alpha += 4 )
1551                 {
1552                     int sx = xofs[dx];
1553                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1554                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
1555                 }
1556                 limit = dwidth;
1557             }
1558             alpha -= dwidth*4;
1559         }
1560     }
1561 };
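
// alpha[] holds 4 taps per destination pixel, derived from the Catmull-Rom
// style kernel used elsewhere in OpenCV (interpolateCubic, A = -0.75):
//     w(t) = (A+2)|t|^3 - (A+3)|t|^2 + 1,        |t| <= 1
//     w(t) = A|t|^3 - 5A|t|^2 + 8A|t| - 4A,  1 < |t| < 2
// The dx < limit loop above is the slow border path: any tap falling outside
// [0, swidth) is folded back one whole channel stride at a time, which
// amounts to replicating the border pixel.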
1562
1563
1564 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1565 struct VResizeCubic
1566 {
1567     typedef T value_type;
1568     typedef WT buf_type;
1569     typedef AT alpha_type;
1570
1571     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1572     {
1573         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1574         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1575         CastOp castOp;
1576         VecOp vecOp;
1577
1578         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1579         for( ; x < width; x++ )
1580             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
1581     }
1582 };
1583
1584
1585 template<typename T, typename WT, typename AT>
1586 struct HResizeLanczos4
1587 {
1588     typedef T value_type;
1589     typedef WT buf_type;
1590     typedef AT alpha_type;
1591
1592     void operator()(const T** src, WT** dst, int count,
1593                     const int* xofs, const AT* alpha,
1594                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1595     {
1596         for( int k = 0; k < count; k++ )
1597         {
1598             const T *S = src[k];
1599             WT *D = dst[k];
1600             int dx = 0, limit = xmin;
1601             for(;;)
1602             {
1603                 for( ; dx < limit; dx++, alpha += 8 )
1604                 {
1605                     int j, sx = xofs[dx] - cn*3;
1606                     WT v = 0;
1607                     for( j = 0; j < 8; j++ )
1608                     {
1609                         int sxj = sx + j*cn;
1610                         if( (unsigned)sxj >= (unsigned)swidth )
1611                         {
1612                             while( sxj < 0 )
1613                                 sxj += cn;
1614                             while( sxj >= swidth )
1615                                 sxj -= cn;
1616                         }
1617                         v += S[sxj]*alpha[j];
1618                     }
1619                     D[dx] = v;
1620                 }
1621                 if( limit == dwidth )
1622                     break;
1623                 for( ; dx < xmax; dx++, alpha += 8 )
1624                 {
1625                     int sx = xofs[dx];
1626                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
1627                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
1628                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
1629                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
1630                 }
1631                 limit = dwidth;
1632             }
1633             alpha -= dwidth*8;
1634         }
1635     }
1636 };
1637
1638
1639 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1640 struct VResizeLanczos4
1641 {
1642     typedef T value_type;
1643     typedef WT buf_type;
1644     typedef AT alpha_type;
1645
1646     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1647     {
1648         CastOp castOp;
1649         VecOp vecOp;
1650         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1651         #if CV_ENABLE_UNROLLED
1652         for( ; x <= width - 4; x += 4 )
1653         {
1654             WT b = beta[0];
1655             const WT* S = src[0];
1656             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1657
1658             for( k = 1; k < 8; k++ )
1659             {
1660                 b = beta[k]; S = src[k];
1661                 s0 += S[x]*b; s1 += S[x+1]*b;
1662                 s2 += S[x+2]*b; s3 += S[x+3]*b;
1663             }
1664
1665             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1666             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1667         }
1668         #endif
1669         for( ; x < width; x++ )
1670         {
1671             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1672                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1673                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
1674         }
1675     }
1676 };
1677
1678
1679 static inline int clip(int x, int a, int b)
1680 {
1681     return x >= a ? (x < b ? x : b-1) : a;
1682 }
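
// clip() clamps x into [a, b-1]: clip(-2, 0, h) == 0 and
// clip(h + 3, 0, h) == h - 1. The invoker below uses it to replicate border
// rows when the filter window hangs over the top or bottom edge.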
1683
1684 static const int MAX_ESIZE=16;
1685
1686 template <typename HResize, typename VResize>
1687 class resizeGeneric_Invoker :
1688     public ParallelLoopBody
1689 {
1690 public:
1691     typedef typename HResize::value_type T;
1692     typedef typename HResize::buf_type WT;
1693     typedef typename HResize::alpha_type AT;
1694
1695     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1696         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1697         int _ksize, int _xmin, int _xmax) :
1698         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1699         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1700         ksize(_ksize), xmin(_xmin), xmax(_xmax)
1701     {
1702         CV_Assert(ksize <= MAX_ESIZE);
1703     }
1704
1705 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1706 # pragma GCC diagnostic push
1707 # pragma GCC diagnostic ignored "-Warray-bounds"
1708 #endif
1709     virtual void operator() (const Range& range) const
1710     {
1711         int dy, cn = src.channels();
1712         HResize hresize;
1713         VResize vresize;
1714
1715         int bufstep = (int)alignSize(dsize.width, 16);
1716         AutoBuffer<WT> _buffer(bufstep*ksize);
1717         const T* srows[MAX_ESIZE]={0};
1718         WT* rows[MAX_ESIZE]={0};
1719         int prev_sy[MAX_ESIZE];
1720
1721         for(int k = 0; k < ksize; k++ )
1722         {
1723             prev_sy[k] = -1;
1724             rows[k] = (WT*)_buffer + bufstep*k;
1725         }
1726
1727         const AT* beta = _beta + ksize * range.start;
1728
1729         for( dy = range.start; dy < range.end; dy++, beta += ksize )
1730         {
1731             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1732
1733             for(int k = 0; k < ksize; k++ )
1734             {
1735                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1736                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1737                 {
1738                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1739                     {
1740                         if( k1 > k )
1741                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1742                         break;
1743                     }
1744                 }
1745                 if( k1 == ksize )
1746                     k0 = std::min(k0, k); // remember the first row that needs to be computed
1747                 srows[k] = src.template ptr<T>(sy);
1748                 prev_sy[k] = sy;
1749             }
1750
1751             if( k0 < ksize )
1752                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1753                         ssize.width, dsize.width, cn, xmin, xmax );
1754             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1755         }
1756     }
1757 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1758 # pragma GCC diagnostic pop
1759 #endif
1760
1761 private:
1762     Mat src;
1763     Mat dst;
1764     const int* xofs, *yofs;
1765     const AT* alpha, *_beta;
1766     Size ssize, dsize;
1767     const int ksize, xmin, xmax;
1768
1769     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
1770 };
1771
1772 template<class HResize, class VResize>
1773 static void resizeGeneric_( const Mat& src, Mat& dst,
1774                             const int* xofs, const void* _alpha,
1775                             const int* yofs, const void* _beta,
1776                             int xmin, int xmax, int ksize )
1777 {
1778     typedef typename HResize::alpha_type AT;
1779
1780     const AT* beta = (const AT*)_beta;
1781     Size ssize = src.size(), dsize = dst.size();
1782     int cn = src.channels();
1783     ssize.width *= cn;
1784     dsize.width *= cn;
1785     xmin *= cn;
1786     xmax *= cn;
    // Image resize is a separable operation: the horizontal pass produces a
    // few resized rows and the vertical pass blends them, so output rows can
    // be computed independently; hence the parallel_for_ over rows below.
1788
1789     Range range(0, dsize.height);
1790     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1791         ssize, dsize, ksize, xmin, xmax);
1792     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1793 }
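
// resizeGeneric_ is what the per-depth ResizeFunc tables (declared further
// below) point at, instantiated once per (depth, interpolation) pair. A
// representative instantiation for 8-bit bilinear would look like:
//
//     resizeGeneric_<
//         HResizeLinear<uchar, int, short, INTER_RESIZE_COEF_SCALE,
//                       HResizeLinearVec_8u32s>,
//         VResizeLinear<uchar, int, short,
//                       FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
//                       VResizeLinearVec_32s8u> >
//         ( src, dst, xofs, _alpha, yofs, _beta, xmin, xmax, ksize );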
1794
1795 template <typename T, typename WT>
1796 struct ResizeAreaFastNoVec
1797 {
1798     ResizeAreaFastNoVec(int, int) { }
1799     ResizeAreaFastNoVec(int, int, int, int) { }
1800     int operator() (const T*, T*, int) const
1801     { return 0; }
1802 };
1803
1804 #if CV_NEON
1805
1806 class ResizeAreaFastVec_SIMD_8u
1807 {
1808 public:
1809     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1810         cn(_cn), step(_step)
1811     {
1812     }
1813
1814     int operator() (const uchar* S, uchar* D, int w) const
1815     {
1816         int dx = 0;
1817         const uchar* S0 = S, * S1 = S0 + step;
1818
1819         uint16x8_t v_2 = vdupq_n_u16(2);
1820
1821         if (cn == 1)
1822         {
1823             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
1824             {
1825                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
1826
1827                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
1828                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
1829                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
1830
1831                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
1832                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
1833                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
1834
1835                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
1836             }
1837         }
1838         else if (cn == 4)
1839         {
1840             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1841             {
1842                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
1843
1844                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
1845                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
1846                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
1847                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
1848
1849                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
1850                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
1851                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
1852                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
1853                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
1854
1855                 vst1_u8(D, vmovn_u16(v_dst));
1856             }
1857         }
1858
1859         return dx;
1860     }
1861
1862 private:
1863     int cn, step;
1864 };
1865
1866 class ResizeAreaFastVec_SIMD_16u
1867 {
1868 public:
1869     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1870         cn(_cn), step(_step)
1871     {
1872     }
1873
1874     int operator() (const ushort * S, ushort * D, int w) const
1875     {
1876         int dx = 0;
1877         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
1878
1879         uint32x4_t v_2 = vdupq_n_u32(2);
1880
1881         if (cn == 1)
1882         {
1883             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1884             {
1885                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
1886
1887                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
1888                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
1889                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
1890
1891                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
1892                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
1893                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
1894
1895                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
1896             }
1897         }
1898         else if (cn == 4)
1899         {
1900             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1901             {
1902                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
1903                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
1904                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
1905                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
1906             }
1907         }
1908
1909         return dx;
1910     }
1911
1912 private:
1913     int cn, step;
1914 };
1915
1916 class ResizeAreaFastVec_SIMD_16s
1917 {
1918 public:
1919     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
1920         cn(_cn), step(_step)
1921     {
1922     }
1923
1924     int operator() (const short * S, short * D, int w) const
1925     {
1926         int dx = 0;
1927         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
1928
1929         int32x4_t v_2 = vdupq_n_s32(2);
1930
1931         if (cn == 1)
1932         {
1933             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1934             {
1935                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
1936
1937                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
1938                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
1939                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
1940
1941                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
1942                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
1943                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
1944
1945                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
1946             }
1947         }
1948         else if (cn == 4)
1949         {
1950             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1951             {
1952                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
1953                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
1954                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
1955                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
1956             }
1957         }
1958
1959         return dx;
1960     }
1961
1962 private:
1963     int cn, step;
1964 };
1965
1966 struct ResizeAreaFastVec_SIMD_32f
1967 {
1968     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
1969         cn(_cn), step(_step)
1970     {
1971         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
1972     }
1973
1974     int operator() (const float * S, float * D, int w) const
1975     {
1976         if (!fast_mode)
1977             return 0;
1978
1979         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
1980         int dx = 0;
1981
1982         float32x4_t v_025 = vdupq_n_f32(0.25f);
1983
1984         if (cn == 1)
1985         {
1986             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1987             {
1988                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
1989
1990                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
1991                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
1992
1993                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1994             }
1995         }
1996         else if (cn == 4)
1997         {
1998             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1999             {
2000                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
2001                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
2002
2003                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
2004             }
2005         }
2006
2007         return dx;
2008     }
2009
2010 private:
2011     int cn;
2012     bool fast_mode;
2013     int step;
2014 };
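
// All of the fast-area SIMD classes above special-case the same 2x
// decimation: every destination pixel is the mean of a 2x2 source block,
//     D[x] = (S0[2x] + S0[2x+1] + S1[2x] + S1[2x+1] + 2) >> 2
// for the integer types (the +2 makes the shift round to nearest), or a
// plain sum times 0.25f for floats. The vld2q_* loads deinterleave even and
// odd columns into separate registers, turning the horizontal pair sum into
// a single vector add.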
2015
2016 #elif CV_SSE2
2017
2018 class ResizeAreaFastVec_SIMD_8u
2019 {
2020 public:
2021     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
2022         cn(_cn), step(_step)
2023     {
2024         use_simd = checkHardwareSupport(CV_CPU_SSE2);
2025     }
2026
2027     int operator() (const uchar* S, uchar* D, int w) const
2028     {
2029         if (!use_simd)
2030             return 0;
2031
2032         int dx = 0;
2033         const uchar* S0 = S;
2034         const uchar* S1 = S0 + step;
2035         __m128i zero = _mm_setzero_si128();
2036         __m128i delta2 = _mm_set1_epi16(2);
2037
2038         if (cn == 1)
2039         {
2040             __m128i masklow = _mm_set1_epi16(0x00ff);
2041             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
2042             {
2043                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2044                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2045
2046                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
2047                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
2048                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
2049                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
2050
2051                 _mm_storel_epi64((__m128i*)D, s0);
2052             }
2053         }
2054         else if (cn == 3)
2055             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
2056             {
2057                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2058                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2059
2060                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
2061                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
2062                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
2063                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
2064
2065                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
2066                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
2067                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
2068                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
2069                 _mm_storel_epi64((__m128i*)D, s0);
2070
2071                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
2072                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
2073                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
2074                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
2075                 _mm_storel_epi64((__m128i*)(D+3), s0);
2076             }
2077         else
2078         {
2079             CV_Assert(cn == 4);
2080             int v[] = { 0, 0, -1, -1 };
2081             __m128i mask = _mm_loadu_si128((const __m128i*)v);
2082
2083             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
2084             {
2085                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2086                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2087
2088                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
2089                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
2090                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
2091                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
2092
2093                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
2094                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
2095                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
2096                 __m128i res0 = _mm_srli_epi16(s0, 2);
2097
2098                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
2099                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
2100                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
2101                 __m128i res1 = _mm_srli_epi16(s0, 2);
2102                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
2103                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
2104                 _mm_storel_epi64((__m128i*)(D), s0);
2105             }
2106         }
2107
2108         return dx;
2109     }
2110
2111 private:
2112     int cn;
2113     bool use_simd;
2114     int step;
2115 };
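
// In the cn == 1 branch above the horizontal pair sum avoids shuffles:
// viewing the 16 loaded bytes as eight 16-bit lanes,
//     _mm_srli_epi16(r, 8)            extracts the odd-indexed bytes,
//     _mm_and_si128(r, masklow)       extracts the even-indexed bytes,
// so a single add yields eight horizontal sums, already ordered for packing
// back down to 8 bytes.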
2116
2117 class ResizeAreaFastVec_SIMD_16u
2118 {
2119 public:
2120     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
2121         cn(_cn), step(_step)
2122     {
2123         use_simd = checkHardwareSupport(CV_CPU_SSE2);
2124     }
2125
2126     int operator() (const ushort* S, ushort* D, int w) const
2127     {
2128         if (!use_simd)
2129             return 0;
2130
2131         int dx = 0;
2132         const ushort* S0 = (const ushort*)S;
2133         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
2134         __m128i masklow = _mm_set1_epi32(0x0000ffff);
2135         __m128i zero = _mm_setzero_si128();
2136         __m128i delta2 = _mm_set1_epi32(2);
2137
2138 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
2139
2140         if (cn == 1)
2141         {
2142             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2143             {
2144                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2145                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2146
2147                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
2148                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
2149                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
2150                 s0 = _mm_srli_epi32(s0, 2);
2151                 s0 = _mm_packus_epi32(s0, zero);
2152
2153                 _mm_storel_epi64((__m128i*)D, s0);
2154             }
2155         }
2156         else if (cn == 3)
2157             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
2158             {
2159                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2160                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2161
2162                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
2163                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
2164                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
2165                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
2166
2167                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
2168                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
2169                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
2170                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2171                 _mm_storel_epi64((__m128i*)D, s0);
2172             }
2173         else
2174         {
2175             CV_Assert(cn == 4);
2176             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2177             {
2178                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2179                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2180
2181                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
2182                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
2183                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
2184                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
2185
2186                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
2187                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
2188                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
2189                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2190                 _mm_storel_epi64((__m128i*)D, s0);
2191             }
2192         }
2193
2194 #undef _mm_packus_epi32
2195
2196         return dx;
2197     }
2198
2199 private:
2200     int cn;
2201     int step;
2202     bool use_simd;
2203 };
2204
2205 class ResizeAreaFastVec_SIMD_16s
2206 {
2207 public:
2208     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
2209         cn(_cn), step(_step)
2210     {
2211         use_simd = checkHardwareSupport(CV_CPU_SSE2);
2212     }
2213
2214     int operator() (const short* S, short* D, int w) const
2215     {
2216         if (!use_simd)
2217             return 0;
2218
2219         int dx = 0;
2220         const short* S0 = (const short*)S;
2221         const short* S1 = (const short*)((const uchar*)(S) + step);
2222         __m128i masklow = _mm_set1_epi32(0x0000ffff);
2223         __m128i zero = _mm_setzero_si128();
2224         __m128i delta2 = _mm_set1_epi32(2);
2225
2226         if (cn == 1)
2227         {
2228             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2229             {
2230                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2231                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2232
2233                 __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
2234                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
2235                 __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
2236                     _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
2237                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
2238                 s0 = _mm_srai_epi32(s0, 2);
2239                 s0 = _mm_packs_epi32(s0, zero);
2240
2241                 _mm_storel_epi64((__m128i*)D, s0);
2242             }
2243         }
2244         else if (cn == 3)
2245             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
2246             {
2247                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2248                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2249
2250                 __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
2251                 __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
2252                 __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
2253                 __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
2254
2255                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
2256                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
2257                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
2258                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
2259                 _mm_storel_epi64((__m128i*)D, s0);
2260             }
2261         else
2262         {
2263             CV_Assert(cn == 4);
2264             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2265             {
2266                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2267                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2268
2269                 __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
2270                 __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
2271                 __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
2272                 __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
2273
2274                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
2275                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
2276                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
2277                 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
2278                 _mm_storel_epi64((__m128i*)D, s0);
2279             }
2280         }
2281
2282         return dx;
2283     }
2284
2285 private:
2286     int cn;
2287     int step;
2288     bool use_simd;
2289 };
2290
2291 struct ResizeAreaFastVec_SIMD_32f
2292 {
2293     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
2294         cn(_cn), step(_step)
2295     {
2296         fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
2297         fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
2298     }
2299
2300     int operator() (const float * S, float * D, int w) const
2301     {
2302         if (!fast_mode)
2303             return 0;
2304
2305         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
2306         int dx = 0;
2307
2308         __m128 v_025 = _mm_set1_ps(0.25f);
2309
2310         if (cn == 1)
2311         {
2312             const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
2313             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2314             {
2315                 __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
2316                        v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
2317
2318                 __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
2319                                            _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
2320                 __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
2321                                            _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
2322
2323                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
2324             }
2325         }
2326         else if (cn == 4)
2327         {
2328             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2329             {
2330                 __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
2331                 __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
2332
2333                 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
2334             }
2335         }
2336
2337         return dx;
2338     }
2339
2340 private:
2341     int cn;
2342     bool fast_mode;
2343     int step;
2344 };
2345
2346 #else
2347
2348 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
2349 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
2350 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
2351 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
2352
2353 #endif
2354
2355 template<typename T, typename SIMDVecOp>
2356 struct ResizeAreaFastVec
2357 {
2358     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
2359         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
2360     {
2361         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
2362     }
2363
2364     int operator() (const T* S, T* D, int w) const
2365     {
2366         if (!fast_mode)
2367             return 0;
2368
2369         const T* nextS = (const T*)((const uchar*)S + step);
2370         int dx = vecOp(S, D, w);
2371
2372         if (cn == 1)
2373             for( ; dx < w; ++dx )
2374             {
2375                 int index = dx*2;
2376                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
2377             }
2378         else if (cn == 3)
2379             for( ; dx < w; dx += 3 )
2380             {
2381                 int index = dx*2;
2382                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
2383                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
2384                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
2385             }
2386         else
2387             {
2388                 CV_Assert(cn == 4);
2389                 for( ; dx < w; dx += 4 )
2390                 {
2391                     int index = dx*2;
2392                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
2393                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
2394                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
2395                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
2396                 }
2397             }
2398
2399         return dx;
2400     }
2401
2402 private:
2403     int scale_x, scale_y;
2404     int cn;
2405     bool fast_mode;
2406     int step;
2407     SIMDVecOp vecOp;
2408 };
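
// ResizeAreaFastVec is the scalar wrapper around the SIMD ops above: vecOp
// covers as much of the row as it can and the per-channel loops finish the
// remainder with the same (sum + 2) >> 2 rounding. Note that w is already in
// channel-scaled units, so the cn == 3 / cn == 4 branches advance dx by a
// whole pixel per iteration.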
2409
2410 template <typename T, typename WT, typename VecOp>
2411 class resizeAreaFast_Invoker :
2412     public ParallelLoopBody
2413 {
2414 public:
2415     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
2416         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
2417         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
2418         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
2419     {
2420     }
2421
2422     virtual void operator() (const Range& range) const
2423     {
2424         Size ssize = src.size(), dsize = dst.size();
2425         int cn = src.channels();
2426         int area = scale_x*scale_y;
2427         float scale = 1.f/(area);
2428         int dwidth1 = (ssize.width/scale_x)*cn;
2429         dsize.width *= cn;
2430         ssize.width *= cn;
2431         int dy, dx, k = 0;
2432
2433         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
2434
2435         for( dy = range.start; dy < range.end; dy++ )
2436         {
2437             T* D = (T*)(dst.data + dst.step*dy);
2438             int sy0 = dy*scale_y;
2439             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
2440
2441             if( sy0 >= ssize.height )
2442             {
2443                 for( dx = 0; dx < dsize.width; dx++ )
2444                     D[dx] = 0;
2445                 continue;
2446             }
2447
2448             dx = vop(src.template ptr<T>(sy0), D, w);
2449             for( ; dx < w; dx++ )
2450             {
2451                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
2452                 WT sum = 0;
2453                 k = 0;
2454                 #if CV_ENABLE_UNROLLED
2455                 for( ; k <= area - 4; k += 4 )
2456                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
2457                 #endif
2458                 for( ; k < area; k++ )
2459                     sum += S[ofs[k]];
2460
2461                 D[dx] = saturate_cast<T>(sum * scale);
2462             }
2463
2464             for( ; dx < dsize.width; dx++ )
2465             {
2466                 WT sum = 0;
2467                 int count = 0, sx0 = xofs[dx];
                if( sx0 >= ssize.width )
                {
                    D[dx] = 0;
                    continue; // otherwise count stays 0 below and sum/count is 0/0
                }
2470
2471                 for( int sy = 0; sy < scale_y; sy++ )
2472                 {
2473                     if( sy0 + sy >= ssize.height )
2474                         break;
2475                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
2476                     for( int sx = 0; sx < scale_x*cn; sx += cn )
2477                     {
2478                         if( sx0 + sx >= ssize.width )
2479                             break;
2480                         sum += S[sx];
2481                         count++;
2482                     }
2483                 }
2484
2485                 D[dx] = saturate_cast<T>((float)sum/count);
2486             }
2487         }
2488     }
2489
2490 private:
2491     Mat src;
2492     Mat dst;
2493     int scale_x, scale_y;
2494     const int *ofs, *xofs;
2495 };
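
// Each destination row is filled in three stages: the SIMD op covers the
// vectorizable prefix, the ofs[]-driven loop averages complete
// scale_x x scale_y blocks (ofs[] holds the precomputed source offsets of
// every sample in such a block), and the trailing loop handles blocks that
// overhang the right or bottom border, averaging only the samples that
// actually exist.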
2496
2497 template<typename T, typename WT, typename VecOp>
2498 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
2499                              int scale_x, int scale_y )
2500 {
2501     Range range(0, dst.rows);
2502     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
2503         scale_y, ofs, xofs);
2504     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2505 }
2506
2507 struct DecimateAlpha
2508 {
2509     int si, di;
2510     float alpha;
2511 };
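
// One DecimateAlpha entry states: source element si contributes to
// destination element di with weight alpha. computeResizeAreaTab() below
// emits these entries in destination order, and the caller builds tabofs[]
// so that, per destination row, ResizeArea_Invoker can walk exactly the
// ytab entries that touch it.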
2512
2513
2514 template<typename T, typename WT> class ResizeArea_Invoker :
2515     public ParallelLoopBody
2516 {
2517 public:
2518     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
2519                         const DecimateAlpha* _xtab, int _xtab_size,
2520                         const DecimateAlpha* _ytab, int _ytab_size,
2521                         const int* _tabofs )
2522     {
2523         src = &_src;
2524         dst = &_dst;
2525         xtab0 = _xtab;
2526         xtab_size0 = _xtab_size;
2527         ytab = _ytab;
2528         ytab_size = _ytab_size;
2529         tabofs = _tabofs;
2530     }
2531
2532     virtual void operator() (const Range& range) const
2533     {
2534         Size dsize = dst->size();
2535         int cn = dst->channels();
2536         dsize.width *= cn;
2537         AutoBuffer<WT> _buffer(dsize.width*2);
2538         const DecimateAlpha* xtab = xtab0;
2539         int xtab_size = xtab_size0;
2540         WT *buf = _buffer, *sum = buf + dsize.width;
2541         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
2542
2543         for( dx = 0; dx < dsize.width; dx++ )
2544             sum[dx] = (WT)0;
2545
2546         for( j = j_start; j < j_end; j++ )
2547         {
2548             WT beta = ytab[j].alpha;
2549             int dy = ytab[j].di;
2550             int sy = ytab[j].si;
2551
2552             {
2553                 const T* S = src->template ptr<T>(sy);
2554                 for( dx = 0; dx < dsize.width; dx++ )
2555                     buf[dx] = (WT)0;
2556
2557                 if( cn == 1 )
2558                     for( k = 0; k < xtab_size; k++ )
2559                     {
2560                         int dxn = xtab[k].di;
2561                         WT alpha = xtab[k].alpha;
2562                         buf[dxn] += S[xtab[k].si]*alpha;
2563                     }
2564                 else if( cn == 2 )
2565                     for( k = 0; k < xtab_size; k++ )
2566                     {
2567                         int sxn = xtab[k].si;
2568                         int dxn = xtab[k].di;
2569                         WT alpha = xtab[k].alpha;
2570                         WT t0 = buf[dxn] + S[sxn]*alpha;
2571                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2572                         buf[dxn] = t0; buf[dxn+1] = t1;
2573                     }
2574                 else if( cn == 3 )
2575                     for( k = 0; k < xtab_size; k++ )
2576                     {
2577                         int sxn = xtab[k].si;
2578                         int dxn = xtab[k].di;
2579                         WT alpha = xtab[k].alpha;
2580                         WT t0 = buf[dxn] + S[sxn]*alpha;
2581                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2582                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
2583                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
2584                     }
2585                 else if( cn == 4 )
2586                 {
2587                     for( k = 0; k < xtab_size; k++ )
2588                     {
2589                         int sxn = xtab[k].si;
2590                         int dxn = xtab[k].di;
2591                         WT alpha = xtab[k].alpha;
2592                         WT t0 = buf[dxn] + S[sxn]*alpha;
2593                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2594                         buf[dxn] = t0; buf[dxn+1] = t1;
2595                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
2596                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
2597                         buf[dxn+2] = t0; buf[dxn+3] = t1;
2598                     }
2599                 }
2600                 else
2601                 {
2602                     for( k = 0; k < xtab_size; k++ )
2603                     {
2604                         int sxn = xtab[k].si;
2605                         int dxn = xtab[k].di;
2606                         WT alpha = xtab[k].alpha;
2607                         for( int c = 0; c < cn; c++ )
2608                             buf[dxn + c] += S[sxn + c]*alpha;
2609                     }
2610                 }
2611             }
2612
2613             if( dy != prev_dy )
2614             {
2615                 T* D = dst->template ptr<T>(prev_dy);
2616
2617                 for( dx = 0; dx < dsize.width; dx++ )
2618                 {
2619                     D[dx] = saturate_cast<T>(sum[dx]);
2620                     sum[dx] = beta*buf[dx];
2621                 }
2622                 prev_dy = dy;
2623             }
2624             else
2625             {
2626                 for( dx = 0; dx < dsize.width; dx++ )
2627                     sum[dx] += beta*buf[dx];
2628             }
2629         }
2630
2631         {
2632         T* D = dst->template ptr<T>(prev_dy);
2633         for( dx = 0; dx < dsize.width; dx++ )
2634             D[dx] = saturate_cast<T>(sum[dx]);
2635         }
2636     }
2637
2638 private:
2639     const Mat* src;
2640     Mat* dst;
2641     const DecimateAlpha* xtab0;
2642     const DecimateAlpha* ytab;
2643     int xtab_size0, ytab_size;
2644     const int* tabofs;
2645 };
2646
2647
2648 template <typename T, typename WT>
2649 static void resizeArea_( const Mat& src, Mat& dst,
2650                          const DecimateAlpha* xtab, int xtab_size,
2651                          const DecimateAlpha* ytab, int ytab_size,
2652                          const int* tabofs )
2653 {
2654     parallel_for_(Range(0, dst.rows),
2655                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
2656                  dst.total()/((double)(1 << 16)));
2657 }
2658
2659
2660 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
2661                             const int* xofs, const void* alpha,
2662                             const int* yofs, const void* beta,
2663                             int xmin, int xmax, int ksize );
2664
2665 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
2666                                     const int* ofs, const int *xofs,
2667                                     int scale_x, int scale_y );
2668
2669 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
2670                                 const DecimateAlpha* xtab, int xtab_size,
2671                                 const DecimateAlpha* ytab, int ytab_size,
2672                                 const int* yofs);
2673
2674
2675 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
2676 {
2677     int k = 0;
2678     for(int dx = 0; dx < dsize; dx++ )
2679     {
2680         double fsx1 = dx * scale;
2681         double fsx2 = fsx1 + scale;
2682         double cellWidth = std::min(scale, ssize - fsx1);
2683
2684         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2685
2686         sx2 = std::min(sx2, ssize - 1);
2687         sx1 = std::min(sx1, sx2);
2688
2689         if( sx1 - fsx1 > 1e-3 )
2690         {
2691             assert( k < ssize*2 );
2692             tab[k].di = dx * cn;
2693             tab[k].si = (sx1 - 1) * cn;
2694             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
2695         }
2696
2697         for(int sx = sx1; sx < sx2; sx++ )
2698         {
2699             assert( k < ssize*2 );
2700             tab[k].di = dx * cn;
2701             tab[k].si = sx * cn;
2702             tab[k++].alpha = float(1.0 / cellWidth);
2703         }
2704
2705         if( fsx2 - sx2 > 1e-3 )
2706         {
2707             assert( k < ssize*2 );
2708             tab[k].di = dx * cn;
2709             tab[k].si = sx2 * cn;
2710             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2711         }
2712     }
2713     return k;
2714 }
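
     // Worked example of computeResizeAreaTab (hypothetical sizes): ssize = 4,
     // dsize = 3, cn = 1, scale = 4/3. Destination pixel dx covers the source
     // interval [dx*scale, (dx+1)*scale) and receives one tab entry per
     // (partially) covered source pixel, with the alphas summing to 1:
     //   dx = 0: (si=0, alpha=0.75), (si=1, alpha=0.25)
     //   dx = 1: (si=1, alpha=0.50), (si=2, alpha=0.50)
     //   dx = 2: (si=2, alpha=0.25), (si=3, alpha=0.75)
     // so k = 6 entries are produced in total.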
2715
2716 #define CHECK_IPP_STATUS(STATUS) do { if ((STATUS) < 0) { *ok = false; return; } } while (0)
2717
2718 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
2719     ippiResize = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2720     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2721     specBuf.allocate(specSize);\
2722     pSpec = (uchar*)specBuf;\
2723     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
2724
2725 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
2726     if (mode == (int)ippCubic) { *ok = false; return; } \
2727     ippiResize = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2728     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2729     specBuf.allocate(specSize);\
2730     pSpec = (uchar*)specBuf;\
2731     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
2732     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
2733     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
2734
2735 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
2736     ippiResize = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
2737     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2738     specBuf.allocate(specSize);\
2739     pSpec = (uchar*)specBuf;\
2740     AutoBuffer<uchar> buf(initSize);\
2741     uchar* pInit = (uchar*)buf;\
2742     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
2743
2744 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
2745     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
2746     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
2747     else { *ok = false; return; } \
2748     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
2749     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
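
     // The SET_IPP_RESIZE_* macros above follow the two-stage IPP resize API:
     // ippiResizeGetSize_<type> reports the sizes of the "spec" (precomputed
     // state) and init buffers, ippiResize{Linear,Cubic}Init_<type> fills the
     // spec, and at run time each stripe calls ippiResizeGetBufferSize_<type>
     // and ippiResizeGetSrcOffset_<type> before invoking the actual
     // ippiResize{Linear,Cubic}_<type>_<channels>R function.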
2750
2751 #if IPP_VERSION_X100 >= 710
2752 class IPPresizeInvoker :
2753     public ParallelLoopBody
2754 {
2755 public:
2756     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
2757         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
2758         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
2759         ippiResize(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
2760     {
2761         *ok = true;
2762         IppiSize srcSize, dstSize;
2763         int type = src.type(), specSize = 0, initSize = 0;
2764         srcSize.width  = src.cols;
2765         srcSize.height = src.rows;
2766         dstSize.width  = dst.cols;
2767         dstSize.height = dst.rows;
2768
2769         switch (type)
2770         {
2771 #if IPP_DISABLE_BLOCK // disabled since it breaks tests for CascadeClassifier
2772             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
2773             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
2774             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
2775 #endif
2776             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
2777             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
2778             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
2779             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
2780             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
2781             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
2782             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
2783             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
2784             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
2785             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
2786             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
2787             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
2788             default: { *ok = false; return; }
2789         }
2790     }
2791
2792     ~IPPresizeInvoker()
2793     {
2794     }
2795
2796     virtual void operator() (const Range& range) const
2797     {
2798         if (*ok == false)
2799             return;
2800
2801         int cn = src.channels();
2802         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
2803         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
2804         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
2805
2806         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
2807         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
2808         int bufsize = 0, itemSize = (int)src.elemSize1();
2809
2810         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
2811         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
2812
2813         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
2814         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
2815
2816         AutoBuffer<uchar> buf(bufsize + 64);
2817         uchar* bufptr = alignPtr((uchar*)buf, 32);
2818
2819         if( CV_INSTRUMENT_FUN_IPP(ippiResize, pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr) < 0 )
2820             *ok = false;
2821         else
2822         {
2823             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2824         }
2825     }
2826 private:
2827     const Mat & src;
2828     Mat & dst;
2829     double inv_scale_x;
2830     double inv_scale_y;
2831     void *pSpec;
2832     AutoBuffer<uchar> specBuf;
2833     int mode;
2834     ippiResizeFunc ippiResize;
2835     ippiResizeGetBufferSize getBufferSizeFunc;
2836     ippiResizeGetSrcOffset getSrcOffsetFunc;
2837     bool *ok;
2838     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
2839 };
2840
2841 #endif
2842
2843 #ifdef HAVE_OPENCL
2844
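     // OpenCL counterpart of computeResizeAreaTab: the same decimation scheme,
     // but with the results split into flat map/alpha arrays plus an offset
     // table, so the GPU kernel can index them directly; ofs_tab[dx] gives the
     // start of dx's coefficient run and a final entry terminates the table.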
2845 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
2846                                       float * const alpha_tab, int * const ofs_tab)
2847 {
2848     int k = 0, dx = 0;
2849     for ( ; dx < dsize; dx++)
2850     {
2851         ofs_tab[dx] = k;
2852
2853         double fsx1 = dx * scale;
2854         double fsx2 = fsx1 + scale;
2855         double cellWidth = std::min(scale, ssize - fsx1);
2856
2857         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2858
2859         sx2 = std::min(sx2, ssize - 1);
2860         sx1 = std::min(sx1, sx2);
2861
2862         if (sx1 - fsx1 > 1e-3)
2863         {
2864             map_tab[k] = sx1 - 1;
2865             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
2866         }
2867
2868         for (int sx = sx1; sx < sx2; sx++)
2869         {
2870             map_tab[k] = sx;
2871             alpha_tab[k++] = float(1.0 / cellWidth);
2872         }
2873
2874         if (fsx2 - sx2 > 1e-3)
2875         {
2876             map_tab[k] = sx2;
2877             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2878         }
2879     }
2880     ofs_tab[dx] = k;
2881 }
2882
2883 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
2884                         double fx, double fy, int interpolation)
2885 {
2886     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2887
2888     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
2889     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
2890     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
2891     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
2892         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
2893
2894     // when scale_x and scale_y are both equal to 2,
2895     // fast INTER_AREA produces the same result as INTER_LINEAR
2896     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2897         /*interpolation = INTER_AREA*/(void)0; // but the INTER_AREA kernel is slower here, so keep INTER_LINEAR
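         // Why the two coincide for a 2x shrink: with inv_fx = inv_fy = 0.5 the
         // destination pixel center maps to (2*dx + 0.5, 2*dy + 0.5), the exact
         // midpoint of a 2x2 source block, so all four bilinear weights are
         // 0.25, i.e. the block average computed by fast INTER_AREA.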
2898
2899     if( !(cn <= 4 &&
2900            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
2901             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
2902         return false;
2903
2904     UMat src = _src.getUMat();
2905     _dst.create(dsize, type);
2906     UMat dst = _dst.getUMat();
2907
2908     Size ssize = src.size();
2909     ocl::Kernel k;
2910     size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
2911
2912     ocl::Image2D srcImage;
2913
2914     // See if this could be done with a sampler.  We stick with integer
2915     // datatypes because the observed error is low.
2916     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
2917                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
2918                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
2919                        src.offset==0);
2920     if (useSampler)
2921     {
2922         int wdepth = std::max(depth, CV_32S);
2923         char buf[2][32];
2924         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
2925                         "-D convertToDT=%s -D cn=%d",
2926                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2927                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2928                         cn);
2929         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
2930
2931         if (k.empty())
2932             useSampler = false;
2933         else
2934         {
2935             // Convert the input into an OpenCL image type, using normalized channel data types
2936             // and aliasing the UMat.
2937             srcImage = ocl::Image2D(src, true, true);
2938             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
2939                    (float)inv_fx, (float)inv_fy);
2940         }
2941     }
2942
2943     if (interpolation == INTER_LINEAR && !useSampler)
2944     {
2945         char buf[2][32];
2946
2947         // the integer path is slower because of its CPU-side table setup, so it is disabled
2948         if (depth == CV_8U && ((void)0, 0))
2949         {
2950             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
2951             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
2952             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
2953             float fxx, fyy;
2954             int sx, sy;
2955
2956             for (int dx = 0; dx < dsize.width; dx++)
2957             {
2958                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
2959                 sx = cvFloor(fxx);
2960                 fxx -= sx;
2961
2962                 if (sx < 0)
2963                     fxx = 0, sx = 0;
2964
2965                 if (sx >= ssize.width-1)
2966                     fxx = 0, sx = ssize.width-1;
2967
2968                 xofs[dx] = sx;
2969                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
2970                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
2971             }
2972
2973             for (int dy = 0; dy < dsize.height; dy++)
2974             {
2975                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
2976                 sy = cvFloor(fyy);
2977                 fyy -= sy;
2978
2979                 yofs[dy] = sy;
2980                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
2981                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
2982             }
2983
2984             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2985             UMat coeffs;
2986             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
2987
2988             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2989                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
2990                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2991                             "-D INTER_RESIZE_COEF_BITS=%d",
2992                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2993                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2994                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2995                             cn, INTER_RESIZE_COEF_BITS));
2996             if (k.empty())
2997                 return false;
2998
2999             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3000                    ocl::KernelArg::PtrReadOnly(coeffs));
3001         }
3002         else
3003         {
3004             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
3005             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
3006                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
3007                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
3008                             "-D INTER_RESIZE_COEF_BITS=%d",
3009                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
3010                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
3011                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
3012                             cn, INTER_RESIZE_COEF_BITS));
3013             if (k.empty())
3014                 return false;
3015
3016             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3017                    (float)inv_fx, (float)inv_fy);
3018         }
3019     }
3020     else if (interpolation == INTER_NEAREST)
3021     {
3022         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
3023                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
3024                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
3025         if (k.empty())
3026             return false;
3027
3028         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
3029                (float)inv_fx, (float)inv_fy);
3030     }
3031     else if (interpolation == INTER_AREA)
3032     {
3033         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
3034         int wtype = CV_MAKE_TYPE(wdepth, cn);
3035
3036         char cvt[2][40];
3037         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
3038                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
3039                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
3040
3041         UMat alphaOcl, tabofsOcl, mapOcl;
3042         UMat dmap, smap;
3043
3044         if (is_area_fast)
3045         {
3046             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
3047             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
3048                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
3049                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
3050                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
3051                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
3052
3053             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
3054             if (k.empty())
3055                 return false;
3056         }
3057         else
3058         {
3059             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
3060             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
3061             if (k.empty())
3062                 return false;
3063
3064             int xytab_size = (ssize.width + ssize.height) << 1;
3065             int tabofs_size = dsize.height + dsize.width + 2;
3066
3067             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
3068             AutoBuffer<float> _xyalpha_tab(xytab_size);
3069             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
3070             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
3071             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
3072
3073             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
3074             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
3075
3076             // loading precomputed arrays to GPU
3077             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
3078             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
3079             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
3080         }
3081
3082         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
3083
3084         if (is_area_fast)
3085             k.args(srcarg, dstarg);
3086         else
3087             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
3088                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
3089
3090         return k.run(2, globalsize, NULL, false);
3091     }
3092
3093     return k.run(2, globalsize, 0, false);
3094 }
3095
3096 #endif
3097
3098 #if IPP_VERSION_X100 >= 710
3099 static bool ipp_resize_mt(Mat & src, Mat & dst,
3100                           double inv_scale_x, double inv_scale_y, int interpolation)
3101 {
3102     CV_INSTRUMENT_REGION_IPP()
3103
3104     int mode = -1;
3105     if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
3106         mode = ippLinear;
3107     else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
3108         mode = ippCubic;
3109     else
3110         return false;
3111
3112     bool ok = true;
3113     Range range(0, src.rows);
3114     IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
3115     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3116     if( ok )
3117         return true;
3118
3119     return false;
3120 }
3121 #endif
3122
3123 //==================================================================================================
3124
3125 namespace hal {
3126
3127 void resize(int src_type,
3128             const uchar * src_data, size_t src_step, int src_width, int src_height,
3129             uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
3130             double inv_scale_x, double inv_scale_y, int interpolation)
3131 {
3132     CV_INSTRUMENT_REGION()
3133
3134     CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
3135     if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
3136     {
3137         inv_scale_x = static_cast<double>(dst_width) / src_width;
3138         inv_scale_y = static_cast<double>(dst_height) / src_height;
3139     }
3140
3141     CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);
3142
3143     static ResizeFunc linear_tab[] =
3144     {
3145         resizeGeneric_<
3146             HResizeLinear<uchar, int, short,
3147                 INTER_RESIZE_COEF_SCALE,
3148                 HResizeLinearVec_8u32s>,
3149             VResizeLinear<uchar, int, short,
3150                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3151                 VResizeLinearVec_32s8u> >,
3152         0,
3153         resizeGeneric_<
3154             HResizeLinear<ushort, float, float, 1,
3155                 HResizeLinearVec_16u32f>,
3156             VResizeLinear<ushort, float, float, Cast<float, ushort>,
3157                 VResizeLinearVec_32f16u> >,
3158         resizeGeneric_<
3159             HResizeLinear<short, float, float, 1,
3160                 HResizeLinearVec_16s32f>,
3161             VResizeLinear<short, float, float, Cast<float, short>,
3162                 VResizeLinearVec_32f16s> >,
3163         0,
3164         resizeGeneric_<
3165             HResizeLinear<float, float, float, 1,
3166                 HResizeLinearVec_32f>,
3167             VResizeLinear<float, float, float, Cast<float, float>,
3168                 VResizeLinearVec_32f> >,
3169         resizeGeneric_<
3170             HResizeLinear<double, double, float, 1,
3171                 HResizeNoVec>,
3172             VResizeLinear<double, double, float, Cast<double, double>,
3173                 VResizeNoVec> >,
3174         0
3175     };
3176
3177     static ResizeFunc cubic_tab[] =
3178     {
3179         resizeGeneric_<
3180             HResizeCubic<uchar, int, short>,
3181             VResizeCubic<uchar, int, short,
3182                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3183                 VResizeCubicVec_32s8u> >,
3184         0,
3185         resizeGeneric_<
3186             HResizeCubic<ushort, float, float>,
3187             VResizeCubic<ushort, float, float, Cast<float, ushort>,
3188             VResizeCubicVec_32f16u> >,
3189         resizeGeneric_<
3190             HResizeCubic<short, float, float>,
3191             VResizeCubic<short, float, float, Cast<float, short>,
3192             VResizeCubicVec_32f16s> >,
3193         0,
3194         resizeGeneric_<
3195             HResizeCubic<float, float, float>,
3196             VResizeCubic<float, float, float, Cast<float, float>,
3197             VResizeCubicVec_32f> >,
3198         resizeGeneric_<
3199             HResizeCubic<double, double, float>,
3200             VResizeCubic<double, double, float, Cast<double, double>,
3201             VResizeNoVec> >,
3202         0
3203     };
3204
3205     static ResizeFunc lanczos4_tab[] =
3206     {
3207         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
3208             VResizeLanczos4<uchar, int, short,
3209             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
3210             VResizeNoVec> >,
3211         0,
3212         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
3213             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
3214             VResizeLanczos4Vec_32f16u> >,
3215         resizeGeneric_<HResizeLanczos4<short, float, float>,
3216             VResizeLanczos4<short, float, float, Cast<float, short>,
3217             VResizeLanczos4Vec_32f16s> >,
3218         0,
3219         resizeGeneric_<HResizeLanczos4<float, float, float>,
3220             VResizeLanczos4<float, float, float, Cast<float, float>,
3221             VResizeLanczos4Vec_32f> >,
3222         resizeGeneric_<HResizeLanczos4<double, double, float>,
3223             VResizeLanczos4<double, double, float, Cast<double, double>,
3224             VResizeNoVec> >,
3225         0
3226     };
3227
3228     static ResizeAreaFastFunc areafast_tab[] =
3229     {
3230         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
3231         0,
3232         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
3233         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
3234         0,
3235         resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
3236         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
3237         0
3238     };
3239
3240     static ResizeAreaFunc area_tab[] =
3241     {
3242         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
3243         resizeArea_<short, float>, 0, resizeArea_<float, float>,
3244         resizeArea_<double, double>, 0
3245     };
3246
3247     int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
3248     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
3249
3250     int iscale_x = saturate_cast<int>(scale_x);
3251     int iscale_y = saturate_cast<int>(scale_y);
3252
3253     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
3254             std::abs(scale_y - iscale_y) < DBL_EPSILON;
3255
3256     Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
3257                       saturate_cast<int>(src_height*inv_scale_y));
3258     CV_Assert( dsize.area() > 0 );
3259
3260     Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
3261     Mat dst(dsize, src_type, dst_data, dst_step);
3262
3263 #ifdef HAVE_IPP
3264     int mode = -1;
3265     if (interpolation == INTER_LINEAR && src_height >= 2 && src_width >= 2)
3266         mode = INTER_LINEAR;
3267     else if (interpolation == INTER_CUBIC && src_height >= 4 && src_width >= 4)
3268         mode = INTER_CUBIC;
3269
3270     const double IPP_RESIZE_EPS = 1e-10;
3271     double ex = fabs((double)dsize.width / src_width  - inv_scale_x) / inv_scale_x;
3272     double ey = fabs((double)dsize.height / src_height - inv_scale_y) / inv_scale_y;
3273 #endif
3274     CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
3275         (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
3276         !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) &&
3277         mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
3278         (depth == CV_64F && mode == INTER_LINEAR)),
3279         ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))
3280
3281     if( interpolation == INTER_NEAREST )
3282     {
3283         resizeNN( src, dst, inv_scale_x, inv_scale_y );
3284         return;
3285     }
3286
3287     int k, sx, sy, dx, dy;
3288
3289
3290     {
3291         // when scale_x and scale_y are both equal to 2,
3292         // fast INTER_AREA produces the same result as INTER_LINEAR
3293         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
3294             interpolation = INTER_AREA;
3295
3296         // true "area" interpolation is only implemented for decimation (scale_x >= 1 && scale_y >= 1).
3297         // In other cases it is emulated using a variant of bilinear interpolation
3298         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
3299         {
3300             if( is_area_fast )
3301             {
3302                 int area = iscale_x*iscale_y;
3303                 size_t srcstep = src_step / src.elemSize1();
3304                 AutoBuffer<int> _ofs(area + dsize.width*cn);
3305                 int* ofs = _ofs;
3306                 int* xofs = ofs + area;
3307                 ResizeAreaFastFunc func = areafast_tab[depth];
3308                 CV_Assert( func != 0 );
3309
3310                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
3311                     for( sx = 0; sx < iscale_x; sx++ )
3312                         ofs[k++] = (int)(sy*srcstep + sx*cn);
3313
3314                 for( dx = 0; dx < dsize.width; dx++ )
3315                 {
3316                     int j = dx * cn;
3317                     sx = iscale_x * j;
3318                     for( k = 0; k < cn; k++ )
3319                         xofs[j + k] = sx + k;
3320                 }
3321
3322                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
3323                 return;
3324             }
3325
3326             ResizeAreaFunc func = area_tab[depth];
3327             CV_Assert( func != 0 && cn <= 4 );
3328
3329             AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
3330             DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2;
3331
3332             int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
3333             int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);
3334
3335             AutoBuffer<int> _tabofs(dsize.height + 1);
3336             int* tabofs = _tabofs;
3337             for( k = 0, dy = 0; k < ytab_size; k++ )
3338             {
3339                 if( k == 0 || ytab[k].di != ytab[k-1].di )
3340                 {
3341                     assert( ytab[k].di == dy );
3342                     tabofs[dy++] = k;
3343                 }
3344             }
3345             tabofs[dy] = ytab_size;
3346
3347             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
3348             return;
3349         }
3350     }
3351
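         // Generic separable path: precompute, per destination column, source
         // element offsets (xofs) and ksize horizontal weights (alpha/ialpha),
         // and per destination row a source row (yofs) and vertical weights
         // (beta/ibeta). [xmin, xmax) marks the columns whose horizontal kernel
         // fits entirely inside the source row, needing no border clamping.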
3352     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
3353     bool area_mode = interpolation == INTER_AREA;
3354     bool fixpt = depth == CV_8U;
3355     float fx, fy;
3356     ResizeFunc func=0;
3357     int ksize=0, ksize2;
3358     if( interpolation == INTER_CUBIC )
3359         ksize = 4, func = cubic_tab[depth];
3360     else if( interpolation == INTER_LANCZOS4 )
3361         ksize = 8, func = lanczos4_tab[depth];
3362     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
3363         ksize = 2, func = linear_tab[depth];
3364     else
3365         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
3366     ksize2 = ksize/2;
3367
3368     CV_Assert( func != 0 );
3369
3370     AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
3371     int* xofs = (int*)(uchar*)_buffer;
3372     int* yofs = xofs + width;
3373     float* alpha = (float*)(yofs + dsize.height);
3374     short* ialpha = (short*)alpha;
3375     float* beta = alpha + width*ksize;
3376     short* ibeta = ialpha + width*ksize;
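         // _buffer layout (the short tables alias the float tables: the CV_8U
         // fixed-point path uses ialpha/ibeta, the float path alpha/beta):
         //   int   xofs [width]               - source element offset per dst column
         //   int   yofs [dsize.height]        - source row index per dst row
         //   float alpha[width*ksize]         - horizontal weights (alias: ialpha)
         //   float beta [dsize.height*ksize]  - vertical weights   (alias: ibeta)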
3377     float cbuf[MAX_ESIZE];
3378
3379     for( dx = 0; dx < dsize.width; dx++ )
3380     {
3381         if( !area_mode )
3382         {
3383             fx = (float)((dx+0.5)*scale_x - 0.5);
3384             sx = cvFloor(fx);
3385             fx -= sx;
3386         }
3387         else
3388         {
3389             sx = cvFloor(dx*scale_x);
3390             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
3391             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
3392         }
3393
3394         if( sx < ksize2-1 )
3395         {
3396             xmin = dx+1;
3397             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3398                 fx = 0, sx = 0;
3399         }
3400
3401         if( sx + ksize2 >= src_width )
3402         {
3403             xmax = std::min( xmax, dx );
3404             if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3405                 fx = 0, sx = src_width-1;
3406         }
3407
3408         for( k = 0, sx *= cn; k < cn; k++ )
3409             xofs[dx*cn + k] = sx + k;
3410
3411         if( interpolation == INTER_CUBIC )
3412             interpolateCubic( fx, cbuf );
3413         else if( interpolation == INTER_LANCZOS4 )
3414             interpolateLanczos4( fx, cbuf );
3415         else
3416         {
3417             cbuf[0] = 1.f - fx;
3418             cbuf[1] = fx;
3419         }
3420         if( fixpt )
3421         {
3422             for( k = 0; k < ksize; k++ )
3423                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3424             for( ; k < cn*ksize; k++ )
3425                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
3426         }
3427         else
3428         {
3429             for( k = 0; k < ksize; k++ )
3430                 alpha[dx*cn*ksize + k] = cbuf[k];
3431             for( ; k < cn*ksize; k++ )
3432                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
3433         }
3434     }
3435
3436     for( dy = 0; dy < dsize.height; dy++ )
3437     {
3438         if( !area_mode )
3439         {
3440             fy = (float)((dy+0.5)*scale_y - 0.5);
3441             sy = cvFloor(fy);
3442             fy -= sy;
3443         }
3444         else
3445         {
3446             sy = cvFloor(dy*scale_y);
3447             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
3448             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
3449         }
3450
3451         yofs[dy] = sy;
3452         if( interpolation == INTER_CUBIC )
3453             interpolateCubic( fy, cbuf );
3454         else if( interpolation == INTER_LANCZOS4 )
3455             interpolateLanczos4( fy, cbuf );
3456         else
3457         {
3458             cbuf[0] = 1.f - fy;
3459             cbuf[1] = fy;
3460         }
3461
3462         if( fixpt )
3463         {
3464             for( k = 0; k < ksize; k++ )
3465                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3466         }
3467         else
3468         {
3469             for( k = 0; k < ksize; k++ )
3470                 beta[dy*ksize + k] = cbuf[k];
3471         }
3472     }
3473
3474     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
3475           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
3476 }
3477
3478 } // cv::hal::
3479 } // cv::
3480
3481 //==================================================================================================
3482
3483 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
3484                  double inv_scale_x, double inv_scale_y, int interpolation )
3485 {
3486     CV_INSTRUMENT_REGION()
3487
3488     Size ssize = _src.size();
3489
3490     CV_Assert( ssize.width > 0 && ssize.height > 0 );
3491     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
3492     if( dsize.area() == 0 )
3493     {
3494         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
3495                      saturate_cast<int>(ssize.height*inv_scale_y));
3496         CV_Assert( dsize.area() > 0 );
3497     }
3498     else
3499     {
3500         inv_scale_x = (double)dsize.width/ssize.width;
3501         inv_scale_y = (double)dsize.height/ssize.height;
3502     }
3503
3504     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
3505                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
3506
3507     Mat src = _src.getMat();
3508     _dst.create(dsize, src.type());
3509     Mat dst = _dst.getMat();
3510
3511     if (dsize == ssize)
3512     {
3513         // Source and destination have the same size; use a simple copy.
3514         src.copyTo(dst);
3515         return;
3516     }
3517
3518     hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
3519 }
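
     // A minimal usage sketch of cv::resize (hypothetical file name and sizes):
     //
     //     Mat src = imread("input.png"), half, big;
     //     resize(src, half, Size(), 0.5, 0.5, INTER_AREA);        // 2x decimation
     //     resize(src, big, Size(1920, 1080), 0, 0, INTER_CUBIC);  // enlargement
     //
     // INTER_AREA averages all covered source pixels and is the usual choice
     // for shrinking; INTER_LINEAR (the default) or INTER_CUBIC suit enlargement.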
3520
3521
3522 /****************************************************************************************\
3523 *                       General warping (affine, perspective, remap)                     *
3524 \****************************************************************************************/
3525
3526 namespace cv
3527 {
3528
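     // Nearest-neighbour remap: for each destination pixel, _xy stores the
     // integer source coordinates as interleaved shorts (sx = XY[dx*2],
     // sy = XY[dx*2+1]), so dst(dy, dx) = src(sy, sx); coordinates falling
     // outside the source are resolved according to borderType/_borderValue.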
3529 template<typename T>
3530 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
3531                           int borderType, const Scalar& _borderValue )
3532 {
3533     Size ssize = _src.size(), dsize = _dst.size();
3534     int cn = _src.channels();
3535     const T* S0 = _src.ptr<T>();
3536     size_t sstep = _src.step/sizeof(S0[0]);
3537     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3538         saturate_cast<T>(_borderValue[1]),
3539         saturate_cast<T>(_borderValue[2]),
3540         saturate_cast<T>(_borderValue[3]));
3541     int dx, dy;
3542
3543     unsigned width1 = ssize.width, height1 = ssize.height;
3544
3545     if( _dst.isContinuous() && _xy.isContinuous() )
3546     {
3547         dsize.width *= dsize.height;
3548         dsize.height = 1;
3549     }
3550
3551     for( dy = 0; dy < dsize.height; dy++ )
3552     {
3553         T* D = _dst.ptr<T>(dy);
3554         const short* XY = _xy.ptr<short>(dy);
3555
3556         if( cn == 1 )
3557         {
3558             for( dx = 0; dx < dsize.width; dx++ )
3559             {
3560                 int sx = XY[dx*2], sy = XY[dx*2+1];
3561                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3562                     D[dx] = S0[sy*sstep + sx];
3563                 else
3564                 {
3565                     if( borderType == BORDER_REPLICATE )
3566                     {
3567                         sx = clip(sx, 0, ssize.width);
3568                         sy = clip(sy, 0, ssize.height);
3569                         D[dx] = S0[sy*sstep + sx];
3570                     }
3571                     else if( borderType == BORDER_CONSTANT )
3572                         D[dx] = cval[0];
3573                     else if( borderType != BORDER_TRANSPARENT )
3574                     {
3575                         sx = borderInterpolate(sx, ssize.width, borderType);
3576                         sy = borderInterpolate(sy, ssize.height, borderType);
3577                         D[dx] = S0[sy*sstep + sx];
3578                     }
3579                 }
3580             }
3581         }
3582         else
3583         {
3584             for( dx = 0; dx < dsize.width; dx++, D += cn )
3585             {
3586                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3587                 const T *S;
3588                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3589                 {
3590                     if( cn == 3 )
3591                     {
3592                         S = S0 + sy*sstep + sx*3;
3593                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
3594                     }
3595                     else if( cn == 4 )
3596                     {
3597                         S = S0 + sy*sstep + sx*4;
3598                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
3599                     }
3600                     else
3601                     {
3602                         S = S0 + sy*sstep + sx*cn;
3603                         for( k = 0; k < cn; k++ )
3604                             D[k] = S[k];
3605                     }
3606                 }
3607                 else if( borderType != BORDER_TRANSPARENT )
3608                 {
3609                     if( borderType == BORDER_REPLICATE )
3610                     {
3611                         sx = clip(sx, 0, ssize.width);
3612                         sy = clip(sy, 0, ssize.height);
3613                         S = S0 + sy*sstep + sx*cn;
3614                     }
3615                     else if( borderType == BORDER_CONSTANT )
3616                         S = &cval[0];
3617                     else
3618                     {
3619                         sx = borderInterpolate(sx, ssize.width, borderType);
3620                         sy = borderInterpolate(sy, ssize.height, borderType);
3621                         S = S0 + sy*sstep + sx*cn;
3622                     }
3623                     for( k = 0; k < cn; k++ )
3624                         D[k] = S[k];
3625                 }
3626             }
3627         }
3628     }
3629 }
3630
3631
3632 struct RemapNoVec
3633 {
3634     int operator()( const Mat&, void*, const short*, const ushort*,
3635                     const void*, int ) const { return 0; }
3636 };
3637
3638 #if CV_SSE2
3639
3640 struct RemapVec_8u
3641 {
3642     int operator()( const Mat& _src, void* _dst, const short* XY,
3643                     const ushort* FXY, const void* _wtab, int width ) const
3644     {
3645         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
3646
3647         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
3648             sstep > 0x8000 )
3649             return 0;
3650
3651         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
3652         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
3653         uchar* D = (uchar*)_dst;
3654         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
3655         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
3656         __m128i z = _mm_setzero_si128();
3657         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
3658
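             // xy2ofs packs (cn, sstep) into each 32-bit lane, so one
             // _mm_madd_epi16 over the interleaved (sx, sy) shorts yields the
             // byte offset sx*cn + sy*sstep of every source pixel at once; the
             // offsets are spilled to iofs0/iofs1 and used for scalar gathers
             // from row pointers S0 (top) and S1 (one row below).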
3659         if( cn == 1 )
3660         {
3661             for( ; x <= width - 8; x += 8 )
3662             {
3663                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3664                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
3665                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
3666                 unsigned i0, i1;
3667
3668                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3669                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
3670                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3671                 _mm_store_si128( (__m128i*)iofs1, xy1 );
3672
3673                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
3674                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
3675                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3676                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
3677                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
3678                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3679                 v0 = _mm_unpacklo_epi8(v0, z);
3680                 v1 = _mm_unpacklo_epi8(v1, z);
3681
3682                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
3683                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
3684                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
3685                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
3686                 b0 = _mm_unpacklo_epi64(a0, a1);
3687                 b1 = _mm_unpackhi_epi64(a0, a1);
3688                 v0 = _mm_madd_epi16(v0, b0);
3689                 v1 = _mm_madd_epi16(v1, b1);
3690                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
3691
3692                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
3693                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
3694                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3695                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
3696                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
3697                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3698                 v2 = _mm_unpacklo_epi8(v2, z);
3699                 v3 = _mm_unpacklo_epi8(v3, z);
3700
3701                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
3702                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
3703                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
3704                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
3705                 b0 = _mm_unpacklo_epi64(a0, a1);
3706                 b1 = _mm_unpackhi_epi64(a0, a1);
3707                 v2 = _mm_madd_epi16(v2, b0);
3708                 v3 = _mm_madd_epi16(v3, b1);
3709                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
3710
3711                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
3712                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
3713                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
3714                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
3715             }
3716         }
3717         else if( cn == 3 )
3718         {
3719             for( ; x <= width - 5; x += 4, D += 12 )
3720             {
3721                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3722                 __m128i u0, v0, u1, v1;
3723
3724                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3725                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3726                 const __m128i *w0, *w1;
3727                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3728                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3729
3730                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3731                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
3732                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3733                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
3734                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3735                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
3736                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3737                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
3738                 u0 = _mm_unpacklo_epi8(u0, z);
3739                 v0 = _mm_unpacklo_epi8(v0, z);
3740                 u1 = _mm_unpacklo_epi8(u1, z);
3741                 v1 = _mm_unpacklo_epi8(v1, z);
3742                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3743                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3744                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3745                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3746                 u0 = _mm_slli_si128(u0, 4);
3747                 u0 = _mm_packs_epi32(u0, u1);
3748                 u0 = _mm_packus_epi16(u0, u0);
3749                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
3750
3751                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3752                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3753
3754                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3755                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
3756                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3757                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
3758                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3759                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
3760                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3761                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
3762                 u0 = _mm_unpacklo_epi8(u0, z);
3763                 v0 = _mm_unpacklo_epi8(v0, z);
3764                 u1 = _mm_unpacklo_epi8(u1, z);
3765                 v1 = _mm_unpacklo_epi8(v1, z);
3766                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3767                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3768                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3769                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3770                 u0 = _mm_slli_si128(u0, 4);
3771                 u0 = _mm_packs_epi32(u0, u1);
3772                 u0 = _mm_packus_epi16(u0, u0);
3773                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
3774             }
3775         }
3776         else if( cn == 4 )
3777         {
3778             for( ; x <= width - 4; x += 4, D += 16 )
3779             {
3780                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3781                 __m128i u0, v0, u1, v1;
3782
3783                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3784                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3785                 const __m128i *w0, *w1;
3786                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3787                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3788
3789                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3790                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
3791                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3792                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
3793                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3794                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
3795                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3796                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
3797                 u0 = _mm_unpacklo_epi8(u0, z);
3798                 v0 = _mm_unpacklo_epi8(v0, z);
3799                 u1 = _mm_unpacklo_epi8(u1, z);
3800                 v1 = _mm_unpacklo_epi8(v1, z);
3801                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3802                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3803                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3804                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3805                 u0 = _mm_packs_epi32(u0, u1);
3806                 u0 = _mm_packus_epi16(u0, u0);
3807                 _mm_storel_epi64((__m128i*)D, u0);
3808
3809                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3810                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3811
3812                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3813                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
3814                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3815                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
3816                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3817                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
3818                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3819                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
3820                 u0 = _mm_unpacklo_epi8(u0, z);
3821                 v0 = _mm_unpacklo_epi8(v0, z);
3822                 u1 = _mm_unpacklo_epi8(u1, z);
3823                 v1 = _mm_unpacklo_epi8(v1, z);
3824                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3825                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3826                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3827                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3828                 u0 = _mm_packs_epi32(u0, u1);
3829                 u0 = _mm_packus_epi16(u0, u0);
3830                 _mm_storel_epi64((__m128i*)(D + 8), u0);
3831             }
3832         }
3833
3834         return x;
3835     }
3836 };
3837
3838 #else
3839
3840 typedef RemapNoVec RemapVec_8u;
3841
3842 #endif
3843
3844
3845 template<class CastOp, class VecOp, typename AT>
3846 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3847                            const Mat& _fxy, const void* _wtab,
3848                            int borderType, const Scalar& _borderValue )
3849 {
3850     typedef typename CastOp::rtype T;
3851     typedef typename CastOp::type1 WT;
3852     Size ssize = _src.size(), dsize = _dst.size();
3853     int k, cn = _src.channels();
3854     const AT* wtab = (const AT*)_wtab;
3855     const T* S0 = _src.ptr<T>();
3856     size_t sstep = _src.step/sizeof(S0[0]);
3857     T cval[CV_CN_MAX];
3858     int dx, dy;
3859     CastOp castOp;
3860     VecOp vecOp;
3861
3862     for( k = 0; k < cn; k++ )
3863         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
3864
3865     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3866     CV_Assert( ssize.area() > 0 );
3867 #if CV_SSE2
3868     if( _src.type() == CV_8UC3 )
3869         width1 = std::max(ssize.width-2, 0);
3870 #endif
3871
3872     for( dy = 0; dy < dsize.height; dy++ )
3873     {
3874         T* D = _dst.ptr<T>(dy);
3875         const short* XY = _xy.ptr<short>(dy);
3876         const ushort* FXY = _fxy.ptr<ushort>(dy);
3877         int X0 = 0;
3878         bool prevInlier = false;
3879
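             // Scan the row once, splitting it into maximal runs of "inlier"
             // pixels (whole 2x2 bilinear footprint inside the image) and
             // outliers: each inlier run [dx, X1) goes to the SIMD vecOp with a
             // scalar tail, while outlier runs take the border handling below.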
3880         for( dx = 0; dx <= dsize.width; dx++ )
3881         {
3882             bool curInlier = dx < dsize.width ?
3883                 (unsigned)XY[dx*2] < width1 &&
3884                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3885             if( curInlier == prevInlier )
3886                 continue;
3887
3888             int X1 = dx;
3889             dx = X0;
3890             X0 = X1;
3891             prevInlier = curInlier;
3892
3893             if( !curInlier )
3894             {
3895                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
3896                 D += len*cn;
3897                 dx += len;
3898
3899                 if( cn == 1 )
3900                 {
3901                     for( ; dx < X1; dx++, D++ )
3902                     {
3903                         int sx = XY[dx*2], sy = XY[dx*2+1];
3904                         const AT* w = wtab + FXY[dx]*4;
3905                         const T* S = S0 + sy*sstep + sx;
3906                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
3907                     }
3908                 }
3909                 else if( cn == 2 )
3910                     for( ; dx < X1; dx++, D += 2 )
3911                     {
3912                         int sx = XY[dx*2], sy = XY[dx*2+1];
3913                         const AT* w = wtab + FXY[dx]*4;
3914                         const T* S = S0 + sy*sstep + sx*2;
3915                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3916                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3917                         D[0] = castOp(t0); D[1] = castOp(t1);
3918                     }
3919                 else if( cn == 3 )
3920                     for( ; dx < X1; dx++, D += 3 )
3921                     {
3922                         int sx = XY[dx*2], sy = XY[dx*2+1];
3923                         const AT* w = wtab + FXY[dx]*4;
3924                         const T* S = S0 + sy*sstep + sx*3;
3925                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3926                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3927                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3928                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
3929                     }
3930                 else if( cn == 4 )
3931                     for( ; dx < X1; dx++, D += 4 )
3932                     {
3933                         int sx = XY[dx*2], sy = XY[dx*2+1];
3934                         const AT* w = wtab + FXY[dx]*4;
3935                         const T* S = S0 + sy*sstep + sx*4;
3936                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3937                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3938                         D[0] = castOp(t0); D[1] = castOp(t1);
3939                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3940                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3941                         D[2] = castOp(t0); D[3] = castOp(t1);
3942                     }
3943                 else
3944                     for( ; dx < X1; dx++, D += cn )
3945                     {
3946                         int sx = XY[dx*2], sy = XY[dx*2+1];
3947                         const AT* w = wtab + FXY[dx]*4;
3948                         const T* S = S0 + sy*sstep + sx*cn;
3949                         for( k = 0; k < cn; k++ )
3950                         {
3951                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
3952                             D[k] = castOp(t0);
3953                         }
3954                     }
3955             }
3956             else
3957             {
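                // BORDER_TRANSPARENT leaves the destination untouched wherever
                // the source sample is not fully inside, so this outlier span
                // can be skipped wholesale (cn == 3 falls through to the
                // per-pixel loops below, which perform the equivalent check)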
3958                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
3959                 {
3960                     D += (X1 - dx)*cn;
3961                     dx = X1;
3962                     continue;
3963                 }
3964
3965                 if( cn == 1 )
3966                     for( ; dx < X1; dx++, D++ )
3967                     {
3968                         int sx = XY[dx*2], sy = XY[dx*2+1];
3969                         if( borderType == BORDER_CONSTANT &&
3970                             (sx >= ssize.width || sx+1 < 0 ||
3971                              sy >= ssize.height || sy+1 < 0) )
3972                         {
3973                             D[0] = cval[0];
3974                         }
3975                         else
3976                         {
3977                             int sx0, sx1, sy0, sy1;
3978                             T v0, v1, v2, v3;
3979                             const AT* w = wtab + FXY[dx]*4;
3980                             if( borderType == BORDER_REPLICATE )
3981                             {
3982                                 sx0 = clip(sx, 0, ssize.width);
3983                                 sx1 = clip(sx+1, 0, ssize.width);
3984                                 sy0 = clip(sy, 0, ssize.height);
3985                                 sy1 = clip(sy+1, 0, ssize.height);
3986                                 v0 = S0[sy0*sstep + sx0];
3987                                 v1 = S0[sy0*sstep + sx1];
3988                                 v2 = S0[sy1*sstep + sx0];
3989                                 v3 = S0[sy1*sstep + sx1];
3990                             }
3991                             else
3992                             {
3993                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
3994                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3995                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
3996                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3997                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3998                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3999                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
4000                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
4001                             }
4002                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
4003                         }
4004                     }
4005                 else
4006                     for( ; dx < X1; dx++, D += cn )
4007                     {
4008                         int sx = XY[dx*2], sy = XY[dx*2+1];
4009                         if( borderType == BORDER_CONSTANT &&
4010                             (sx >= ssize.width || sx+1 < 0 ||
4011                              sy >= ssize.height || sy+1 < 0) )
4012                         {
4013                             for( k = 0; k < cn; k++ )
4014                                 D[k] = cval[k];
4015                         }
4016                         else
4017                         {
4018                             int sx0, sx1, sy0, sy1;
4019                             const T *v0, *v1, *v2, *v3;
4020                             const AT* w = wtab + FXY[dx]*4;
4021                             if( borderType == BORDER_REPLICATE )
4022                             {
4023                                 sx0 = clip(sx, 0, ssize.width);
4024                                 sx1 = clip(sx+1, 0, ssize.width);
4025                                 sy0 = clip(sy, 0, ssize.height);
4026                                 sy1 = clip(sy+1, 0, ssize.height);
4027                                 v0 = S0 + sy0*sstep + sx0*cn;
4028                                 v1 = S0 + sy0*sstep + sx1*cn;
4029                                 v2 = S0 + sy1*sstep + sx0*cn;
4030                                 v3 = S0 + sy1*sstep + sx1*cn;
4031                             }
4032                             else if( borderType == BORDER_TRANSPARENT &&
4033                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
4034                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
4035                                 continue;
4036                             else
4037                             {
4038                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
4039                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
4040                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
4041                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
4042                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
4043                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
4044                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
4045                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
4046                             }
4047                             for( k = 0; k < cn; k++ )
4048                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
4049                         }
4050                     }
4051             }
4052         }
4053     }
4054 }
4055
4056
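/*
   Bicubic remap: each destination pixel is interpolated from a 4x4 source
   neighborhood. The fractional part of the mapped coordinate (FXY) selects
   one of the precomputed 4x4 weight tables (16 entries, hence the wtab
   stride of 16). Pixels whose whole neighborhood lies inside the source
   take the fast path; the rest go through borderInterpolate().
*/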
4057 template<class CastOp, typename AT, int ONE>
4058 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
4059                           const Mat& _fxy, const void* _wtab,
4060                           int borderType, const Scalar& _borderValue )
4061 {
4062     typedef typename CastOp::rtype T;
4063     typedef typename CastOp::type1 WT;
4064     Size ssize = _src.size(), dsize = _dst.size();
4065     int cn = _src.channels();
4066     const AT* wtab = (const AT*)_wtab;
4067     const T* S0 = _src.ptr<T>();
4068     size_t sstep = _src.step/sizeof(S0[0]);
4069     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
4070         saturate_cast<T>(_borderValue[1]),
4071         saturate_cast<T>(_borderValue[2]),
4072         saturate_cast<T>(_borderValue[3]));
4073     int dx, dy;
4074     CastOp castOp;
4075     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
4076
4077     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
4078
4079     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
4080     {
4081         dsize.width *= dsize.height;
4082         dsize.height = 1;
4083     }
4084
4085     for( dy = 0; dy < dsize.height; dy++ )
4086     {
4087         T* D = _dst.ptr<T>(dy);
4088         const short* XY = _xy.ptr<short>(dy);
4089         const ushort* FXY = _fxy.ptr<ushort>(dy);
4090
4091         for( dx = 0; dx < dsize.width; dx++, D += cn )
4092         {
4093             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
4094             const AT* w = wtab + FXY[dx]*16;
4095             int i, k;
4096             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
4097             {
4098                 const T* S = S0 + sy*sstep + sx*cn;
4099                 for( k = 0; k < cn; k++ )
4100                 {
4101                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
4102                     S += sstep;
4103                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
4104                     S += sstep;
4105                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
4106                     S += sstep;
4107                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
4108                     S += 1 - sstep*3;
4109                     D[k] = castOp(sum);
4110                 }
4111             }
4112             else
4113             {
4114                 int x[4], y[4];
4115                 if( borderType == BORDER_TRANSPARENT &&
4116                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
4117                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
4118                     continue;
4119
4120                 if( borderType1 == BORDER_CONSTANT &&
4121                     (sx >= ssize.width || sx+4 <= 0 ||
4122                     sy >= ssize.height || sy+4 <= 0))
4123                 {
4124                     for( k = 0; k < cn; k++ )
4125                         D[k] = cval[k];
4126                     continue;
4127                 }
4128
4129                 for( i = 0; i < 4; i++ )
4130                 {
4131                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
4132                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
4133                 }
4134
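                // the 16 weights sum to ONE, so starting the accumulator at
                // cval*ONE and adding (S - cval)*w for each in-range tap makes
                // every out-of-range tap contribute the border value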
4135                 for( k = 0; k < cn; k++, S0++, w -= 16 )
4136                 {
4137                     WT cv = cval[k], sum = cv*ONE;
4138                     for( i = 0; i < 4; i++, w += 4 )
4139                     {
4140                         int yi = y[i];
4141                         const T* S = S0 + yi*sstep;
4142                         if( yi < 0 )
4143                             continue;
4144                         if( x[0] >= 0 )
4145                             sum += (S[x[0]] - cv)*w[0];
4146                         if( x[1] >= 0 )
4147                             sum += (S[x[1]] - cv)*w[1];
4148                         if( x[2] >= 0 )
4149                             sum += (S[x[2]] - cv)*w[2];
4150                         if( x[3] >= 0 )
4151                             sum += (S[x[3]] - cv)*w[3];
4152                     }
4153                     D[k] = castOp(sum);
4154                 }
4155                 S0 -= cn;
4156             }
4157         }
4158     }
4159 }
4160
4161
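/*
   Lanczos-4 remap: same structure as remapBicubic, but with an 8x8
   neighborhood and 64 weights per fractional position (wtab stride of 64).
*/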
4162 template<class CastOp, typename AT, int ONE>
4163 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
4164                            const Mat& _fxy, const void* _wtab,
4165                            int borderType, const Scalar& _borderValue )
4166 {
4167     typedef typename CastOp::rtype T;
4168     typedef typename CastOp::type1 WT;
4169     Size ssize = _src.size(), dsize = _dst.size();
4170     int cn = _src.channels();
4171     const AT* wtab = (const AT*)_wtab;
4172     const T* S0 = _src.ptr<T>();
4173     size_t sstep = _src.step/sizeof(S0[0]);
4174     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
4175         saturate_cast<T>(_borderValue[1]),
4176         saturate_cast<T>(_borderValue[2]),
4177         saturate_cast<T>(_borderValue[3]));
4178     int dx, dy;
4179     CastOp castOp;
4180     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
4181
4182     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
4183
4184     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
4185     {
4186         dsize.width *= dsize.height;
4187         dsize.height = 1;
4188     }
4189
4190     for( dy = 0; dy < dsize.height; dy++ )
4191     {
4192         T* D = _dst.ptr<T>(dy);
4193         const short* XY = _xy.ptr<short>(dy);
4194         const ushort* FXY = _fxy.ptr<ushort>(dy);
4195
4196         for( dx = 0; dx < dsize.width; dx++, D += cn )
4197         {
4198             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
4199             const AT* w = wtab + FXY[dx]*64;
4200             const T* S = S0 + sy*sstep + sx*cn;
4201             int i, k;
4202             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
4203             {
4204                 for( k = 0; k < cn; k++ )
4205                 {
4206                     WT sum = 0;
4207                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
4208                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
4209                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
4210                     w -= 64;
4211                     S -= sstep*8 - 1;
4212                     D[k] = castOp(sum);
4213                 }
4214             }
4215             else
4216             {
4217                 int x[8], y[8];
4218                 if( borderType == BORDER_TRANSPARENT &&
4219                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
4220                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
4221                     continue;
4222
4223                 if( borderType1 == BORDER_CONSTANT &&
4224                     (sx >= ssize.width || sx+8 <= 0 ||
4225                     sy >= ssize.height || sy+8 <= 0))
4226                 {
4227                     for( k = 0; k < cn; k++ )
4228                         D[k] = cval[k];
4229                     continue;
4230                 }
4231
4232                 for( i = 0; i < 8; i++ )
4233                 {
4234                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
4235                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
4236                 }
4237
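                // same out-of-range trick as in remapBicubic: the sum starts
                // at cval*ONE, so missing taps decay to the border value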
4238                 for( k = 0; k < cn; k++, S0++, w -= 64 )
4239                 {
4240                     WT cv = cval[k], sum = cv*ONE;
4241                     for( i = 0; i < 8; i++, w += 8 )
4242                     {
4243                         int yi = y[i];
4244                         const T* S1 = S0 + yi*sstep;
4245                         if( yi < 0 )
4246                             continue;
4247                         if( x[0] >= 0 )
4248                             sum += (S1[x[0]] - cv)*w[0];
4249                         if( x[1] >= 0 )
4250                             sum += (S1[x[1]] - cv)*w[1];
4251                         if( x[2] >= 0 )
4252                             sum += (S1[x[2]] - cv)*w[2];
4253                         if( x[3] >= 0 )
4254                             sum += (S1[x[3]] - cv)*w[3];
4255                         if( x[4] >= 0 )
4256                             sum += (S1[x[4]] - cv)*w[4];
4257                         if( x[5] >= 0 )
4258                             sum += (S1[x[5]] - cv)*w[5];
4259                         if( x[6] >= 0 )
4260                             sum += (S1[x[6]] - cv)*w[6];
4261                         if( x[7] >= 0 )
4262                             sum += (S1[x[7]] - cv)*w[7];
4263                     }
4264                     D[k] = castOp(sum);
4265                 }
4266                 S0 -= cn;
4267             }
4268         }
4269     }
4270 }
4271
4272
4273 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
4274                             int borderType, const Scalar& _borderValue );
4275
4276 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
4277                           const Mat& _fxy, const void* _wtab,
4278                           int borderType, const Scalar& _borderValue);
4279
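/*
   RemapInvoker processes the destination in tiles of at most 2^14 pixels.
   For each tile it converts whatever map representation was supplied
   (packed CV_16SC2 (+ CV_16UC1), a planar CV_32FC1 pair, or interleaved
   CV_32FC2) into the internal fixed-point form - integer coordinates in a
   CV_16SC2 buffer plus fractional-part indices in a CV_16UC1 buffer - and
   then dispatches to the nearest-neighbor or interpolating kernel.
*/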
4280 class RemapInvoker :
4281     public ParallelLoopBody
4282 {
4283 public:
4284     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
4285                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
4286                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
4287         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
4288         borderType(_borderType), borderValue(_borderValue),
4289         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
4290     {
4291     }
4292
4293     virtual void operator() (const Range& range) const
4294     {
4295         int x, y, x1, y1;
4296         const int buf_size = 1 << 14;
4297         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
4298         int bcols0 = std::min(buf_size/brows0, dst->cols);
4299         brows0 = std::min(buf_size/bcols0, dst->rows);
4300     #if CV_SSE2
4301         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
4302     #endif
4303
4304         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
4305         if( !nnfunc )
4306             _bufa.create(brows0, bcols0, CV_16UC1);
4307
4308         for( y = range.start; y < range.end; y += brows0 )
4309         {
4310             for( x = 0; x < dst->cols; x += bcols0 )
4311             {
4312                 int brows = std::min(brows0, range.end - y);
4313                 int bcols = std::min(bcols0, dst->cols - x);
4314                 Mat dpart(*dst, Rect(x, y, bcols, brows));
4315                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
4316
4317                 if( nnfunc )
4318                 {
4319                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
4320                         bufxy = (*m1)(Rect(x, y, bcols, brows));
4321                     else if( map_depth != CV_32F )
4322                     {
4323                         for( y1 = 0; y1 < brows; y1++ )
4324                         {
4325                             short* XY = bufxy.ptr<short>(y1);
4326                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
4327                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4328
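                            // fixed-point maps: add the precomputed
                            // nearest-neighbor rounding delta for this
                            // sub-pixel offset before truncating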
4329                             for( x1 = 0; x1 < bcols; x1++ )
4330                             {
4331                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
4332                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
4333                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
4334                             }
4335                         }
4336                     }
4337                     else if( !planar_input )
4338                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
4339                     else
4340                     {
4341                         for( y1 = 0; y1 < brows; y1++ )
4342                         {
4343                             short* XY = bufxy.ptr<short>(y1);
4344                             const float* sX = m1->ptr<float>(y+y1) + x;
4345                             const float* sY = m2->ptr<float>(y+y1) + x;
4346                             x1 = 0;
4347
4348                         #if CV_SSE2
4349                             if( useSIMD )
4350                             {
4351                                 for( ; x1 <= bcols - 8; x1 += 8 )
4352                                 {
4353                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
4354                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4355                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
4356                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4357                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
4358                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
4359                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
4360                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
4361                                     ix0 = _mm_packs_epi32(ix0, ix1);
4362                                     iy0 = _mm_packs_epi32(iy0, iy1);
4363                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
4364                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
4365                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4366                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4367                                 }
4368                             }
4369                         #endif
4370
4371                             for( ; x1 < bcols; x1++ )
4372                             {
4373                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
4374                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
4375                             }
4376                         }
4377                     }
4378                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
4379                     continue;
4380                 }
4381
4382                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
4383                 for( y1 = 0; y1 < brows; y1++ )
4384                 {
4385                     short* XY = bufxy.ptr<short>(y1);
4386                     ushort* A = bufa.ptr<ushort>(y1);
4387
4388                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
4389                     {
4390                         bufxy = (*m1)(Rect(x, y, bcols, brows));
4391
4392                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4393                         x1 = 0;
4394
4395                     #if CV_NEON
4396                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
4397                         for ( ; x1 <= bcols - 8; x1 += 8)
4398                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
4399                     #elif CV_SSE2
4400                         __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
4401                         for ( ; x1 <= bcols - 8; x1 += 8)
4402                             _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
4403                     #endif
4404
4405                         for( ; x1 < bcols; x1++ )
4406                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
4407                     }
4408                     else if( planar_input )
4409                     {
4410                         const float* sX = m1->ptr<float>(y+y1) + x;
4411                         const float* sY = m2->ptr<float>(y+y1) + x;
4412
4413                         x1 = 0;
4414                     #if CV_SSE2
4415                         if( useSIMD )
4416                         {
4417                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
4418                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
4419                             for( ; x1 <= bcols - 8; x1 += 8 )
4420                             {
4421                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
4422                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4423                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
4424                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4425                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
4426                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
4427                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
4428                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
4429                                 __m128i mx0 = _mm_and_si128(ix0, mask);
4430                                 __m128i mx1 = _mm_and_si128(ix1, mask);
4431                                 __m128i my0 = _mm_and_si128(iy0, mask);
4432                                 __m128i my1 = _mm_and_si128(iy1, mask);
4433                                 mx0 = _mm_packs_epi32(mx0, mx1);
4434                                 my0 = _mm_packs_epi32(my0, my1);
4435                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
4436                                 mx0 = _mm_or_si128(mx0, my0);
4437                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
4438                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
4439                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
4440                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
4441                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
4442                                 ix0 = _mm_packs_epi32(ix0, ix1);
4443                                 iy0 = _mm_packs_epi32(iy0, iy1);
4444                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4445                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4446                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4447                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4448                             }
4449                         }
4450                     #elif CV_NEON
4451                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
4452                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4453
4454                         for( ; x1 <= bcols - 4; x1 += 4 )
4455                         {
4456                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
4457                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
4458                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4459                                                       vandq_s32(v_sy, v_scale2));
4460                             vst1_u16(A + x1, vqmovun_s32(v_v));
4461
4462                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4463                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4464                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4465                         }
4466                     #endif
4467
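                        // scalar tail: the integer part of the coordinate goes
                        // into XY, the fractional-part index
                        // (fy*INTER_TAB_SIZE + fx) into A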
4468                         for( ; x1 < bcols; x1++ )
4469                         {
4470                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
4471                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
4472                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4473                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4474                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4475                             A[x1] = (ushort)v;
4476                         }
4477                     }
4478                     else
4479                     {
4480                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
4481                         x1 = 0;
4482
4483                     #if CV_NEON
4484                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
4485                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4486
4487                         for( ; x1 <= bcols - 4; x1 += 4 )
4488                         {
4489                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
4490                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
4491                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
4492                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4493                                                       vandq_s32(v_sy, v_scale2));
4494                             vst1_u16(A + x1, vqmovun_s32(v_v));
4495
4496                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4497                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4498                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4499                         }
4500                     #endif
4501
                        for( ; x1 < bcols; x1++ )   // resume after the SIMD loop instead of redoing the row
4503                         {
4504                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
4505                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
4506                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4507                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4508                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4509                             A[x1] = (ushort)v;
4510                         }
4511                     }
4512                 }
4513                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
4514             }
4515         }
4516     }
4517
4518 private:
4519     const Mat* src;
4520     Mat* dst;
4521     const Mat *m1, *m2;
4522     int borderType;
4523     Scalar borderValue;
4524     int planar_input;
4525     RemapNNFunc nnfunc;
4526     RemapFunc ifunc;
4527     const void *ctab;
4528 };
4529
4530 #ifdef HAVE_OPENCL
4531
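// OpenCL path: picks a kernel variant according to the map layout (see the
// kernelName suffixes below) and bakes the interpolation and border modes
// into the build options. Returns false to fall back to the CPU code.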
4532 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
4533                       int interpolation, int borderType, const Scalar& borderValue)
4534 {
4535     const ocl::Device & dev = ocl::Device::getDefault();
4536     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
4537             rowsPerWI = dev.isIntel() ? 4 : 1;
4538
4539     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
4540             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
4541         return false;
4542
4543     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
4544
4545     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
4546         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
4547     {
4548         if (map1.type() != CV_16SC2)
4549             std::swap(map1, map2);
4550     }
4551     else
4552         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4553
4554     _dst.create(map1.size(), type);
4555     UMat dst = _dst.getUMat();
4556
4557     String kernelName = "remap";
4558     if (map1.type() == CV_32FC2 && map2.empty())
4559         kernelName += "_32FC2";
4560     else if (map1.type() == CV_16SC2)
4561     {
4562         kernelName += "_16SC2";
4563         if (!map2.empty())
4564             kernelName += "_16UC1";
4565     }
4566     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
4567         kernelName += "_2_32FC1";
4568     else
4569         CV_Error(Error::StsBadArg, "Unsupported map types");
4570
4571     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4572     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4573                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4574     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4575                                  interMap[interpolation], borderMap[borderType],
4576                                  ocl::typeToStr(type), rowsPerWI);
4577
4578     if (interpolation != INTER_NEAREST)
4579     {
4580         char cvt[3][40];
4581         int wdepth = std::max(CV_32F, depth);
4582         buildOptions = buildOptions
4583                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4584                                " -D convertToWT2=%s -D WT2=%s",
4585                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4586                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4587                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4588                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4589                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
4590     }
4591     int scalarcn = cn == 3 ? 4 : cn;
4592     int sctype = CV_MAKETYPE(depth, scalarcn);
4593     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4594                            ocl::typeToStr(type), ocl::typeToStr(depth),
4595                            cn, ocl::typeToStr(sctype), depth);
4596
4597     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
4598
4599     Mat scalar(1, 1, sctype, borderValue);
4600     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4601             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4602             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
4603
4604     if (map2.empty())
4605         k.args(srcarg, dstarg, map1arg, scalararg);
4606     else
4607         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
4608
4609     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
4610     return k.run(2, globalThreads, NULL, false);
4611 }
4612
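// OpenCL linearPolar: builds the (mapx, mapy) lookup tables on the device,
// then reuses remap() for the actual resampling.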
4613 static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
4614     Point2f center, double maxRadius, int flags)
4615 {
    UMat src_with_border; // must stay at function scope: 'src' below may be re-pointed at this image's data
4617
4618     UMat mapx, mapy, r, cp_sp;
4619     UMat src = _src.getUMat();
4620     _dst.create(src.size(), src.type());
4621     Size dsize = src.size();
4622     r.create(Size(1, dsize.width), CV_32F);
4623     cp_sp.create(Size(1, dsize.height), CV_32FC2);
4624
4625     mapx.create(dsize, CV_32F);
4626     mapy.create(dsize, CV_32F);
4627     size_t w = dsize.width;
4628     size_t h = dsize.height;
4629     String buildOptions;
4630     unsigned mem_size = 32;
4631     if (flags & CV_WARP_INVERSE_MAP)
4632     {
4633         buildOptions = "-D InverseMap";
4634     }
4635     else
4636     {
        buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
4638     }
4639     String retval;
4640     ocl::Program p(ocl::imgproc::linearPolar_oclsrc, buildOptions, retval);
4641     ocl::Kernel k("linearPolar", p);
4642     ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
4643     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
4644     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
4645
4646     if (!(flags & CV_WARP_INVERSE_MAP))
4647     {
4651         ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
4652         float PI2_height = (float) CV_2PI / dsize.height;
4653         float maxRadius_width = (float) maxRadius / dsize.width;
4654         computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, maxRadius_width, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
4655         size_t max_dim = max(h, w);
4656         computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
4657         k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
4658     }
4659     else
4660     {
4661         const int ANGLE_BORDER = 1;
4662
4663         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
4664         src = src_with_border;
4665         Size ssize = src_with_border.size();
4666         ssize.height -= 2 * ANGLE_BORDER;
4667         float ascale =  ssize.height / ((float)CV_2PI);
4668         float pscale =  ssize.width / ((float) maxRadius);
4669
4670         k.args(ocl_mapx, ocl_mapy, ascale, pscale, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
4673     }
    size_t globalThreads[2] = { (size_t)dsize.width, (size_t)dsize.height };
    size_t localThreads[2] = { mem_size, mem_size };
4676     k.run(2, globalThreads, localThreads, false);
4677     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
4678     return true;
4679 }
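
// OpenCL logPolar: same structure as ocl_linearPolar, with the radial
// coordinate scaled logarithmically by M.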
4680 static bool ocl_logPolar(InputArray _src, OutputArray _dst,
4681     Point2f center, double M, int flags)
4682 {
4683     if (M <= 0)
4684         CV_Error(CV_StsOutOfRange, "M should be >0");
    UMat src_with_border; // must stay at function scope: 'src' below may be re-pointed at this image's data
4686
4687     UMat mapx, mapy, r, cp_sp;
4688     UMat src = _src.getUMat();
4689     _dst.create(src.size(), src.type());
4690     Size dsize = src.size();
4691     r.create(Size(1, dsize.width), CV_32F);
4692     cp_sp.create(Size(1, dsize.height), CV_32FC2);
4693
4694     mapx.create(dsize, CV_32F);
4695     mapy.create(dsize, CV_32F);
4696     size_t w = dsize.width;
4697     size_t h = dsize.height;
4698     String buildOptions;
4699     unsigned mem_size = 32;
4700     if (flags & CV_WARP_INVERSE_MAP)
4701     {
4702         buildOptions = "-D InverseMap";
4703     }
4704     else
4705     {
        buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
4707     }
4708     String retval;
4709     ocl::Program p(ocl::imgproc::logPolar_oclsrc, buildOptions, retval);
4712     ocl::Kernel k("logPolar", p);
4713     ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
4714     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
4715     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
4716
4717     if (!(flags & CV_WARP_INVERSE_MAP))
4718     {
4722         ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
4723         float PI2_height = (float) CV_2PI / dsize.height;
4724
4725         computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, (float)M, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
4726         size_t max_dim = max(h, w);
4727         computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
4728         k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
4729     }
4730     else
4731     {
4732         const int ANGLE_BORDER = 1;
4733
4734         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
4735         src = src_with_border;
4736         Size ssize = src_with_border.size();
4737         ssize.height -= 2 * ANGLE_BORDER;
4738         float ascale =  ssize.height / ((float)CV_2PI);

        k.args(ocl_mapx, ocl_mapy, ascale, (float)M, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
4744     }
    size_t globalThreads[2] = { (size_t)dsize.width, (size_t)dsize.height };
    size_t localThreads[2] = { mem_size, mem_size };
4747     k.run(2, globalThreads, localThreads, false);
4748     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
4749     return true;
4750 }
4751 #endif
4752
4753 #if defined HAVE_IPP && IPP_DISABLE_BLOCK
4754
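// IPP path (currently guarded out by IPP_DISABLE_BLOCK): runs the
// ippiRemap_* functions in parallel over row ranges of the destination.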
4755 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4756                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4757                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
4758
4759 class IPPRemapInvoker :
4760         public ParallelLoopBody
4761 {
4762 public:
4763     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4764                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4765         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4766         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
4767     {
4768         *ok = true;
4769     }
4770
4771     virtual void operator() (const Range & range) const
4772     {
4773         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4774         Mat dstRoi = dst.rowRange(range);
4775         IppiSize dstRoiSize = ippiSize(dstRoi.size());
4776         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4777
4778         if (borderType == BORDER_CONSTANT &&
4779                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
4780         {
4781             *ok = false;
4782             return;
4783         }
4784
4785         if (CV_INSTRUMENT_FUN_PTR_CALL_IPP(ippFunc,(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4786                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4787                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation)) < 0)
4788             *ok = false;
4789         else
4790         {
4791             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4792         }
4793     }
4794
4795 private:
4796     Mat & src, & dst, & map1, & map2;
4797     ippiRemap ippFunc;
4798     int ippInterpolation, borderType;
4799     Scalar borderValue;
4800     bool * ok;
4801 };
4802
4803 #endif
4804
4805 }
4806
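/*
    remap() computes dst(x, y) = src(mapX(x, y), mapY(x, y)).

    A minimal usage sketch (hypothetical image "input.png"; this mirrors the
    documented API and is not part of this file):

        Mat img = imread("input.png"), dst;
        Mat mapx(img.size(), CV_32FC1), mapy(img.size(), CV_32FC1);
        for( int y = 0; y < img.rows; y++ )
            for( int x = 0; x < img.cols; x++ )
            {
                mapx.at<float>(y, x) = (float)(img.cols - 1 - x); // horizontal flip
                mapy.at<float>(y, x) = (float)y;
            }
        remap(img, dst, mapx, mapy, INTER_LINEAR, BORDER_CONSTANT, Scalar());
*/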
4807 void cv::remap( InputArray _src, OutputArray _dst,
4808                 InputArray _map1, InputArray _map2,
4809                 int interpolation, int borderType, const Scalar& borderValue )
4810 {
4811     CV_INSTRUMENT_REGION()
4812
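    // per-depth dispatch tables, indexed by CV_MAT_DEPTH (CV_8U..CV_64F);
    // a 0 entry marks an unsupported source depth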
4813     static RemapNNFunc nn_tab[] =
4814     {
4815         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4816         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4817     };
4818
4819     static RemapFunc linear_tab[] =
4820     {
4821         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4822         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4823         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4824         remapBilinear<Cast<float, float>, RemapNoVec, float>,
4825         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4826     };
4827
4828     static RemapFunc cubic_tab[] =
4829     {
4830         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4831         remapBicubic<Cast<float, ushort>, float, 1>,
4832         remapBicubic<Cast<float, short>, float, 1>, 0,
4833         remapBicubic<Cast<float, float>, float, 1>,
4834         remapBicubic<Cast<double, double>, float, 1>, 0
4835     };
4836
4837     static RemapFunc lanczos4_tab[] =
4838     {
4839         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4840         remapLanczos4<Cast<float, ushort>, float, 1>,
4841         remapLanczos4<Cast<float, short>, float, 1>, 0,
4842         remapLanczos4<Cast<float, float>, float, 1>,
4843         remapLanczos4<Cast<double, double>, float, 1>, 0
4844     };
4845
4846     CV_Assert( _map1.size().area() > 0 );
4847     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
4848
4849     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4850                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4851
4852     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
4853     _dst.create( map1.size(), src.type() );
4854     Mat dst = _dst.getMat();
4855     CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );
4856
4857     if( dst.data == src.data )
4858         src = src.clone();
4859
4860     if( interpolation == INTER_AREA )
4861         interpolation = INTER_LINEAR;
4862
4863     int type = src.type(), depth = CV_MAT_DEPTH(type);
4864
4865 #if defined HAVE_IPP && IPP_DISABLE_BLOCK
4866     CV_IPP_CHECK()
4867     {
4868         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4869                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4870                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4871         {
4872             int ippInterpolation =
4873                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4874                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
4875
4876             ippiRemap ippFunc =
4877                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4878                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4879                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4880                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4881                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4882                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4883                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4884                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4885                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4886
4887             if (ippFunc)
4888             {
4889                 bool ok;
4890                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4891                                         borderType, borderValue, &ok);
4892                 Range range(0, dst.rows);
4893                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4894
4895                 if (ok)
4896                 {
4897                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4898                     return;
4899                 }
4900                 setIppErrorStatus();
4901             }
4902         }
4903     }
4904 #endif
4905
4906     RemapNNFunc nnfunc = 0;
4907     RemapFunc ifunc = 0;
4908     const void* ctab = 0;
4909     bool fixpt = depth == CV_8U;
4910     bool planar_input = false;
4911
4912     if( interpolation == INTER_NEAREST )
4913     {
4914         nnfunc = nn_tab[depth];
4915         CV_Assert( nnfunc != 0 );
4916     }
4917     else
4918     {
4919         if( interpolation == INTER_LINEAR )
4920             ifunc = linear_tab[depth];
4921         else if( interpolation == INTER_CUBIC )
4922             ifunc = cubic_tab[depth];
4923         else if( interpolation == INTER_LANCZOS4 )
4924             ifunc = lanczos4_tab[depth];
4925         else
4926             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4927         CV_Assert( ifunc != 0 );
4928         ctab = initInterTab2D( interpolation, fixpt );
4929     }
4930
4931     const Mat *m1 = &map1, *m2 = &map2;
4932
4933     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4934         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4935     {
4936         if( map1.type() != CV_16SC2 )
4937             std::swap(m1, m2);
4938     }
4939     else
4940     {
4941         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4942             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4943         planar_input = map1.channels() == 1;
4944     }
4945
4946     RemapInvoker invoker(src, dst, m1, m2,
4947                          borderType, borderValue, planar_input, nnfunc, ifunc,
4948                          ctab);
4949     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
4950 }
4951
4952
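/*
    convertMaps() converts between the map representations accepted by
    remap(). Converting floating-point maps to the packed fixed-point form
    once up front makes repeated remap() calls cheaper. A sketch, assuming
    "mapx"/"mapy" are CV_32FC1 maps like the ones built in the remap()
    example above:

        Mat fixedXY, fixedA;
        convertMaps(mapx, mapy, fixedXY, fixedA, CV_16SC2, false);
        remap(img, dst, fixedXY, fixedA, INTER_LINEAR);
*/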
4953 void cv::convertMaps( InputArray _map1, InputArray _map2,
4954                       OutputArray _dstmap1, OutputArray _dstmap2,
4955                       int dstm1type, bool nninterpolate )
4956 {
4957     CV_INSTRUMENT_REGION()
4958
4959     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4960     Size size = map1.size();
4961     const Mat *m1 = &map1, *m2 = &map2;
4962     int m1type = m1->type(), m2type = m2->type();
4963
4964     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4965                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4966                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4967                (m1type == CV_32FC2 && m2->empty()) );
4968
4969     if( m2type == CV_16SC2 )
4970     {
4971         std::swap( m1, m2 );
4972         std::swap( m1type, m2type );
4973     }
4974
4975     if( dstm1type <= 0 )
4976         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4977     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4978     _dstmap1.create( size, dstm1type );
4979     dstmap1 = _dstmap1.getMat();
4980
4981     if( !nninterpolate && dstm1type != CV_32FC2 )
4982     {
4983         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4984         dstmap2 = _dstmap2.getMat();
4985     }
4986     else
4987         _dstmap2.release();
4988
4989     if( m1type == dstm1type || (nninterpolate &&
4990         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4991         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4992     {
4993         m1->convertTo( dstmap1, dstmap1.type() );
4994         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4995             m2->copyTo( dstmap2 );
4996         return;
4997     }
4998
4999     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
5000     {
5001         Mat vdata[] = { *m1, *m2 };
5002         merge( vdata, 2, dstmap1 );
5003         return;
5004     }
5005
5006     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
5007     {
5008         Mat mv[] = { dstmap1, dstmap2 };
5009         split( *m1, mv );
5010         return;
5011     }
5012
5013     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
5014         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
5015     {
5016         size.width *= size.height;
5017         size.height = 1;
5018     }
5019
5020 #if CV_SSE2
5021     bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
5022 #endif
5023 #if CV_SSE4_1
5024     bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5025 #endif
5026
5027     const float scale = 1.f/INTER_TAB_SIZE;
5028     int x, y;
5029     for( y = 0; y < size.height; y++ )
5030     {
5031         const float* src1f = m1->ptr<float>(y);
5032         const float* src2f = m2->ptr<float>(y);
5033         const short* src1 = (const short*)src1f;
5034         const ushort* src2 = (const ushort*)src2f;
5035
5036         float* dst1f = dstmap1.ptr<float>(y);
5037         float* dst2f = dstmap2.ptr<float>(y);
5038         short* dst1 = (short*)dst1f;
5039         ushort* dst2 = (ushort*)dst2f;
5040         x = 0;
5041
5042         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
5043         {
5044             if( nninterpolate )
5045             {
5046                 #if CV_NEON
5047                 for( ; x <= size.width - 8; x += 8 )
5048                 {
5049                     int16x8x2_t v_dst;
5050                     v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
5051                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
5052                     v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
5053                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
5054
5055                     vst2q_s16(dst1 + (x << 1), v_dst);
5056                 }
5057                 #elif CV_SSE4_1
5058                 if (useSSE4_1)
5059                 {
5060                     for( ; x <= size.width - 16; x += 16 )
5061                     {
5062                         __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
5063                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
5064                         __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
5065                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
5066
5067                         __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
5068                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
5069                         __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
5070                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
5071
5072                         _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
5073
5074                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
5075                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
5076                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
5077                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
5078                     }
5079                 }
5080                 #endif
5081                 for( ; x < size.width; x++ )
5082                 {
5083                     dst1[x*2] = saturate_cast<short>(src1f[x]);
5084                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
5085                 }
5086             }
5087             else
5088             {
5089                 #if CV_NEON
5090                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
5091                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5092
5093                 for( ; x <= size.width - 8; x += 8 )
5094                 {
5095                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
5096                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
5097                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
5098                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
5099
5100                     int16x8x2_t v_dst;
5101                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
5102                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
5103                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
5104                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
5105
5106                     vst2q_s16(dst1 + (x << 1), v_dst);
5107
5108                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
5109                                                               vandq_s32(v_ix0, v_mask)));
5110                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
5111                                                               vandq_s32(v_ix1, v_mask)));
5112                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
5113                 }
5114                 #elif CV_SSE4_1
5115                 if (useSSE4_1)
5116                 {
5117                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
5118                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
5119
5120                     for( ; x <= size.width - 16; x += 16 )
5121                     {
5122                         __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
5123                         __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
5124                         __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
5125                         __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
5126
5127                         __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
5128                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
5129                         __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
5130                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
5131                         __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
5132                                                         _mm_and_si128(v_ix0, v_its1));
5133                         __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
5134                                                         _mm_and_si128(v_ix1, v_its1));
5135                         _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
5136
5137                         v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
5138                         v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
5139                         v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
5140                         v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
5141
5142                         __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
5143                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
5144                         __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
5145                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
5146                         v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
5147                                                 _mm_and_si128(v_ix0, v_its1));
5148                         v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
5149                                                 _mm_and_si128(v_ix1, v_its1));
5150                         _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
5151
5152                         _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
5153
5154                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
5155                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
5156                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
5157                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
5158                     }
5159                 }
5160                 #endif
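                     // scalar tail, mirroring the SIMD paths above: dst1 (CV_16SC2) receives
                     // the integer parts of the coordinates, dst2 (CV_16UC1) the packed
                     // interpolation-table index (iy & mask)*INTER_TAB_SIZE + (ix & mask)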
5161                 for( ; x < size.width; x++ )
5162                 {
5163                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
5164                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
5165                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
5166                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
5167                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
5168                 }
5169             }
5170         }
5171         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
5172         {
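                 // interleaved float maps (x,y pairs in one CV_32FC2 matrix) -> fixed-point
                 // CV_16SC2 coordinates plus CV_16UC1 table indices, unless plain
                 // nearest-neighbor rounding (nninterpolate) was requested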
5173             if( nninterpolate )
5174             {
5175                 #if CV_NEON
5176                 for( ; x <= (size.width << 1) - 8; x += 8 )
5177                     vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
5178                                                      vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
5179                 #elif CV_SSE2
5180                 for( ; x <= (size.width << 1) - 8; x += 8 )
5181                 {
5182                     _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
5183                                                                             _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
5184                 }
5185                 #endif
5186                 // note: the SIMD loops above advance x over the 2*width interleaved
5187                 // elements, so the scalar tail must use the same flat indexing
5188                 for( ; x < size.width * 2; x++ )
5189                     dst1[x] = saturate_cast<short>(src1f[x]);
5191             }
5192             else
5193             {
5194                 #if CV_NEON
5195                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
5196                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5197
5198                 for( ; x <= size.width - 8; x += 8 )
5199                 {
5200                     float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
5201                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
5202                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
5203                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
5204                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
5205
5206                     int16x8x2_t v_dst;
5207                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
5208                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
5209                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
5210                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
5211
5212                     vst2q_s16(dst1 + (x << 1), v_dst);
5213
5214                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
5215                                                               vandq_s32(v_ix0, v_mask)));
5216                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
5217                                                               vandq_s32(v_ix1, v_mask)));
5218                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
5219                 }
5220                 #elif CV_SSE4_1
5221                 if (useSSE4_1)
5222                 {
5223                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
5224                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
5225                     __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
5226
5227                     for( ; x <= size.width - 4; x += 4 )
5228                     {
5229                         __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
5230                         __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
5231
5232                         __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
5233                                                          _mm_srai_epi32(v_src1, INTER_BITS));
5234                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
5235
5236                         // x0 y0 x1 y1 . . .
5237                         v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
5238                                                  _mm_and_si128(v_src1, v_its1));
5239                         __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
5240                                                       _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
5241                         _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
5242                     }
5243                 }
5244                 #endif
5245                 for( ; x < size.width; x++ )
5246                 {
5247                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
5248                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
5249                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
5250                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
5251                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
5252                 }
5253             }
5254         }
5255         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
5256         {
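                 // reverse conversion: fixed-point maps back to two separate float planes;
                 // the stored fractional index is unpacked and folded back in via 'scale'
                 // (assumed to be 1.f/INTER_TAB_SIZE, as set earlier in convertMaps)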
5257             #if CV_NEON
5258             uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
5259             uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
5260             float32x4_t v_scale = vdupq_n_f32(scale);
5261
5262             for( ; x <= size.width - 8; x += 8)
5263             {
5264                 uint32x4_t v_fxy1, v_fxy2;
5265                 if (src2)
5266                 {
5267                     uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
5268                     v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
5269                     v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
5270                 }
5271                 else
5272                     v_fxy1 = v_fxy2 = v_zero;
5273
5274                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
5275                 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
5276                                                v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
5277                 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
5278                                                v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
5279                 vst1q_f32(dst1f + x, v_dst1);
5280                 vst1q_f32(dst2f + x, v_dst2);
5281
5282                 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
5283                                    v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
5284                 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
5285                                    v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
5286                 vst1q_f32(dst1f + x + 4, v_dst1);
5287                 vst1q_f32(dst2f + x + 4, v_dst2);
5288             }
5289             #elif CV_SSE2
5290             __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
5291             __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
5292             __m128 v_scale = _mm_set1_ps(scale);
5293
5294             for( ; x <= size.width - 16; x += 16)
5295             {
5296                 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
5297                 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
5298                 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
5299                 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
5300
5301                 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
5302
5303                 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
5304                 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
5305                 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
5306                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5307                 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
5308                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5309                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
5310                 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
5311                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5312                 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
5313                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5314
5315                 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
5316                 v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
5317                 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
5318                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5319                 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
5320                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5321                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
5322                 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
5323                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5324                 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
5325                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5326             }
5327             #endif
5328             for( ; x < size.width; x++ )
5329             {
5330                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
5331                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
5332                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
5333             }
5334         }
5335         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
5336         {
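                 // same reverse conversion, but writing one interleaved CV_32FC2 map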
5337             #if CV_NEON
5338             int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
5339             int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
5340             float32x4_t v_scale = vdupq_n_f32(scale);
5341
5342             for( ; x <= size.width - 8; x += 8)
5343             {
5344                 int32x4_t v_fxy1, v_fxy2;
5345                 if (src2)
5346                 {
5347                     int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
5348                     v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
5349                     v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
5350                 }
5351                 else
5352                     v_fxy1 = v_fxy2 = v_zero;
5353
5354                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
5355                 float32x4x2_t v_dst;
5356                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
5357                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
5358                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
5359                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
5360                 vst2q_f32(dst1f + (x << 1), v_dst);
5361
5362                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
5363                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
5364                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
5365                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
5366                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
5367             }
5368             #elif CV_SSE2
5369             if (useSSE2)
5370             {
5371                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
5372                 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi16(INTER_TAB_SIZE-1);
5373                 __m128 v_scale = _mm_set1_ps(scale);
5374
5375                 for ( ; x <= size.width - 4; x += 4)
5376                 {
5377                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); // x0 y0 x1 y1 . . .
5378                     __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2) : v_zero;
5379                     __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);         // fx0 fx1 fx2 fx3
5380                     __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);    // fy0 fy1 fy2 fy3
5381                     __m128i v_fxy_i = _mm_unpacklo_epi16(v_fxy1, v_fxy2);  // fx0 fy0 fx1 fy1 . . .
5382                     // widen coordinates (sign-extended) and fractions to 32 bits, combine as floats
5383                     __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy_i, v_zero)), v_scale);
5384                     _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add));
5385                     v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy_i, v_zero)), v_scale);
5386                     _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add));
5387                 }
5388             }
5389             #endif
5390             for( ; x < size.width; x++ )
5391             {
5392                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
5393                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
5394                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
5395             }
5396         }
5397         else
5398             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
5399     }
5400 }
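/* Illustrative usage sketch for the conversion above (not part of the library):
   assuming float maps produced by e.g. initUndistortRectifyMap,

       Mat mapx, mapy;                                      // CV_32FC1 each
       Mat fixed1, fixed2;
       convertMaps(mapx, mapy, fixed1, fixed2, CV_16SC2);   // fixed-point representation
       remap(src, dst, fixed1, fixed2, INTER_LINEAR);       // faster than float maps

   the CV_16SC2/CV_16UC1 pair trades a little precision for remap speed. */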
5401
5402
5403 namespace cv
5404 {
5405
5406 class WarpAffineInvoker :
5407     public ParallelLoopBody
5408 {
5409 public:
5410     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
5411                       const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
5412         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
5413         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
5414         M(_M)
5415     {
5416     }
5417
5418     virtual void operator() (const Range& range) const
5419     {
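             // strategy: process dst in blocks; for each block build a fixed-point
             // CV_16SC2 coordinate map (plus a table-index map for interpolation) and
             // let remap() do the actual sampling and border handling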
5420         const int BLOCK_SZ = 64;
5421         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5422         const int AB_BITS = MAX(10, (int)INTER_BITS);
5423         const int AB_SCALE = 1 << AB_BITS;
5424         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2;
             int x, y, x1, y1;
5425     #if CV_SSE2
5426         bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
5427     #endif
5428     #if CV_SSE4_1
5429         bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5430     #endif
5431
5432         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
5433         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
5434         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
5435
5436         for( y = range.start; y < range.end; y += bh0 )
5437         {
5438             for( x = 0; x < dst.cols; x += bw0 )
5439             {
5440                 int bw = std::min( bw0, dst.cols - x);
5441                 int bh = std::min( bh0, range.end - y);
5442
5443                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
5444                 Mat dpart(dst, Rect(x, y, bw, bh));
5445
5446                 for( y1 = 0; y1 < bh; y1++ )
5447                 {
5448                     short* xy = XY + y1*bw*2;
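                         // X0/Y0 hold the row-dependent terms M[1]*y + M[2] (resp. M[4]*y + M[5])
                         // in AB_BITS fixed point; the column terms M[0]*x and M[3]*x come from
                         // the precomputed adelta/bdelta tables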
5449                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
5450                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
5451
5452                     if( interpolation == INTER_NEAREST )
5453                     {
5454                         x1 = 0;
5455                         #if CV_NEON
5456                         int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
5457                         for( ; x1 <= bw - 8; x1 += 8 )
5458                         {
5459                             int16x8x2_t v_dst;
5460                             v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
5461                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
5462                             v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
5463                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
5464
5465                             vst2q_s16(xy + (x1 << 1), v_dst);
5466                         }
5467                         #elif CV_SSE4_1
5468                         if (useSSE4_1)
5469                         {
5470                             __m128i v_X0 = _mm_set1_epi32(X0);
5471                             __m128i v_Y0 = _mm_set1_epi32(Y0);
5472                             for ( ; x1 <= bw - 16; x1 += 16)
5473                             {
5474                                 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
5475                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
5476                                 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
5477                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
5478
5479                                 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
5480                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
5481                                 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
5482                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
5483
5484                                 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
5485
5486                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
5487                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
5488                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
5489                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
5490                             }
5491                         }
5492                         #endif
5493                         for( ; x1 < bw; x1++ )
5494                         {
5495                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
5496                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
5497                             xy[x1*2] = saturate_cast<short>(X);
5498                             xy[x1*2+1] = saturate_cast<short>(Y);
5499                         }
5500                     }
5501                     else
5502                     {
5503                         short* alpha = A + y1*bw;
5504                         x1 = 0;
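                             // linear/cubic path: besides the integer coordinates, pack the
                             // fractional bits of X and Y into alpha as fy*INTER_TAB_SIZE + fx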
5505                     #if CV_SSE2
5506                         if( useSSE2 )
5507                         {
5508                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
5509                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
5510                             for( ; x1 <= bw - 8; x1 += 8 )
5511                             {
5512                                 __m128i tx0, tx1, ty0, ty1;
5513                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
5514                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
5515                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
5516                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
5517
5518                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
5519                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
5520                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
5521                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
5522
5523                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
5524                                                             _mm_and_si128(tx1, fxy_mask));
5525                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
5526                                                             _mm_and_si128(ty1, fxy_mask));
5527                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
5528                                                             _mm_srai_epi32(tx1, INTER_BITS));
5529                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
5530                                                     _mm_srai_epi32(ty1, INTER_BITS));
5531                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
5532
5533                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
5534                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
5535                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
5536                             }
5537                         }
5538                     #elif CV_NEON
5539                         int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5540                         for( ; x1 <= bw - 8; x1 += 8 )
5541                         {
5542                             int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
5543                             int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
5544                             int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
5545                             int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
5546
5547                             int16x8x2_t v_xy;
5548                             v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
5549                             v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
5550
5551                             vst2q_s16(xy + (x1 << 1), v_xy);
5552
5553                             int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
5554                                                                      vandq_s32(v_X0, v_mask)));
5555                             int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
5556                                                                      vandq_s32(v_X1, v_mask)));
5557                             vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
5558                         }
5559                     #endif
5560                         for( ; x1 < bw; x1++ )
5561                         {
5562                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
5563                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
5564                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
5565                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
5566                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
5567                                     (X & (INTER_TAB_SIZE-1)));
5568                         }
5569                     }
5570                 }
5571
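                     // the block's coordinate map is complete; remap() performs the
                     // actual sampling, interpolation and border handling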
5572                 if( interpolation == INTER_NEAREST )
5573                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
5574                 else
5575                 {
5576                     Mat _matA(bh, bw, CV_16U, A);
5577                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
5578                 }
5579             }
5580         }
5581     }
5582
5583 private:
5584     Mat src;
5585     Mat dst;
5586     int interpolation, borderType;
5587     Scalar borderValue;
5588     int *adelta, *bdelta;
5589     const double *M;
5590 };
5591
5592
5593 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
5594 class IPPWarpAffineInvoker :
5595     public ParallelLoopBody
5596 {
5597 public:
5598     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
5599                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
5600         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
5601         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
5602     {
5603         *ok = true;
5604     }
5605
5606     virtual void operator() (const Range& range) const
5607     {
5608         IppiSize srcsize = { src.cols, src.rows };
5609         IppiRect srcroi = { 0, 0, src.cols, src.rows };
5610         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
5611         int cnn = src.channels();
5612         if( borderType == BORDER_CONSTANT )
5613         {
5614             IppiSize setSize = { dst.cols, range.end - range.start };
5615             void *dataPointer = dst.ptr(range.start);
5616             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
5617             {
5618                 *ok = false;
5619                 return;
5620             }
5621         }
5622
5623         // Aug 2013: problem in IPP 7.1, 8.0: the function sometimes returns ippStsCoeffErr
5624         IppStatus status = CV_INSTRUMENT_FUN_PTR_CALL_IPP(func,( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
5625                                 (int)dst.step[0], dstroi, coeffs, mode ));
5626         if( status < 0)
5627             *ok = false;
5628         else
5629         {
5630             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5631         }
5632     }
5633 private:
5634     Mat &src;
5635     Mat &dst;
5636     int mode;
5637     double (&coeffs)[2][3];
5638     int borderType;
5639     Scalar borderValue;
5640     ippiWarpAffineBackFunc func;
5641     bool *ok;
5642     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
5643 };
5644 #endif
5645
5646 #ifdef HAVE_OPENCL
5647
5648 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
5649
5650 static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
5651                                     Size dsize, int flags, int borderType, const Scalar& borderValue,
5652                                     int op_type)
5653 {
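         // specialized Intel-only kernel: CV_8UC1 only, dst width a multiple of 4,
         // each work-item produces 4 output columns (note globalThreads[0] = cols/4)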
5654     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
5655     const ocl::Device & dev = ocl::Device::getDefault();
5656     int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5657
5658     int interpolation = flags & INTER_MAX;
5659     if( interpolation == INTER_AREA )
5660         interpolation = INTER_LINEAR;
5661
5662     if ( !dev.isIntel() || !(type == CV_8UC1) ||
5663          !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
5664          !(borderType == cv::BORDER_CONSTANT &&
5665           (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)))
5666         return false;
5667
5668     const char * const warp_op[2] = { "Affine", "Perspective" };
5669     const char * const interpolationMap[3] = { "nearest", "linear", "cubic" };
5670     ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
5671     String kernelName = format("warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
5672
5673     bool is32f = (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
5674     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
5675     int sctype = CV_MAKETYPE(wdepth, cn);
5676
5677     ocl::Kernel k;
5678     String opts = format("-D ST=%s", ocl::typeToStr(sctype));
5679
5680     k.create(kernelName.c_str(), program, opts);
5681     if (k.empty())
5682         return false;
5683
5684     float borderBuf[] = { 0, 0, 0, 0 };
5685     scalarToRawData(borderValue, borderBuf, sctype);
5686
5687     UMat src = _src.getUMat(), M0;
5688     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5689     UMat dst = _dst.getUMat();
5690
5691     float M[9];
5692     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
5693     Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
5694     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
5695     M1.convertTo(matM, matM.type());
5696
5697     if( !(flags & WARP_INVERSE_MAP) )
5698     {
5699         if (op_type == OCL_OP_PERSPECTIVE)
5700             invert(matM, matM);
5701         else
5702         {
5703             float D = M[0]*M[4] - M[1]*M[3];
5704             D = D != 0 ? 1.f/D : 0;
5705             float A11 = M[4]*D, A22=M[0]*D;
5706             M[0] = A11; M[1] *= -D;
5707             M[3] *= -D; M[4] = A22;
5708             float b1 = -M[0]*M[2] - M[1]*M[5];
5709             float b2 = -M[3]*M[2] - M[4]*M[5];
5710             M[2] = b1; M[5] = b2;
5711         }
5712     }
5713     matM.convertTo(M0, CV_32F);
5714
5715     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
5716            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
5717
5718     size_t globalThreads[2];
5719     globalThreads[0] = (size_t)(dst.cols / 4);
5720     globalThreads[1] = (size_t)dst.rows;
5721
5722     return k.run(2, globalThreads, NULL, false);
5723 }
5724
5725 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
5726                               Size dsize, int flags, int borderType, const Scalar& borderValue,
5727                               int op_type)
5728 {
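         // generic OpenCL path: builds the kernel options from the interpolation mode
         // and element types, then runs one work-item per output column position
         // (on Intel devices the simple affine kernels handle rowsPerWI rows per item)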
5729     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
5730     const ocl::Device & dev = ocl::Device::getDefault();
5731
5732     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5733     const bool doubleSupport = dev.doubleFPConfig() > 0;
5734
5735     int interpolation = flags & INTER_MAX;
5736     if( interpolation == INTER_AREA )
5737         interpolation = INTER_LINEAR;
5738     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
5739
5740     if ( !(borderType == cv::BORDER_CONSTANT &&
5741            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
5742          (!doubleSupport && depth == CV_64F) || cn > 4)
5743         return false;
5744
5745     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
5746     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
5747                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
5748     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
5749
5750     int scalarcn = cn == 3 ? 4 : cn;
5751     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
5752     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
5753     int sctype = CV_MAKETYPE(wdepth, scalarcn);
5754
5755     ocl::Kernel k;
5756     String opts;
5757     if (interpolation == INTER_NEAREST)
5758     {
5759         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
5760                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
5761                       ocl::typeToStr(CV_MAT_DEPTH(type)),
5762                       ocl::typeToStr(sctype), cn, rowsPerWI);
5763     }
5764     else
5765     {
5766         char cvt[2][50];
5767         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
5768                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
5769                       interpolationMap[interpolation], ocl::typeToStr(type),
5770                       ocl::typeToStr(CV_MAT_DEPTH(type)),
5771                       ocl::typeToStr(sctype),
5772                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
5773                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
5774                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
5775                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
5776     }
5777
5778     k.create(kernelName, program, opts);
5779     if (k.empty())
5780         return false;
5781
5782     double borderBuf[] = { 0, 0, 0, 0 };
5783     scalarToRawData(borderValue, borderBuf, sctype);
5784
5785     UMat src = _src.getUMat(), M0;
5786     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5787     UMat dst = _dst.getUMat();
5788
5789     double M[9];
5790     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
5791     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
5792     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
5793                M1.rows == matRows && M1.cols == 3 );
5794     M1.convertTo(matM, matM.type());
5795
5796     if( !(flags & WARP_INVERSE_MAP) )
5797     {
5798         if (op_type == OCL_OP_PERSPECTIVE)
5799             invert(matM, matM);
5800         else
5801         {
5802             double D = M[0]*M[4] - M[1]*M[3];
5803             D = D != 0 ? 1./D : 0;
5804             double A11 = M[4]*D, A22=M[0]*D;
5805             M[0] = A11; M[1] *= -D;
5806             M[3] *= -D; M[4] = A22;
5807             double b1 = -M[0]*M[2] - M[1]*M[5];
5808             double b2 = -M[3]*M[2] - M[4]*M[5];
5809             M[2] = b1; M[5] = b2;
5810         }
5811     }
5812     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
5813
5814     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
5815            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
5816
5817     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
5818     return k.run(2, globalThreads, NULL, false);
5819 }
5820
5821 #endif
5822
5823 namespace hal {
5824
5825 void warpAffine(int src_type,
5826                 const uchar * src_data, size_t src_step, int src_width, int src_height,
5827                 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
5828                 const double M[6], int interpolation, int borderType, const double borderValue[4])
5829 {
5830     CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
5831
5832     Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
5833     Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
5834
5835     int x;
5836     AutoBuffer<int> _abdelta(dst.cols*2);
5837     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
5838     const int AB_BITS = MAX(10, (int)INTER_BITS);
5839     const int AB_SCALE = 1 << AB_BITS;
5840
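         // precompute the column-dependent terms M[0]*x and M[3]*x in AB_BITS fixed
         // point; WarpAffineInvoker adds the row-dependent part per scanline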
5841     for( x = 0; x < dst.cols; x++ )
5842     {
5843         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
5844         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
5845     }
5846
5847     Range range(0, dst.rows);
5848     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
5849                               Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
5850                               adelta, bdelta, M);
5851     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5852 }
5853
5854 } // hal::
5855 } // cv::
5856
5857
5858 void cv::warpAffine( InputArray _src, OutputArray _dst,
5859                      InputArray _M0, Size dsize,
5860                      int flags, int borderType, const Scalar& borderValue )
5861 {
5862     CV_INSTRUMENT_REGION()
5863
5864     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
5865                _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
5866                ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
5867                                        borderValue, OCL_OP_AFFINE))
5868
5869     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
5870                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
5871                                  borderValue, OCL_OP_AFFINE))
5872
5873     Mat src = _src.getMat(), M0 = _M0.getMat();
5874     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5875     Mat dst = _dst.getMat();
5876     CV_Assert( src.cols > 0 && src.rows > 0 );
5877     if( dst.data == src.data )
5878         src = src.clone();
5879
5880     double M[6];
5881     Mat matM(2, 3, CV_64F, M);
5882     int interpolation = flags & INTER_MAX;
5883     if( interpolation == INTER_AREA )
5884         interpolation = INTER_LINEAR;
5885
5886     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
5887     M0.convertTo(matM, matM.type());
5888
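     // forward transform given: invert the 2x3 affine matrix in place,
     //   2x2 part as adj(A)/det(A) (a zero determinant degenerates to the zero map),
     //   then translation b' = -A^-1 * b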
5889     if( !(flags & WARP_INVERSE_MAP) )
5890     {
5891         double D = M[0]*M[4] - M[1]*M[3];
5892         D = D != 0 ? 1./D : 0;
5893         double A11 = M[4]*D, A22=M[0]*D;
5894         M[0] = A11; M[1] *= -D;
5895         M[3] *= -D; M[4] = A22;
5896         double b1 = -M[0]*M[2] - M[1]*M[5];
5897         double b2 = -M[3]*M[2] - M[4]*M[5];
5898         M[2] = b1; M[5] = b2;
5899     }
5900
5901 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
5902     CV_IPP_CHECK()
5903     {
5904         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5905         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
5906            ( cn == 1 || cn == 3 || cn == 4 ) &&
5907            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
5908            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
5909         {
5910             ippiWarpAffineBackFunc ippFunc = 0;
5911             if ((flags & WARP_INVERSE_MAP) != 0)
5912             {
5913                 ippFunc =
5914                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
5915                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
5916                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
5917                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
5918                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
5919                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
5920                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
5921                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
5922                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
5923                 0;
5924             }
5925             else
5926             {
5927                 ippFunc =
5928                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
5929                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
5930                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
5931                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
5932                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
5933                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
5934                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
5935                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
5936                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
5937                 0;
5938             }
5939             int mode =
5940             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5941             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5942             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
5943             0;
5944             CV_Assert(mode && ippFunc);
5945
5946             double coeffs[2][3];
5947             for( int i = 0; i < 2; i++ )
5948                 for( int j = 0; j < 3; j++ )
5949                     coeffs[i][j] = matM.at<double>(i, j);
5950
5951             bool ok;
5952             Range range(0, dst.rows);
5953             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5954             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5955             if( ok )
5956             {
5957                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5958                 return;
5959             }
5960             setIppErrorStatus();
5961         }
5962     }
5963 #endif
5964
5965     hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
5966                     M, interpolation, borderType, borderValue.val);
5967 }
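/* Illustrative usage sketch (not part of the library):

       Mat src = imread("input.png"), dst;
       Mat R = getRotationMatrix2D(Point2f(src.cols * 0.5f, src.rows * 0.5f), 30.0, 1.0);
       warpAffine(src, dst, R, src.size(), INTER_LINEAR, BORDER_CONSTANT, Scalar());

   pass WARP_INVERSE_MAP in the flags if R already maps dst -> src. */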
5968
5969
5970 namespace cv
5971 {
5972
5973 class WarpPerspectiveInvoker :
5974     public ParallelLoopBody
5975 {
5976 public:
5977     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, const double *_M, int _interpolation,
5978                            int _borderType, const Scalar &_borderValue) :
5979         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
5980         borderType(_borderType), borderValue(_borderValue)
5981     {
5982     }
5983
5984     virtual void operator() (const Range& range) const
5985     {
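             // same blockwise strategy as WarpAffineInvoker, but each pixel needs a
             // perspective divide by W = M[6]*x + M[7]*y + M[8], computed in doubles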
5986         const int BLOCK_SZ = 32;
5987         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5988         int x, y, x1, y1;
             int width = dst.cols, height = dst.rows;
5989
5990         int bh0 = std::min(BLOCK_SZ/2, height);
5991         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
5992         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
5993
5994         #if CV_SSE4_1
5995         bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5996         __m128d v_M0 = _mm_set1_pd(M[0]);
5997         __m128d v_M3 = _mm_set1_pd(M[3]);
5998         __m128d v_M6 = _mm_set1_pd(M[6]);
5999         __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
6000         __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
6001         __m128d v_2 = _mm_set1_pd(2),
6002                 v_zero = _mm_setzero_pd(),
6003                 v_1 = _mm_set1_pd(1),
6004                 v_its = _mm_set1_pd(INTER_TAB_SIZE);
6005         __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
6006         #endif
6007
6008         for( y = range.start; y < range.end; y += bh0 )
6009         {
6010             for( x = 0; x < width; x += bw0 )
6011             {
6012                 int bw = std::min( bw0, width - x);
6013                 int bh = std::min( bh0, range.end - y); // height
6014
6015                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
6016                 Mat dpart(dst, Rect(x, y, bw, bh));
6017
6018                 for( y1 = 0; y1 < bh; y1++ )
6019                 {
6020                     short* xy = XY + y1*bw*2;
6021                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
6022                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
6023                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
6024
6025                     if( interpolation == INTER_NEAREST )
6026                     {
6027                         x1 = 0;
6028
6029                         #if CV_SSE4_1
6030                         if (haveSSE4_1)
6031                         {
6032                             __m128d v_X0d = _mm_set1_pd(X0);
6033                             __m128d v_Y0d = _mm_set1_pd(Y0);
6034                             __m128d v_W0 = _mm_set1_pd(W0);
6035                             __m128d v_x1 = _mm_set_pd(1, 0);
6036
6037                             for( ; x1 <= bw - 16; x1 += 16 )
6038                             {
6039                                 // 0-3
6040                                 __m128i v_X0, v_Y0;
6041                                 {
6042                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6043                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6044                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6045                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6046                                     v_x1 = _mm_add_pd(v_x1, v_2);
6047
6048                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6049                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6050                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6051                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6052                                     v_x1 = _mm_add_pd(v_x1, v_2);
6053
6054                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6055                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6056                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6057                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6058                                 }
6059
6060                                 // 4-7
6061                                 __m128i v_X1, v_Y1;
6062                                 {
6063                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6064                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6065                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6066                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6067                                     v_x1 = _mm_add_pd(v_x1, v_2);
6068
6069                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6070                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6071                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6072                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6073                                     v_x1 = _mm_add_pd(v_x1, v_2);
6074
6075                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6076                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6077                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6078                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6079                                 }
6080
6081                                 // 8-11
6082                                 __m128i v_X2, v_Y2;
6083                                 {
6084                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6085                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6086                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6087                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6088                                     v_x1 = _mm_add_pd(v_x1, v_2);
6089
6090                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6091                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6092                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6093                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6094                                     v_x1 = _mm_add_pd(v_x1, v_2);
6095
6096                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6097                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6098                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6099                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6100                                 }
6101
6102                                 // 12-15
6103                                 __m128i v_X3, v_Y3;
6104                                 {
6105                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6106                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6107                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6108                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6109                                     v_x1 = _mm_add_pd(v_x1, v_2);
6110
6111                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6112                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6113                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6114                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6115                                     v_x1 = _mm_add_pd(v_x1, v_2);
6116
6117                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6118                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6119                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6120                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6121                                 }
6122
6123                                 // convert to 16s
6124                                 v_X0 = _mm_packs_epi32(v_X0, v_X1);
6125                                 v_X1 = _mm_packs_epi32(v_X2, v_X3);
6126                                 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
6127                                 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
6128
6129                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
6130
6131                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
6132                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
6133                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
6134                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
6135                             }
6136                         }
6137                         #endif
6138
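                        // scalar tail: finish the remaining columns with the same
                        // clamped projective math used by the SIMD loop above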
6139                         for( ; x1 < bw; x1++ )
6140                         {
6141                             double W = W0 + M[6]*x1;
6142                             W = W ? 1./W : 0;
6143                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
6144                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
6145                             int X = saturate_cast<int>(fX);
6146                             int Y = saturate_cast<int>(fY);
6147
6148                             xy[x1*2] = saturate_cast<short>(X);
6149                             xy[x1*2+1] = saturate_cast<short>(Y);
6150                         }
6151                     }
6152                     else
6153                     {
6154                         short* alpha = A + y1*bw;
6155                         x1 = 0;
6156
6157                         #if CV_SSE4_1
6158                         if (haveSSE4_1)
6159                         {
6160                             __m128d v_X0d = _mm_set1_pd(X0);
6161                             __m128d v_Y0d = _mm_set1_pd(Y0);
6162                             __m128d v_W0 = _mm_set1_pd(W0);
6163                             __m128d v_x1 = _mm_set_pd(1, 0);
6164
6165                             for( ; x1 <= bw - 16; x1 += 16 )
6166                             {
6167                                 // 0-3
6168                                 __m128i v_X0, v_Y0;
6169                                 {
6170                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6171                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6172                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6173                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6174                                     v_x1 = _mm_add_pd(v_x1, v_2);
6175
6176                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6177                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6178                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6179                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6180                                     v_x1 = _mm_add_pd(v_x1, v_2);
6181
6182                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6183                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6184                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6185                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6186                                 }
6187
6188                                 // 4-7
6189                                 __m128i v_X1, v_Y1;
6190                                 {
6191                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6192                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6193                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6194                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6195                                     v_x1 = _mm_add_pd(v_x1, v_2);
6196
6197                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6198                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6199                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6200                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6201                                     v_x1 = _mm_add_pd(v_x1, v_2);
6202
6203                                     v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6204                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6205                                     v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6206                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6207                                 }
6208
6209                                 // 8-11
6210                                 __m128i v_X2, v_Y2;
6211                                 {
6212                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6213                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6214                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6215                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6216                                     v_x1 = _mm_add_pd(v_x1, v_2);
6217
6218                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6219                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6220                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6221                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6222                                     v_x1 = _mm_add_pd(v_x1, v_2);
6223
6224                                     v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6225                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6226                                     v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6227                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6228                                 }
6229
6230                                 // 12-15
6231                                 __m128i v_X3, v_Y3;
6232                                 {
6233                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6234                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6235                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6236                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6237                                     v_x1 = _mm_add_pd(v_x1, v_2);
6238
6239                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6240                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6241                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6242                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6243                                     v_x1 = _mm_add_pd(v_x1, v_2);
6244
6245                                     v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6246                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6247                                     v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6248                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6249                                 }
6250
6251                                 // store alpha
6252                                 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
6253                                                                  _mm_and_si128(v_X0, v_itsi1));
6254                                 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
6255                                                                  _mm_and_si128(v_X1, v_itsi1));
6256                                 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
6257
6258                                 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
6259                                                          _mm_and_si128(v_X2, v_itsi1));
6260                                 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
6261                                                          _mm_and_si128(v_X3, v_itsi1));
6262                                 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
6263
6264                                 // convert to 16s
6265                                 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
6266                                 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
6267                                 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
6268                                 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
6269
6270                                 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
6271
6272                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
6273                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
6274                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
6275                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
6276                             }
6277                         }
6278                         #endif
6279
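                        // scalar tail: same projective math in INTER_TAB_SIZE fixed point;
                        // xy receives the integer coordinates (>> INTER_BITS) and alpha the
                        // packed (Y,X) sub-pixel offset inside the interpolation table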
6280                         for( ; x1 < bw; x1++ )
6281                         {
6282                             double W = W0 + M[6]*x1;
6283                             W = W ? INTER_TAB_SIZE/W : 0;
6284                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
6285                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
6286                             int X = saturate_cast<int>(fX);
6287                             int Y = saturate_cast<int>(fY);
6288
6289                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
6290                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
6291                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
6292                                                 (X & (INTER_TAB_SIZE-1)));
6293                         }
6294                     }
6295                 }
6296
6297                 if( interpolation == INTER_NEAREST )
6298                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
6299                 else
6300                 {
6301                     Mat _matA(bh, bw, CV_16U, A);
6302                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
6303                 }
6304             }
6305         }
6306     }
6307
6308 private:
6309     Mat src;
6310     Mat dst;
6311     const double* M;
6312     int interpolation, borderType;
6313     Scalar borderValue;
6314 };
6315
6316 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
6317 class IPPWarpPerspectiveInvoker :
6318     public ParallelLoopBody
6319 {
6320 public:
6321     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
6322                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
6323         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
6324         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
6325     {
6326         *ok = true;
6327     }
6328
6329     virtual void operator() (const Range& range) const
6330     {
6331         IppiSize srcsize = {src.cols, src.rows};
6332         IppiRect srcroi = {0, 0, src.cols, src.rows};
6333         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
6334         int cnn = src.channels();
6335
6336         if( borderType == BORDER_CONSTANT )
6337         {
6338             IppiSize setSize = {dst.cols, range.end - range.start};
6339             void *dataPointer = dst.ptr(range.start);
6340             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
6341             {
6342                 *ok = false;
6343                 return;
6344             }
6345         }
6346
6347         IppStatus status = CV_INSTRUMENT_FUN_PTR_CALL_IPP(func,(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode));
6348         if (status != ippStsNoErr)
6349             *ok = false;
6350         else
6351         {
6352             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6353         }
6354     }
6355 private:
6356     Mat &src;
6357     Mat &dst;
6358     int mode;
6359     double (&coeffs)[3][3];
6360     int borderType;
6361     const Scalar borderValue;
6362     ippiWarpPerspectiveFunc func;
6363     bool *ok;
6364
6365     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
6366 };
6367 #endif
6368
6369 namespace hal {
6370
6371 void warpPerspective(int src_type,
6372                     const uchar * src_data, size_t src_step, int src_width, int src_height,
6373                     uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
6374                     const double M[9], int interpolation, int borderType, const double borderValue[4])
6375 {
6376     CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
6377     Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
6378     Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
6379
6380     Range range(0, dst.rows);
6381     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
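    // nstripes hint: roughly one parallel stripe per 2^16 destination pixels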
6382     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
6383 }
6384
6385 } // hal::
6386 } // cv::
6387
6388 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
6389                           Size dsize, int flags, int borderType, const Scalar& borderValue )
6390 {
6391     CV_INSTRUMENT_REGION()
6392
6393     CV_Assert( _src.total() > 0 );
6394
6395     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
6396                _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
6397                ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
6398                                        OCL_OP_PERSPECTIVE))
6399
6400     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
6401                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
6402                               OCL_OP_PERSPECTIVE))
6403
6404     Mat src = _src.getMat(), M0 = _M0.getMat();
6405     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
6406     Mat dst = _dst.getMat();
6407
6408     if( dst.data == src.data )
6409         src = src.clone();
6410
6411     double M[9];
6412     Mat matM(3, 3, CV_64F, M);
6413     int interpolation = flags & INTER_MAX;
6414     if( interpolation == INTER_AREA )
6415         interpolation = INTER_LINEAR;
6416
6417     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
6418     M0.convertTo(matM, matM.type());
6419
6420 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
6421     CV_IPP_CHECK()
6422     {
6423         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
6424         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
6425            (cn == 1 || cn == 3 || cn == 4) &&
6426            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
6427            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
6428         {
6429             ippiWarpPerspectiveFunc ippFunc = 0;
6430             if ((flags & WARP_INVERSE_MAP) != 0)
6431             {
6432                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
6433                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
6434                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
6435                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
6436                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
6437                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
6438                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
6439                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
6440                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
6441             }
6442             else
6443             {
6444                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
6445                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
6446                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
6447                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
6448                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
6449                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
6450                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
6451                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
6452                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
6453             }
6454             int mode =
6455             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
6456             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
6457             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
6458             CV_Assert(mode && ippFunc);
6459
6460             double coeffs[3][3];
6461             for( int i = 0; i < 3; i++ )
6462                 for( int j = 0; j < 3; j++ )
6463                     coeffs[i][j] = matM.at<double>(i, j);
6464
6465             bool ok;
6466             Range range(0, dst.rows);
6467             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
6468             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
6469             if( ok )
6470             {
6471                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6472                 return;
6473             }
6474             setIppErrorStatus();
6475         }
6476     }
6477 #endif
6478
6479     if( !(flags & WARP_INVERSE_MAP) )
6480         invert(matM, matM);
6481
6482     hal::warpPerspective(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
6483                         matM.ptr<double>(), interpolation, borderType, borderValue.val);
6484 }
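/* Usage sketch (illustrative only, not part of this file): a minimal forward
   warp with a 3x3 homography. By default the matrix maps source -> destination
   and is inverted internally (see the invert(matM, matM) call above); pass
   WARP_INVERSE_MAP when it already maps destination -> source. The matrix
   values and the input image below are hypothetical.

       #include <opencv2/imgproc.hpp>

       void warpExample(const cv::Mat& img)
       {
           cv::Mat H = (cv::Mat_<double>(3, 3) << 1.0,  0.1, 0.0,
                                                  0.0,  1.0, 0.0,
                                                  1e-4, 0.0, 1.0);
           cv::Mat warped;
           cv::warpPerspective(img, warped, H, img.size(),
                               cv::INTER_LINEAR, cv::BORDER_CONSTANT);
       }
*/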
6485
6486
6487 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
6488 {
6489     CV_INSTRUMENT_REGION()
6490
6491     angle *= CV_PI/180;
6492     double alpha = cos(angle)*scale;
6493     double beta = sin(angle)*scale;
6494
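    // The resulting 2x3 matrix is
    //   [  alpha   beta | (1-alpha)*center.x - beta*center.y ]
    //   [ -beta   alpha |  beta*center.x + (1-alpha)*center.y ]
    // so that `center` maps to itself.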
6495     Mat M(2, 3, CV_64F);
6496     double* m = M.ptr<double>();
6497
6498     m[0] = alpha;
6499     m[1] = beta;
6500     m[2] = (1-alpha)*center.x - beta*center.y;
6501     m[3] = -beta;
6502     m[4] = alpha;
6503     m[5] = beta*center.x + (1-alpha)*center.y;
6504
6505     return M;
6506 }
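/* Usage sketch (illustrative only, not part of this file): rotate an image 30
   degrees around its center at unit scale; "img" is a hypothetical input.

       #include <opencv2/imgproc.hpp>

       void rotateExample(const cv::Mat& img)
       {
           cv::Point2f c(img.cols * 0.5f, img.rows * 0.5f);
           cv::Mat R = cv::getRotationMatrix2D(c, 30.0, 1.0); // 2x3, CV_64F
           cv::Mat rotated;
           cv::warpAffine(img, rotated, R, img.size(), cv::INTER_LINEAR);
       }
*/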
6507
6508 /* Calculates coefficients of perspective transformation
6509  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
6510  *
6511  *      c00*xi + c01*yi + c02
6512  * ui = ---------------------
6513  *      c20*xi + c21*yi + c22
6514  *
6515  *      c10*xi + c11*yi + c12
6516  * vi = ---------------------
6517  *      c20*xi + c21*yi + c22
6518  *
6519  * Coefficients are calculated by solving linear system:
6520  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
6521  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
6522  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
6523  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
6524  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
6525  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
6526  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
6527  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
6528  *
6529  * where:
6530  *   cij - matrix coefficients, c22 = 1
6531  */
6532 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
6533 {
6534     CV_INSTRUMENT_REGION()
6535
6536     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
6537     double a[8][8], b[8];
6538     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
6539
6540     for( int i = 0; i < 4; ++i )
6541     {
6542         a[i][0] = a[i+4][3] = src[i].x;
6543         a[i][1] = a[i+4][4] = src[i].y;
6544         a[i][2] = a[i+4][5] = 1;
6545         a[i][3] = a[i][4] = a[i][5] =
6546         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
6547         a[i][6] = -src[i].x*dst[i].x;
6548         a[i][7] = -src[i].y*dst[i].x;
6549         a[i+4][6] = -src[i].x*dst[i].y;
6550         a[i+4][7] = -src[i].y*dst[i].y;
6551         b[i] = dst[i].x;
6552         b[i+4] = dst[i].y;
6553     }
6554
6555     solve( A, B, X, DECOMP_SVD );
6556     M.ptr<double>()[8] = 1.;
6557
6558     return M;
6559 }
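/* Usage sketch (illustrative only, not part of this file): rectify a
   quadrilateral to a 300x300 square; the corner coordinates are hypothetical.
   The returned matrix is 3x3 CV_64F with c22 fixed to 1, as derived above.

       #include <opencv2/imgproc.hpp>

       void rectifyExample(const cv::Mat& img)
       {
           cv::Point2f srcQuad[4] = { {56, 65}, {368, 52}, {28, 387}, {389, 390} };
           cv::Point2f dstQuad[4] = { {0, 0}, {300, 0}, {0, 300}, {300, 300} };
           cv::Mat H = cv::getPerspectiveTransform(srcQuad, dstQuad);
           cv::Mat rectified;
           cv::warpPerspective(img, rectified, H, cv::Size(300, 300));
       }
*/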
6560
6561 /* Calculates coefficients of affine transformation
6562  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
6563  *
6564  * ui = c00*xi + c01*yi + c02
6565  *
6566  * vi = c10*xi + c11*yi + c12
6567  *
6568  * Coefficients are calculated by solving linear system:
6569  * / x0 y0  1  0  0  0 \ /c00\ /u0\
6570  * | x1 y1  1  0  0  0 | |c01| |u1|
6571  * | x2 y2  1  0  0  0 | |c02| |u2|
6572  * |  0  0  0 x0 y0  1 | |c10| |v0|
6573  * |  0  0  0 x1 y1  1 | |c11| |v1|
6574  * \  0  0  0 x2 y2  1 / |c12| |v2|
6575  *
6576  * where:
6577  *   cij - matrix coefficients
6578  */
6579
6580 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
6581 {
6582     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
6583     double a[6*6], b[6];
6584     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
6585
6586     for( int i = 0; i < 3; i++ )
6587     {
6588         int j = i*12;
6589         int k = i*12+6;
6590         a[j] = a[k+3] = src[i].x;
6591         a[j+1] = a[k+4] = src[i].y;
6592         a[j+2] = a[k+5] = 1;
6593         a[j+3] = a[j+4] = a[j+5] = 0;
6594         a[k] = a[k+1] = a[k+2] = 0;
6595         b[i*2] = dst[i].x;
6596         b[i*2+1] = dst[i].y;
6597     }
6598
6599     solve( A, B, X );
6600     return M;
6601 }
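/* Usage sketch (illustrative only, not part of this file): an affine map from
   three point correspondences; the points are hypothetical.

       #include <opencv2/imgproc.hpp>

       void affineExample(const cv::Mat& img)
       {
           cv::Point2f srcTri[3] = { {0, 0}, {100, 0}, {0, 100} };
           cv::Point2f dstTri[3] = { {10, 10}, {110, 20}, {5, 115} };
           cv::Mat A = cv::getAffineTransform(srcTri, dstTri); // 2x3, CV_64F
           cv::Mat out;
           cv::warpAffine(img, out, A, img.size());
       }
*/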
6602
6603 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
6604 {
6605     Mat matM = _matM.getMat();
6606     CV_Assert(matM.rows == 2 && matM.cols == 3);
6607     __iM.create(2, 3, matM.type());
6608     Mat _iM = __iM.getMat();
6609
6610     if( matM.type() == CV_32F )
6611     {
6612         const float* M = matM.ptr<float>();
6613         float* iM = _iM.ptr<float>();
6614         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
6615
6616         double D = M[0]*M[step+1] - M[1]*M[step];
6617         D = D != 0 ? 1./D : 0;
6618         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
6619         double b1 = -A11*M[2] - A12*M[step+2];
6620         double b2 = -A21*M[2] - A22*M[step+2];
6621
6622         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
6623         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
6624     }
6625     else if( matM.type() == CV_64F )
6626     {
6627         const double* M = matM.ptr<double>();
6628         double* iM = _iM.ptr<double>();
6629         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
6630
6631         double D = M[0]*M[step+1] - M[1]*M[step];
6632         D = D != 0 ? 1./D : 0;
6633         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
6634         double b1 = -A11*M[2] - A12*M[step+2];
6635         double b2 = -A21*M[2] - A22*M[step+2];
6636
6637         iM[0] = A11; iM[1] = A12; iM[2] = b1;
6638         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
6639     }
6640     else
6641         CV_Error( CV_StsUnsupportedFormat, "" );
6642 }
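/* Usage sketch (illustrative only, not part of this file): warping with the
   inverted matrix undoes warping with the original, up to sampling and border
   effects; equivalently, warpAffine can be given the original matrix together
   with WARP_INVERSE_MAP. The center, angle and scale below are hypothetical.

       cv::Mat A = cv::getRotationMatrix2D(cv::Point2f(50.f, 50.f), 15.0, 2.0);
       cv::Mat Ainv;
       cv::invertAffineTransform(A, Ainv); // also 2x3, same depth as A
*/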
6643
6644 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
6645 {
6646     Mat src = _src.getMat(), dst = _dst.getMat();
6647     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
6648     return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
6649 }
6650
6651 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
6652 {
6653     Mat src = _src.getMat(), dst = _dst.getMat();
6654     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
6655     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
6656 }
6657
6658 CV_IMPL void
6659 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
6660 {
6661     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6662     CV_Assert( src.type() == dst.type() );
6663     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
6664         (double)dst.rows/src.rows, method );
6665 }
6666
6667
6668 CV_IMPL void
6669 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
6670               int flags, CvScalar fillval )
6671 {
6672     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6673     cv::Mat matrix = cv::cvarrToMat(marr);
6674     CV_Assert( src.type() == dst.type() );
6675     cv::warpAffine( src, dst, matrix, dst.size(), flags,
6676         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6677         fillval );
6678 }
6679
6680 CV_IMPL void
6681 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
6682                    int flags, CvScalar fillval )
6683 {
6684     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6685     cv::Mat matrix = cv::cvarrToMat(marr);
6686     CV_Assert( src.type() == dst.type() );
6687     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
6688         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6689         fillval );
6690 }
6691
6692 CV_IMPL void
6693 cvRemap( const CvArr* srcarr, CvArr* dstarr,
6694          const CvArr* _mapx, const CvArr* _mapy,
6695          int flags, CvScalar fillval )
6696 {
6697     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
6698     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
6699     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
6700     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
6701         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6702         fillval );
6703     CV_Assert( dst0.data == dst.data );
6704 }
6705
6706
6707 CV_IMPL CvMat*
6708 cv2DRotationMatrix( CvPoint2D32f center, double angle,
6709                     double scale, CvMat* matrix )
6710 {
6711     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
6712     CV_Assert( M.size() == M0.size() );
6713     M.convertTo(M0, M0.type());
6714     return matrix;
6715 }
6716
6717
6718 CV_IMPL CvMat*
6719 cvGetPerspectiveTransform( const CvPoint2D32f* src,
6720                           const CvPoint2D32f* dst,
6721                           CvMat* matrix )
6722 {
6723     cv::Mat M0 = cv::cvarrToMat(matrix),
6724         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
6725     CV_Assert( M.size() == M0.size() );
6726     M.convertTo(M0, M0.type());
6727     return matrix;
6728 }
6729
6730
6731 CV_IMPL CvMat*
6732 cvGetAffineTransform( const CvPoint2D32f* src,
6733                           const CvPoint2D32f* dst,
6734                           CvMat* matrix )
6735 {
6736     cv::Mat M0 = cv::cvarrToMat(matrix),
6737         M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
6738     CV_Assert( M.size() == M0.size() );
6739     M.convertTo(M0, M0.type());
6740     return matrix;
6741 }
6742
6743
6744 CV_IMPL void
6745 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
6746 {
6747     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
6748     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
6749
6750     if( arr2 )
6751         map2 = cv::cvarrToMat(arr2);
6752     if( dstarr2 )
6753     {
6754         dstmap2 = cv::cvarrToMat(dstarr2);
6755         if( dstmap2.type() == CV_16SC1 )
6756             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
6757     }
6758
6759     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
6760 }
6761
6762 /****************************************************************************************\
6763 *                                   Log-Polar Transform                                  *
6764 \****************************************************************************************/
6765
6766 /* now it is done via Remap; a more correct implementation would use
6767    some super-sampling technique outside of the "fovea" circle */
6768 CV_IMPL void
6769 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
6770             CvPoint2D32f center, double M, int flags )
6771 {
6772     Mat src_with_border; // don't scope this variable (it holds image data)
6773
6774     cv::Ptr<CvMat> mapx, mapy;
6775
6776     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
6777     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
6778     CvSize dsize;
6779
6780     if( !CV_ARE_TYPES_EQ( src, dst ))
6781         CV_Error( CV_StsUnmatchedFormats, "" );
6782
6783     if( M <= 0 )
6784         CV_Error( CV_StsOutOfRange, "M should be >0" );
6785
6786     dsize = cvGetMatSize(dst);
6787
6788     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
6789     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
6790
6791     if( !(flags & CV_WARP_INVERSE_MAP) )
6792     {
6793         int phi, rho;
6794         cv::AutoBuffer<double> _exp_tab(dsize.width);
6795         double* exp_tab = _exp_tab;
6796
6797         for( rho = 0; rho < dsize.width; rho++ )
6798             exp_tab[rho] = std::exp(rho/M) - 1.0;
6799
6800         for( phi = 0; phi < dsize.height; phi++ )
6801         {
6802             double cp = cos(phi*2*CV_PI/dsize.height);
6803             double sp = sin(phi*2*CV_PI/dsize.height);
6804             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
6805             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
6806
6807             for( rho = 0; rho < dsize.width; rho++ )
6808             {
6809                 double r = exp_tab[rho];
6810                 double x = r*cp + center.x;
6811                 double y = r*sp + center.y;
6812
6813                 mx[rho] = (float)x;
6814                 my[rho] = (float)y;
6815             }
6816         }
6817     }
6818     else
6819     {
6820         const int ANGLE_BORDER = 1;
6821         Mat src_ = cv::cvarrToMat(src);
6822         cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
6823         srcstub = src_with_border; src = &srcstub;
6824         CvSize ssize = cvGetMatSize(src);
6825         ssize.height -= 2*ANGLE_BORDER;
6826
6827         int x, y;
6828         CvMat bufx, bufy, bufp, bufa;
6829         double ascale = ssize.height/(2*CV_PI);
6830         cv::AutoBuffer<float> _buf(4*dsize.width);
6831         float* buf = _buf;
6832
6833         bufx = cvMat( 1, dsize.width, CV_32F, buf );
6834         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
6835         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
6836         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
6837
6838         for( x = 0; x < dsize.width; x++ )
6839             bufx.data.fl[x] = (float)x - center.x;
6840
6841         for( y = 0; y < dsize.height; y++ )
6842         {
6843             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
6844             float* my = (float*)(mapy->data.ptr + y*mapy->step);
6845
6846             for( x = 0; x < dsize.width; x++ )
6847                 bufy.data.fl[x] = (float)y - center.y;
6848
6849 #if 1
6850             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
6851
6852             for( x = 0; x < dsize.width; x++ )
6853                 bufp.data.fl[x] += 1.f;
6854
6855             cvLog( &bufp, &bufp );
6856
6857             for( x = 0; x < dsize.width; x++ )
6858             {
6859                 double rho = bufp.data.fl[x]*M;
6860                 double phi = bufa.data.fl[x]*ascale;
6861
6862                 mx[x] = (float)rho;
6863                 my[x] = (float)phi + ANGLE_BORDER;
6864             }
6865 #else
6866             for( x = 0; x < dsize.width; x++ )
6867             {
6868                 double xx = bufx.data.fl[x];
6869                 double yy = bufy.data.fl[x];
6870
6871                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
6872                 double a = atan2(yy,xx);
6873                 if( a < 0 )
6874                     a = 2*CV_PI + a;
6875                 a *= ascale;
6876
6877                 mx[x] = (float)p;
6878                 my[x] = (float)a;
6879             }
6880 #endif
6881         }
6882     }
6883
6884     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
6885 }
6886
6887 void cv::logPolar( InputArray _src, OutputArray _dst,
6888                    Point2f center, double M, int flags )
6889 {
6890     CV_INSTRUMENT_REGION()
6891
6892     CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
6893         ocl_logPolar(_src, _dst, center, M, flags));
6894     Mat src_with_border; // don't scope this variable (it holds image data)
6895
6896     Mat mapx, mapy;
6897
6898     Mat srcstub, src = _src.getMat();
6899     _dst.create(src.size(), src.type());
6900     Size dsize = src.size();
6901
6902     if (M <= 0)
6903         CV_Error(CV_StsOutOfRange, "M should be >0");
6904
6906     mapx.create(dsize, CV_32F);
6907     mapy.create(dsize, CV_32F);
6908
6909     if (!(flags & CV_WARP_INVERSE_MAP))
6910     {
6911         int phi, rho;
6912         cv::AutoBuffer<double> _exp_tab(dsize.width);
6913         double* exp_tab = _exp_tab;
6914
6915         for (rho = 0; rho < dsize.width; rho++)
6916             exp_tab[rho] = std::exp(rho / M) - 1.0;
6917
6918         for (phi = 0; phi < dsize.height; phi++)
6919         {
6920             double cp = cos(phi * 2 * CV_PI / dsize.height);
6921             double sp = sin(phi * 2 * CV_PI / dsize.height);
6922             float* mx = (float*)(mapx.data + phi*mapx.step);
6923             float* my = (float*)(mapy.data + phi*mapy.step);
6924
6925             for (rho = 0; rho < dsize.width; rho++)
6926             {
6927                 double r = exp_tab[rho];
6928                 double x = r*cp + center.x;
6929                 double y = r*sp + center.y;
6930
6931                 mx[rho] = (float)x;
6932                 my[rho] = (float)y;
6933             }
6934         }
6935     }
6936     else
6937     {
6938         const int ANGLE_BORDER = 1;
6939         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
6940         srcstub = src_with_border; src = srcstub;
6941         Size ssize = src.size();
6942         ssize.height -= 2 * ANGLE_BORDER;
6943
6944         int x, y;
6945         Mat bufx, bufy, bufp, bufa;
6946         double ascale = ssize.height / (2 * CV_PI);
6947
6948         bufx = Mat(1, dsize.width, CV_32F);
6949         bufy = Mat(1, dsize.width, CV_32F);
6950         bufp = Mat(1, dsize.width, CV_32F);
6951         bufa = Mat(1, dsize.width, CV_32F);
6952
6953         for (x = 0; x < dsize.width; x++)
6954             bufx.at<float>(0, x) = (float)x - center.x;
6955
6956         for (y = 0; y < dsize.height; y++)
6957         {
6958             float* mx = (float*)(mapx.data + y*mapx.step);
6959             float* my = (float*)(mapy.data + y*mapy.step);
6960
6961             for (x = 0; x < dsize.width; x++)
6962                 bufy.at<float>(0, x) = (float)y - center.y;
6963
6964 #if 1
6965             cartToPolar(bufx, bufy, bufp, bufa);
6966
6967             for (x = 0; x < dsize.width; x++)
6968                 bufp.at<float>(0, x) += 1.f;
6969
6970             log(bufp, bufp);
6971
6972             for (x = 0; x < dsize.width; x++)
6973             {
6974                 double rho = bufp.at<float>(0, x) * M;
6975                 double phi = bufa.at<float>(0, x) * ascale;
6976
6977                 mx[x] = (float)rho;
6978                 my[x] = (float)phi + ANGLE_BORDER;
6979             }
6980 #else
6981             for (x = 0; x < dsize.width; x++)
6982             {
6983                 double xx = bufx.at<float>(0, x);
6984                 double yy = bufy.at<float>(0, x);
6985                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
6986                 double a = atan2(yy, xx);
6987                 if (a < 0)
6988                     a = 2 * CV_PI + a;
6989                 a *= ascale;
6990                 mx[x] = (float)p;
6991                 my[x] = (float)a;
6992             }
6993 #endif
6994         }
6995     }
6996
6997     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
6998         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
6999 }
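/* Usage sketch (illustrative only, not part of this file): forward log-polar
   transform followed by its approximate inverse; the magnitude scale 40.0 and
   the input "img" are hypothetical. In the forward result, rotations of the
   input become vertical shifts and scalings become horizontal shifts.

       cv::Mat lp, back;
       cv::Point2f c(img.cols * 0.5f, img.rows * 0.5f);
       cv::logPolar(img, lp, c, 40.0,
                    cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
       cv::logPolar(lp, back, c, 40.0,
                    cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS | cv::WARP_INVERSE_MAP);
*/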
7000
7001 /****************************************************************************************
7002                                    Linear-Polar Transform
7003   J.L. Blanco, Apr 2009
7004  ****************************************************************************************/
7005 CV_IMPL
7006 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
7007             CvPoint2D32f center, double maxRadius, int flags )
7008 {
7009     Mat src_with_border; // don't scope this variable (it holds image data)
7010
7011     cv::Ptr<CvMat> mapx, mapy;
7012
7013     CvMat srcstub, *src = cvGetMat( srcarr, &srcstub );
7014     CvMat dststub, *dst = cvGetMat( dstarr, &dststub );
7015     CvSize dsize;
7019
7020     if( !CV_ARE_TYPES_EQ( src, dst ))
7021         CV_Error( CV_StsUnmatchedFormats, "" );
7022
7023     dsize = cvGetMatSize(dst);
7024
7025     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
7026     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
7027
7028     if( !(flags & CV_WARP_INVERSE_MAP) )
7029     {
7030         int phi, rho;
7031
7032         for( phi = 0; phi < dsize.height; phi++ )
7033         {
7034             double cp = cos(phi*2*CV_PI/dsize.height);
7035             double sp = sin(phi*2*CV_PI/dsize.height);
7036             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
7037             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
7038
7039             for( rho = 0; rho < dsize.width; rho++ )
7040             {
7041                 double r = maxRadius*rho/dsize.width;
7042                 double x = r*cp + center.x;
7043                 double y = r*sp + center.y;
7044
7045                 mx[rho] = (float)x;
7046                 my[rho] = (float)y;
7047             }
7048         }
7049     }
7050     else
7051     {
7052         const int ANGLE_BORDER = 1;
7053         Mat src_ = cv::cvarrToMat(src);
7054         cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
7055         srcstub = src_with_border; src = &srcstub;
7056         CvSize ssize = cvGetMatSize(src);
7057         ssize.height -= 2*ANGLE_BORDER;
7058
7059         int x, y;
7060         CvMat bufx, bufy, bufp, bufa;
7061         const double ascale = ssize.height/(2*CV_PI);
7062         const double pscale = ssize.width/maxRadius;
7063
7064         cv::AutoBuffer<float> _buf(4*dsize.width);
7065         float* buf = _buf;
7066
7067         bufx = cvMat( 1, dsize.width, CV_32F, buf );
7068         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
7069         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
7070         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
7071
7072         for( x = 0; x < dsize.width; x++ )
7073             bufx.data.fl[x] = (float)x - center.x;
7074
7075         for( y = 0; y < dsize.height; y++ )
7076         {
7077             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
7078             float* my = (float*)(mapy->data.ptr + y*mapy->step);
7079
7080             for( x = 0; x < dsize.width; x++ )
7081                 bufy.data.fl[x] = (float)y - center.y;
7082
7083             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
7084
7085             for( x = 0; x < dsize.width; x++ )
7086             {
7087                 double rho = bufp.data.fl[x]*pscale;
7088                 double phi = bufa.data.fl[x]*ascale;
7089                 mx[x] = (float)rho;
7090                 my[x] = (float)phi + ANGLE_BORDER;
7091             }
7092         }
7093     }
7094
7095     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
7096 }
7097
7098 void cv::linearPolar( InputArray _src, OutputArray _dst,
7099                       Point2f center, double maxRadius, int flags )
7100 {
7101     CV_INSTRUMENT_REGION()
7102
7103     CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
7104         ocl_linearPolar(_src, _dst, center, maxRadius, flags));
7105     Mat src_with_border; // don't scope this variable (it holds image data)
7106
7107     Mat mapx, mapy;
7108     Mat src = _src.getMat();
7109     _dst.create(src.size(), src.type());
7110     Size dsize = src.size();
7111
7113     mapx.create(dsize, CV_32F);
7114     mapy.create(dsize, CV_32F);
7115
7116     if (!(flags & CV_WARP_INVERSE_MAP))
7117     {
7118         int phi, rho;
7119
7120         for (phi = 0; phi < dsize.height; phi++)
7121         {
7122             double cp = cos(phi * 2 * CV_PI / dsize.height);
7123             double sp = sin(phi * 2 * CV_PI / dsize.height);
7124             float* mx = (float*)(mapx.data + phi*mapx.step);
7125             float* my = (float*)(mapy.data + phi*mapy.step);
7126
7127             for (rho = 0; rho < dsize.width; rho++)
7128             {
7129                 double r = maxRadius*rho / dsize.width;
7130                 double x = r*cp + center.x;
7131                 double y = r*sp + center.y;
7132
7133                 mx[rho] = (float)x;
7134                 my[rho] = (float)y;
7135             }
7136         }
7137     }
7138     else
7139     {
7140         const int ANGLE_BORDER = 1;
7141
7142         cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
7143         src = src_with_border;
7144         Size ssize = src_with_border.size();
7145         ssize.height -= 2 * ANGLE_BORDER;
7146
7147         int x, y;
7148         Mat bufx, bufy, bufp, bufa;
7149         const double ascale = ssize.height / (2 * CV_PI);
7150         const double pscale = ssize.width / maxRadius;
7151
7154         bufx = Mat(1, dsize.width, CV_32F);
7155         bufy = Mat(1, dsize.width, CV_32F);
7156         bufp = Mat(1, dsize.width, CV_32F);
7157         bufa = Mat(1, dsize.width, CV_32F);
7158
7159         for (x = 0; x < dsize.width; x++)
7160             bufx.at<float>(0, x) = (float)x - center.x;
7161
7162         for (y = 0; y < dsize.height; y++)
7163         {
7164             float* mx = (float*)(mapx.data + y*mapx.step);
7165             float* my = (float*)(mapy.data + y*mapy.step);
7166
7167             for (x = 0; x < dsize.width; x++)
7168                 bufy.at<float>(0, x) = (float)y - center.y;
7169
7170             cartToPolar(bufx, bufy, bufp, bufa, 0);
7171
7172             for (x = 0; x < dsize.width; x++)
7173             {
7174                 double rho = bufp.at<float>(0, x) * pscale;
7175                 double phi = bufa.at<float>(0, x) * ascale;
7176                 mx[x] = (float)rho;
7177                 my[x] = (float)phi + ANGLE_BORDER;
7178             }
7179         }
7180     }
7181
7182     remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
7183 }
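/* Usage sketch (illustrative only, not part of this file): "unroll" a
   disc-shaped region around the image center; each output row then corresponds
   to one angle and each column to a radius in [0, maxRadius]. The radius 120.0
   is hypothetical.

       cv::Mat unrolled;
       cv::Point2f c(img.cols * 0.5f, img.rows * 0.5f);
       cv::linearPolar(img, unrolled, c, 120.0,
                       cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
*/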
7184
7185 /* End of file. */