modules/imgproc/src/imgwarp.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /* ////////////////////////////////////////////////////////////////////
  44 //
  45 //  Geometrical transforms on images and matrices: rotation, zoom etc.
  46 //
  47 // */
  48
  49 #include "precomp.hpp"
  50 #include "opencl_kernels_imgproc.hpp"
  51
  52 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
  53 static IppStatus sts = ippInit();
  54 #endif
  55
  56 namespace cv
  57 {
  58 #if IPP_VERSION_X100 >= 701
  59     typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
  60     typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
  61     typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
  62 #endif
  63
  64 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
  65     typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
  66     typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
  67     typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
  68
  69     template <int channels, typename Type>
  70     bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
  71     {
  72         Type values[channels];
  73         for( int i = 0; i < channels; i++ )
  74             values[i] = saturate_cast<Type>(value[i]);
  75         return func(values, dataPointer, step, size) >= 0;
  76     }
  77
  78     static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
  79     {
  80         if( channels == 1 )
  81         {
  82             switch( depth )
  83             {
  84             case CV_8U:
  85                 return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
  86             case CV_16U:
  87                 return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
  88             case CV_32F:
  89                 return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
  90             }
  91         }
  92         else
  93         {
  94             if( channels == 3 )
  95             {
  96                 switch( depth )
  97                 {
  98                 case CV_8U:
  99                     return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
 100                 case CV_16U:
 101                     return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
 102                 case CV_32F:
 103                     return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
 104                 }
 105             }
 106             else if( channels == 4 )
 107             {
 108                 switch( depth )
 109                 {
 110                 case CV_8U:
 111                     return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
 112                 case CV_16U:
 113                     return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
 114                 case CV_32F:
 115                     return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
 116                 }
 117             }
 118         }
 119         return false;
 120     }
 121 #endif
 122
 123 /************** interpolation formulas and tables ***************/
 124
 125 const int INTER_RESIZE_COEF_BITS=11;
 126 const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
 127
 128 const int INTER_REMAP_COEF_BITS=15;
 129 const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
 130
 131 static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
 132
 133 static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
 134 static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
 135
 136 #if CV_SSE2 || CV_NEON
 137 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
 138 static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
 139 #endif
 140
 141 static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
 142 static short BicubicTab_i[INTER_TAB_SIZE2][4][4];
 143
 144 static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
 145 static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
 146
 147 static inline void interpolateLinear( float x, float* coeffs )
 148 {
 149     coeffs[0] = 1.f - x;
 150     coeffs[1] = x;
 151 }
 152
 153 static inline void interpolateCubic( float x, float* coeffs )
 154 {
 155     const float A = -0.75f;
 156
 157     coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
 158     coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
 159     coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
 160     coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
 161 }
 162
 163 static inline void interpolateLanczos4( float x, float* coeffs )
 164 {
 165     static const double s45 = 0.70710678118654752440084436210485;
 166     static const double cs[][2]=
 167     {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
 168
 169     if( x < FLT_EPSILON )
 170     {
 171         for( int i = 0; i < 8; i++ )
 172             coeffs[i] = 0;
 173         coeffs[3] = 1;
 174         return;
 175     }
 176
 177     float sum = 0;
 178     double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
 179     for(int i = 0; i < 8; i++ )
 180     {
 181         double y = -(x+3-i)*CV_PI*0.25;
 182         coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
 183         sum += coeffs[i];
 184     }
 185
 186     sum = 1.f/sum;
 187     for(int i = 0; i < 8; i++ )
 188         coeffs[i] *= sum;
 189 }
 190
 191 static void initInterTab1D(int method, float* tab, int tabsz)
 192 {
 193     float scale = 1.f/tabsz;
 194     if( method == INTER_LINEAR )
 195     {
 196         for( int i = 0; i < tabsz; i++, tab += 2 )
 197             interpolateLinear( i*scale, tab );
 198     }
 199     else if( method == INTER_CUBIC )
 200     {
 201         for( int i = 0; i < tabsz; i++, tab += 4 )
 202             interpolateCubic( i*scale, tab );
 203     }
 204     else if( method == INTER_LANCZOS4 )
 205     {
 206         for( int i = 0; i < tabsz; i++, tab += 8 )
 207             interpolateLanczos4( i*scale, tab );
 208     }
 209     else
 210         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
 211 }
 212
 213
 214 static const void* initInterTab2D( int method, bool fixpt )
 215 {
 216     static bool inittab[INTER_MAX+1] = {false};
 217     float* tab = 0;
 218     short* itab = 0;
 219     int ksize = 0;
 220     if( method == INTER_LINEAR )
 221         tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
 222     else if( method == INTER_CUBIC )
 223         tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
 224     else if( method == INTER_LANCZOS4 )
 225         tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
 226     else
 227         CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
 228
 229     if( !inittab[method] )
 230     {
 231         AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
 232         int i, j, k1, k2;
 233         initInterTab1D(method, _tab, INTER_TAB_SIZE);
 234         for( i = 0; i < INTER_TAB_SIZE; i++ )
 235             for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
 236             {
 237                 int isum = 0;
 238                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
 239                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
 240
 241                 for( k1 = 0; k1 < ksize; k1++ )
 242                 {
 243                     float vy = _tab[i*ksize + k1];
 244                     for( k2 = 0; k2 < ksize; k2++ )
 245                     {
 246                         float v = vy*_tab[j*ksize + k2];
 247                         tab[k1*ksize + k2] = v;
 248                         isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
 249                     }
 250                 }
 251
 252                 if( isum != INTER_REMAP_COEF_SCALE )
 253                 {
 254                     int diff = isum - INTER_REMAP_COEF_SCALE;
 255                     int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
 256                     for( k1 = ksize2; k1 < ksize2+2; k1++ )
 257                         for( k2 = ksize2; k2 < ksize2+2; k2++ )
 258                         {
 259                             if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
 260                                 mk1 = k1, mk2 = k2;
 261                             else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
 262                                 Mk1 = k1, Mk2 = k2;
 263                         }
 264                     if( diff < 0 )
 265                         itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
 266                     else
 267                         itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
 268                 }
 269             }
 270         tab -= INTER_TAB_SIZE2*ksize*ksize;
 271         itab -= INTER_TAB_SIZE2*ksize*ksize;
 272 #if CV_SSE2 || CV_NEON
 273         if( method == INTER_LINEAR )
 274         {
 275             for( i = 0; i < INTER_TAB_SIZE2; i++ )
 276                 for( j = 0; j < 4; j++ )
 277                 {
 278                     BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
 279                     BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
 280                     BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
 281                     BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
 282                 }
 283         }
 284 #endif
 285         inittab[method] = true;
 286     }
 287     return fixpt ? (const void*)itab : (const void*)tab;
 288 }
 289
 290 #ifndef __MINGW32__
 291 static bool initAllInterTab2D()
 292 {
 293     return  initInterTab2D( INTER_LINEAR, false ) &&
 294             initInterTab2D( INTER_LINEAR, true ) &&
 295             initInterTab2D( INTER_CUBIC, false ) &&
 296             initInterTab2D( INTER_CUBIC, true ) &&
 297             initInterTab2D( INTER_LANCZOS4, false ) &&
 298             initInterTab2D( INTER_LANCZOS4, true );
 299 }
 300
 301 static volatile bool doInitAllInterTab2D = initAllInterTab2D();
 302 #endif
 303
 304 template<typename ST, typename DT> struct Cast
 305 {
 306     typedef ST type1;
 307     typedef DT rtype;
 308
 309     DT operator()(ST val) const { return saturate_cast<DT>(val); }
 310 };
 311
 312 template<typename ST, typename DT, int bits> struct FixedPtCast
 313 {
 314     typedef ST type1;
 315     typedef DT rtype;
 316     enum { SHIFT = bits, DELTA = 1 << (bits-1) };
 317
 318     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
 319 };
 320
 321 /****************************************************************************************\
 322 *                                         Resize                                         *
 323 \****************************************************************************************/
 324
 325 class resizeNNInvoker :
 326     public ParallelLoopBody
 327 {
 328 public:
 329     resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
 330         ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
 331         ify(_ify)
 332     {
 333     }
 334
 335     virtual void operator() (const Range& range) const
 336     {
 337         Size ssize = src.size(), dsize = dst.size();
 338         int y, x, pix_size = (int)src.elemSize();
 339
 340         for( y = range.start; y < range.end; y++ )
 341         {
 342             uchar* D = dst.data + dst.step*y;
 343             int sy = std::min(cvFloor(y*ify), ssize.height-1);
 344             const uchar* S = src.ptr(sy);
 345
 346             switch( pix_size )
 347             {
 348             case 1:
 349                 for( x = 0; x <= dsize.width - 2; x += 2 )
 350                 {
 351                     uchar t0 = S[x_ofs[x]];
 352                     uchar t1 = S[x_ofs[x+1]];
 353                     D[x] = t0;
 354                     D[x+1] = t1;
 355                 }
 356
 357                 for( ; x < dsize.width; x++ )
 358                     D[x] = S[x_ofs[x]];
 359                 break;
 360             case 2:
 361                 for( x = 0; x < dsize.width; x++ )
 362                     *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
 363                 break;
 364             case 3:
 365                 for( x = 0; x < dsize.width; x++, D += 3 )
 366                 {
 367                     const uchar* _tS = S + x_ofs[x];
 368                     D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
 369                 }
 370                 break;
 371             case 4:
 372                 for( x = 0; x < dsize.width; x++ )
 373                     *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
 374                 break;
 375             case 6:
 376                 for( x = 0; x < dsize.width; x++, D += 6 )
 377                 {
 378                     const ushort* _tS = (const ushort*)(S + x_ofs[x]);
 379                     ushort* _tD = (ushort*)D;
 380                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
 381                 }
 382                 break;
 383             case 8:
 384                 for( x = 0; x < dsize.width; x++, D += 8 )
 385                 {
 386                     const int* _tS = (const int*)(S + x_ofs[x]);
 387                     int* _tD = (int*)D;
 388                     _tD[0] = _tS[0]; _tD[1] = _tS[1];
 389                 }
 390                 break;
 391             case 12:
 392                 for( x = 0; x < dsize.width; x++, D += 12 )
 393                 {
 394                     const int* _tS = (const int*)(S + x_ofs[x]);
 395                     int* _tD = (int*)D;
 396                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
 397                 }
 398                 break;
 399             default:
 400                 for( x = 0; x < dsize.width; x++, D += pix_size )
 401                 {
 402                     const int* _tS = (const int*)(S + x_ofs[x]);
 403                     int* _tD = (int*)D;
 404                     for( int k = 0; k < pix_size4; k++ )
 405                         _tD[k] = _tS[k];
 406                 }
 407             }
 408         }
 409     }
 410
 411 private:
 412     const Mat src;
 413     Mat dst;
 414     int* x_ofs, pix_size4;
 415     double ify;
 416
 417     resizeNNInvoker(const resizeNNInvoker&);
 418     resizeNNInvoker& operator=(const resizeNNInvoker&);
 419 };
 420
 421 static void
 422 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
 423 {
 424     Size ssize = src.size(), dsize = dst.size();
 425     AutoBuffer<int> _x_ofs(dsize.width);
 426     int* x_ofs = _x_ofs;
 427     int pix_size = (int)src.elemSize();
 428     int pix_size4 = (int)(pix_size / sizeof(int));
 429     double ifx = 1./fx, ify = 1./fy;
 430     int x;
 431
 432     for( x = 0; x < dsize.width; x++ )
 433     {
 434         int sx = cvFloor(x*ifx);
 435         x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
 436     }
 437
 438     Range range(0, dsize.height);
 439     resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
 440     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 441 }
 442
 443
 444 struct VResizeNoVec
 445 {
 446     int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
 447 };
 448
 449 struct HResizeNoVec
 450 {
 451     int operator()(const uchar**, uchar**, int, const int*,
 452         const uchar*, int, int, int, int, int) const { return 0; }
 453 };
 454
 455 #if CV_SSE2
 456
 457 struct VResizeLinearVec_32s8u
 458 {
 459     int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
 460     {
 461         if( !checkHardwareSupport(CV_CPU_SSE2) )
 462             return 0;
 463
 464         const int** src = (const int**)_src;
 465         const short* beta = (const short*)_beta;
 466         const int *S0 = src[0], *S1 = src[1];
 467         int x = 0;
 468         __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
 469         __m128i delta = _mm_set1_epi16(2);
 470
 471         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 472             for( ; x <= width - 16; x += 16 )
 473             {
 474                 __m128i x0, x1, x2, y0, y1, y2;
 475                 x0 = _mm_load_si128((const __m128i*)(S0 + x));
 476                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
 477                 y0 = _mm_load_si128((const __m128i*)(S1 + x));
 478                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
 479                 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
 480                 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
 481
 482                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
 483                 x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
 484                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
 485                 y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
 486                 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
 487                 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
 488
 489                 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
 490                 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
 491
 492                 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 493                 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
 494                 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
 495             }
 496         else
 497             for( ; x <= width - 16; x += 16 )
 498             {
 499                 __m128i x0, x1, x2, y0, y1, y2;
 500                 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
 501                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
 502                 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
 503                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
 504                 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
 505                 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
 506
 507                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
 508                 x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
 509                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
 510                 y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
 511                 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
 512                 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
 513
 514                 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
 515                 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
 516
 517                 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 518                 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
 519                 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
 520             }
 521
 522         for( ; x < width - 4; x += 4 )
 523         {
 524             __m128i x0, y0;
 525             x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
 526             y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
 527             x0 = _mm_packs_epi32(x0, x0);
 528             y0 = _mm_packs_epi32(y0, y0);
 529             x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
 530             x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 531             x0 = _mm_packus_epi16(x0, x0);
 532             *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
 533         }
 534
 535         return x;
 536     }
 537 };
 538
 539
 540 template<int shiftval> struct VResizeLinearVec_32f16
 541 {
 542     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 543     {
 544         if( !checkHardwareSupport(CV_CPU_SSE2) )
 545             return 0;
 546
 547         const float** src = (const float**)_src;
 548         const float* beta = (const float*)_beta;
 549         const float *S0 = src[0], *S1 = src[1];
 550         ushort* dst = (ushort*)_dst;
 551         int x = 0;
 552
 553         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
 554         __m128i preshift = _mm_set1_epi32(shiftval);
 555         __m128i postshift = _mm_set1_epi16((short)shiftval);
 556
 557         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 558             for( ; x <= width - 16; x += 16 )
 559             {
 560                 __m128 x0, x1, y0, y1;
 561                 __m128i t0, t1, t2;
 562                 x0 = _mm_load_ps(S0 + x);
 563                 x1 = _mm_load_ps(S0 + x + 4);
 564                 y0 = _mm_load_ps(S1 + x);
 565                 y1 = _mm_load_ps(S1 + x + 4);
 566
 567                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 568                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 569                 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 570                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 571                 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
 572
 573                 x0 = _mm_load_ps(S0 + x + 8);
 574                 x1 = _mm_load_ps(S0 + x + 12);
 575                 y0 = _mm_load_ps(S1 + x + 8);
 576                 y1 = _mm_load_ps(S1 + x + 12);
 577
 578                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 579                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 580                 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 581                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 582                 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
 583
 584                 _mm_storeu_si128( (__m128i*)(dst + x), t0);
 585                 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
 586             }
 587         else
 588             for( ; x <= width - 16; x += 16 )
 589             {
 590                 __m128 x0, x1, y0, y1;
 591                 __m128i t0, t1, t2;
 592                 x0 = _mm_loadu_ps(S0 + x);
 593                 x1 = _mm_loadu_ps(S0 + x + 4);
 594                 y0 = _mm_loadu_ps(S1 + x);
 595                 y1 = _mm_loadu_ps(S1 + x + 4);
 596
 597                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 598                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 599                 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 600                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 601                 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
 602
 603                 x0 = _mm_loadu_ps(S0 + x + 8);
 604                 x1 = _mm_loadu_ps(S0 + x + 12);
 605                 y0 = _mm_loadu_ps(S1 + x + 8);
 606                 y1 = _mm_loadu_ps(S1 + x + 12);
 607
 608                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 609                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 610                 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 611                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 612                 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
 613
 614                 _mm_storeu_si128( (__m128i*)(dst + x), t0);
 615                 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
 616             }
 617
 618         for( ; x < width - 4; x += 4 )
 619         {
 620             __m128 x0, y0;
 621             __m128i t0;
 622             x0 = _mm_loadu_ps(S0 + x);
 623             y0 = _mm_loadu_ps(S1 + x);
 624
 625             x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 626             t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 627             t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
 628             _mm_storel_epi64( (__m128i*)(dst + x), t0);
 629         }
 630
 631         return x;
 632     }
 633 };
 634
 635 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
 636 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
 637
 638 struct VResizeLinearVec_32f
 639 {
 640     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 641     {
 642         if( !checkHardwareSupport(CV_CPU_SSE) )
 643             return 0;
 644
 645         const float** src = (const float**)_src;
 646         const float* beta = (const float*)_beta;
 647         const float *S0 = src[0], *S1 = src[1];
 648         float* dst = (float*)_dst;
 649         int x = 0;
 650
 651         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
 652
 653         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 654             for( ; x <= width - 8; x += 8 )
 655             {
 656                 __m128 x0, x1, y0, y1;
 657                 x0 = _mm_load_ps(S0 + x);
 658                 x1 = _mm_load_ps(S0 + x + 4);
 659                 y0 = _mm_load_ps(S1 + x);
 660                 y1 = _mm_load_ps(S1 + x + 4);
 661
 662                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 663                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 664
 665                 _mm_storeu_ps( dst + x, x0);
 666                 _mm_storeu_ps( dst + x + 4, x1);
 667             }
 668         else
 669             for( ; x <= width - 8; x += 8 )
 670             {
 671                 __m128 x0, x1, y0, y1;
 672                 x0 = _mm_loadu_ps(S0 + x);
 673                 x1 = _mm_loadu_ps(S0 + x + 4);
 674                 y0 = _mm_loadu_ps(S1 + x);
 675                 y1 = _mm_loadu_ps(S1 + x + 4);
 676
 677                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 678                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 679
 680                 _mm_storeu_ps( dst + x, x0);
 681                 _mm_storeu_ps( dst + x + 4, x1);
 682             }
 683
 684         return x;
 685     }
 686 };
 687
 688
 689 struct VResizeCubicVec_32s8u
 690 {
 691     int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
 692     {
 693         if( !checkHardwareSupport(CV_CPU_SSE2) )
 694             return 0;
 695
 696         const int** src = (const int**)_src;
 697         const short* beta = (const short*)_beta;
 698         const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 699         int x = 0;
 700         float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
 701         __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
 702             b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
 703
 704         if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
 705             for( ; x <= width - 8; x += 8 )
 706             {
 707                 __m128i x0, x1, y0, y1;
 708                 __m128 s0, s1, f0, f1;
 709                 x0 = _mm_load_si128((const __m128i*)(S0 + x));
 710                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
 711                 y0 = _mm_load_si128((const __m128i*)(S1 + x));
 712                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
 713
 714                 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
 715                 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
 716                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
 717                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
 718                 s0 = _mm_add_ps(s0, f0);
 719                 s1 = _mm_add_ps(s1, f1);
 720
 721                 x0 = _mm_load_si128((const __m128i*)(S2 + x));
 722                 x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
 723                 y0 = _mm_load_si128((const __m128i*)(S3 + x));
 724                 y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
 725
 726                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
 727                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
 728                 s0 = _mm_add_ps(s0, f0);
 729                 s1 = _mm_add_ps(s1, f1);
 730                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
 731                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
 732                 s0 = _mm_add_ps(s0, f0);
 733                 s1 = _mm_add_ps(s1, f1);
 734
 735                 x0 = _mm_cvtps_epi32(s0);
 736                 x1 = _mm_cvtps_epi32(s1);
 737
 738                 x0 = _mm_packs_epi32(x0, x1);
 739                 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
 740             }
 741         else
 742             for( ; x <= width - 8; x += 8 )
 743             {
 744                 __m128i x0, x1, y0, y1;
 745                 __m128 s0, s1, f0, f1;
 746                 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
 747                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
 748                 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
 749                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
 750
 751                 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
 752                 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
 753                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
 754                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
 755                 s0 = _mm_add_ps(s0, f0);
 756                 s1 = _mm_add_ps(s1, f1);
 757
 758                 x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
 759                 x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
 760                 y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
 761                 y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
 762
 763                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
 764                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
 765                 s0 = _mm_add_ps(s0, f0);
 766                 s1 = _mm_add_ps(s1, f1);
 767                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
 768                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
 769                 s0 = _mm_add_ps(s0, f0);
 770                 s1 = _mm_add_ps(s1, f1);
 771
 772                 x0 = _mm_cvtps_epi32(s0);
 773                 x1 = _mm_cvtps_epi32(s1);
 774
 775                 x0 = _mm_packs_epi32(x0, x1);
 776                 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
 777             }
 778
 779         return x;
 780     }
 781 };
 782
 783
 784 template<int shiftval> struct VResizeCubicVec_32f16
 785 {
 786     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 787     {
 788         if( !checkHardwareSupport(CV_CPU_SSE2) )
 789             return 0;
 790
 791         const float** src = (const float**)_src;
 792         const float* beta = (const float*)_beta;
 793         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 794         ushort* dst = (ushort*)_dst;
 795         int x = 0;
 796         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
 797             b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
 798         __m128i preshift = _mm_set1_epi32(shiftval);
 799         __m128i postshift = _mm_set1_epi16((short)shiftval);
 800
 801         for( ; x <= width - 8; x += 8 )
 802         {
 803             __m128 x0, x1, y0, y1, s0, s1;
 804             __m128i t0, t1;
 805             x0 = _mm_loadu_ps(S0 + x);
 806             x1 = _mm_loadu_ps(S0 + x + 4);
 807             y0 = _mm_loadu_ps(S1 + x);
 808             y1 = _mm_loadu_ps(S1 + x + 4);
 809
 810             s0 = _mm_mul_ps(x0, b0);
 811             s1 = _mm_mul_ps(x1, b0);
 812             y0 = _mm_mul_ps(y0, b1);
 813             y1 = _mm_mul_ps(y1, b1);
 814             s0 = _mm_add_ps(s0, y0);
 815             s1 = _mm_add_ps(s1, y1);
 816
 817             x0 = _mm_loadu_ps(S2 + x);
 818             x1 = _mm_loadu_ps(S2 + x + 4);
 819             y0 = _mm_loadu_ps(S3 + x);
 820             y1 = _mm_loadu_ps(S3 + x + 4);
 821
 822             x0 = _mm_mul_ps(x0, b2);
 823             x1 = _mm_mul_ps(x1, b2);
 824             y0 = _mm_mul_ps(y0, b3);
 825             y1 = _mm_mul_ps(y1, b3);
 826             s0 = _mm_add_ps(s0, x0);
 827             s1 = _mm_add_ps(s1, x1);
 828             s0 = _mm_add_ps(s0, y0);
 829             s1 = _mm_add_ps(s1, y1);
 830
 831             t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
 832             t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
 833
 834             t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
 835             _mm_storeu_si128( (__m128i*)(dst + x), t0);
 836         }
 837
 838         return x;
 839     }
 840 };
 841
 842 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
 843 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
 844
 845 struct VResizeCubicVec_32f
 846 {
 847     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 848     {
 849         if( !checkHardwareSupport(CV_CPU_SSE) )
 850             return 0;
 851
 852         const float** src = (const float**)_src;
 853         const float* beta = (const float*)_beta;
 854         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 855         float* dst = (float*)_dst;
 856         int x = 0;
 857         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
 858             b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
 859
 860         for( ; x <= width - 8; x += 8 )
 861         {
 862             __m128 x0, x1, y0, y1, s0, s1;
 863             x0 = _mm_loadu_ps(S0 + x);
 864             x1 = _mm_loadu_ps(S0 + x + 4);
 865             y0 = _mm_loadu_ps(S1 + x);
 866             y1 = _mm_loadu_ps(S1 + x + 4);
 867
 868             s0 = _mm_mul_ps(x0, b0);
 869             s1 = _mm_mul_ps(x1, b0);
 870             y0 = _mm_mul_ps(y0, b1);
 871             y1 = _mm_mul_ps(y1, b1);
 872             s0 = _mm_add_ps(s0, y0);
 873             s1 = _mm_add_ps(s1, y1);
 874
 875             x0 = _mm_loadu_ps(S2 + x);
 876             x1 = _mm_loadu_ps(S2 + x + 4);
 877             y0 = _mm_loadu_ps(S3 + x);
 878             y1 = _mm_loadu_ps(S3 + x + 4);
 879
 880             x0 = _mm_mul_ps(x0, b2);
 881             x1 = _mm_mul_ps(x1, b2);
 882             y0 = _mm_mul_ps(y0, b3);
 883             y1 = _mm_mul_ps(y1, b3);
 884             s0 = _mm_add_ps(s0, x0);
 885             s1 = _mm_add_ps(s1, x1);
 886             s0 = _mm_add_ps(s0, y0);
 887             s1 = _mm_add_ps(s1, y1);
 888
 889             _mm_storeu_ps( dst + x, s0);
 890             _mm_storeu_ps( dst + x + 4, s1);
 891         }
 892
 893         return x;
 894     }
 895 };
 896
 897 #elif CV_NEON
 898
 899 struct VResizeLinearVec_32s8u
 900 {
 901     int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
 902     {
 903         const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
 904         const short* beta = (const short*)_beta;
 905         int x = 0;
 906         int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);
 907
 908         for( ; x <= width - 16; x += 16)
 909         {
 910             int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
 911             int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);
 912
 913             int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
 914             int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
 915
 916             int16x8_t v_dst0 = vmlaq_s16(vmulq_s16(v_src0, v_b0), v_src1, v_b1);
 917             v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);
 918
 919             v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
 920             v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);
 921
 922             v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
 923             v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
 924
 925             int16x8_t v_dst1 = vmlaq_s16(vmulq_s16(v_src0, v_b0), v_src1, v_b1);
 926             v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);
 927
 928             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
 929         }
 930
 931         return x;
 932     }
 933 };
 934
 935 struct VResizeLinearVec_32f16u
 936 {
 937     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 938     {
 939         const float** src = (const float**)_src;
 940         const float* beta = (const float*)_beta;
 941         const float *S0 = src[0], *S1 = src[1];
 942         ushort* dst = (ushort*)_dst;
 943         int x = 0;
 944
 945         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
 946
 947         for( ; x <= width - 8; x += 8 )
 948         {
 949             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
 950             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
 951
 952             float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
 953             float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
 954
 955             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
 956                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
 957         }
 958
 959         return x;
 960     }
 961 };
 962
 963 struct VResizeLinearVec_32f16s
 964 {
 965     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 966     {
 967         const float** src = (const float**)_src;
 968         const float* beta = (const float*)_beta;
 969         const float *S0 = src[0], *S1 = src[1];
 970         short* dst = (short*)_dst;
 971         int x = 0;
 972
 973         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
 974
 975         for( ; x <= width - 8; x += 8 )
 976         {
 977             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
 978             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
 979
 980             float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
 981             float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
 982
 983             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
 984                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
 985         }
 986
 987         return x;
 988     }
 989 };
 990
 991 struct VResizeLinearVec_32f
 992 {
 993     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 994     {
 995         const float** src = (const float**)_src;
 996         const float* beta = (const float*)_beta;
 997         const float *S0 = src[0], *S1 = src[1];
 998         float* dst = (float*)_dst;
 999         int x = 0;
1000
1001         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
1002
1003         for( ; x <= width - 8; x += 8 )
1004         {
1005             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
1006             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
1007
1008             vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
1009             vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
1010         }
1011
1012         return x;
1013     }
1014 };
1015
1016 typedef VResizeNoVec VResizeCubicVec_32s8u;
1017
1018 struct VResizeCubicVec_32f16u
1019 {
1020     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1021     {
1022         const float** src = (const float**)_src;
1023         const float* beta = (const float*)_beta;
1024         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1025         ushort* dst = (ushort*)_dst;
1026         int x = 0;
1027         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1028                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1029
1030         for( ; x <= width - 8; x += 8 )
1031         {
1032             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1033                                                                          v_b1, vld1q_f32(S1 + x)),
1034                                                                          v_b2, vld1q_f32(S2 + x)),
1035                                                                          v_b3, vld1q_f32(S3 + x));
1036             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1037                                                                          v_b1, vld1q_f32(S1 + x + 4)),
1038                                                                          v_b2, vld1q_f32(S2 + x + 4)),
1039                                                                          v_b3, vld1q_f32(S3 + x + 4));
1040
1041             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
1042                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
1043         }
1044
1045         return x;
1046     }
1047 };
1048
1049 struct VResizeCubicVec_32f16s
1050 {
1051     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1052     {
1053         const float** src = (const float**)_src;
1054         const float* beta = (const float*)_beta;
1055         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1056         short* dst = (short*)_dst;
1057         int x = 0;
1058         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1059                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1060
1061         for( ; x <= width - 8; x += 8 )
1062         {
1063             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1064                                                                          v_b1, vld1q_f32(S1 + x)),
1065                                                                          v_b2, vld1q_f32(S2 + x)),
1066                                                                          v_b3, vld1q_f32(S3 + x));
1067             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1068                                                                          v_b1, vld1q_f32(S1 + x + 4)),
1069                                                                          v_b2, vld1q_f32(S2 + x + 4)),
1070                                                                          v_b3, vld1q_f32(S3 + x + 4));
1071
1072             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
1073                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
1074         }
1075
1076         return x;
1077     }
1078 };
1079
1080 struct VResizeCubicVec_32f
1081 {
1082     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1083     {
1084         const float** src = (const float**)_src;
1085         const float* beta = (const float*)_beta;
1086         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1087         float* dst = (float*)_dst;
1088         int x = 0;
1089         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1090                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1091
1092         for( ; x <= width - 8; x += 8 )
1093         {
1094             vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1095                                                                        v_b1, vld1q_f32(S1 + x)),
1096                                                                        v_b2, vld1q_f32(S2 + x)),
1097                                                                        v_b3, vld1q_f32(S3 + x)));
1098             vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1099                                                                           v_b1, vld1q_f32(S1 + x + 4)),
1100                                                                           v_b2, vld1q_f32(S2 + x + 4)),
1101                                                                           v_b3, vld1q_f32(S3 + x + 4)));
1102         }
1103
1104         return x;
1105     }
1106 };
1107
1108 #else
1109
1110 typedef VResizeNoVec VResizeLinearVec_32s8u;
1111 typedef VResizeNoVec VResizeLinearVec_32f16u;
1112 typedef VResizeNoVec VResizeLinearVec_32f16s;
1113 typedef VResizeNoVec VResizeLinearVec_32f;
1114
1115 typedef VResizeNoVec VResizeCubicVec_32s8u;
1116 typedef VResizeNoVec VResizeCubicVec_32f16u;
1117 typedef VResizeNoVec VResizeCubicVec_32f16s;
1118 typedef VResizeNoVec VResizeCubicVec_32f;
1119
1120 #endif
1121
1122 typedef HResizeNoVec HResizeLinearVec_8u32s;
1123 typedef HResizeNoVec HResizeLinearVec_16u32f;
1124 typedef HResizeNoVec HResizeLinearVec_16s32f;
1125 typedef HResizeNoVec HResizeLinearVec_32f;
1126 typedef HResizeNoVec HResizeLinearVec_64f;
1127
1128
1129 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1130 struct HResizeLinear
1131 {
1132     typedef T value_type;
1133     typedef WT buf_type;
1134     typedef AT alpha_type;
1135
1136     void operator()(const T** src, WT** dst, int count,
1137                     const int* xofs, const AT* alpha,
1138                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1139     {
1140         int dx, k;
1141         VecOp vecOp;
1142
1143         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
1144             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
1145
1146         for( k = 0; k <= count - 2; k++ )
1147         {
1148             const T *S0 = src[k], *S1 = src[k+1];
1149             WT *D0 = dst[k], *D1 = dst[k+1];
1150             for( dx = dx0; dx < xmax; dx++ )
1151             {
1152                 int sx = xofs[dx];
1153                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1154                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1155                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1156                 D0[dx] = t0; D1[dx] = t1;
1157             }
1158
1159             for( ; dx < dwidth; dx++ )
1160             {
1161                 int sx = xofs[dx];
1162                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
1163             }
1164         }
1165
1166         for( ; k < count; k++ )
1167         {
1168             const T *S = src[k];
1169             WT *D = dst[k];
1170             for( dx = 0; dx < xmax; dx++ )
1171             {
1172                 int sx = xofs[dx];
1173                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1174             }
1175
1176             for( ; dx < dwidth; dx++ )
1177                 D[dx] = WT(S[xofs[dx]]*ONE);
1178         }
1179     }
1180 };
1181
1182
1183 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1184 struct VResizeLinear
1185 {
1186     typedef T value_type;
1187     typedef WT buf_type;
1188     typedef AT alpha_type;
1189
1190     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1191     {
1192         WT b0 = beta[0], b1 = beta[1];
1193         const WT *S0 = src[0], *S1 = src[1];
1194         CastOp castOp;
1195         VecOp vecOp;
1196
1197         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1198         #if CV_ENABLE_UNROLLED
1199         for( ; x <= width - 4; x += 4 )
1200         {
1201             WT t0, t1;
1202             t0 = S0[x]*b0 + S1[x]*b1;
1203             t1 = S0[x+1]*b0 + S1[x+1]*b1;
1204             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1205             t0 = S0[x+2]*b0 + S1[x+2]*b1;
1206             t1 = S0[x+3]*b0 + S1[x+3]*b1;
1207             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1208         }
1209         #endif
1210         for( ; x < width; x++ )
1211             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
1212     }
1213 };
1214
1215 template<>
1216 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1217 {
1218     typedef uchar value_type;
1219     typedef int buf_type;
1220     typedef short alpha_type;
1221
1222     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1223     {
1224         alpha_type b0 = beta[0], b1 = beta[1];
1225         const buf_type *S0 = src[0], *S1 = src[1];
1226         VResizeLinearVec_32s8u vecOp;
1227
1228         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1229         #if CV_ENABLE_UNROLLED
1230         for( ; x <= width - 4; x += 4 )
1231         {
1232             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1233             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1234             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1235             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1236         }
1237         #endif
1238         for( ; x < width; x++ )
1239             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
1240     }
1241 };
1242
1243
1244 template<typename T, typename WT, typename AT>
1245 struct HResizeCubic
1246 {
1247     typedef T value_type;
1248     typedef WT buf_type;
1249     typedef AT alpha_type;
1250
1251     void operator()(const T** src, WT** dst, int count,
1252                     const int* xofs, const AT* alpha,
1253                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1254     {
1255         for( int k = 0; k < count; k++ )
1256         {
1257             const T *S = src[k];
1258             WT *D = dst[k];
1259             int dx = 0, limit = xmin;
1260             for(;;)
1261             {
1262                 for( ; dx < limit; dx++, alpha += 4 )
1263                 {
1264                     int j, sx = xofs[dx] - cn;
1265                     WT v = 0;
1266                     for( j = 0; j < 4; j++ )
1267                     {
1268                         int sxj = sx + j*cn;
1269                         if( (unsigned)sxj >= (unsigned)swidth )
1270                         {
1271                             while( sxj < 0 )
1272                                 sxj += cn;
1273                             while( sxj >= swidth )
1274                                 sxj -= cn;
1275                         }
1276                         v += S[sxj]*alpha[j];
1277                     }
1278                     D[dx] = v;
1279                 }
1280                 if( limit == dwidth )
1281                     break;
1282                 for( ; dx < xmax; dx++, alpha += 4 )
1283                 {
1284                     int sx = xofs[dx];
1285                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1286                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
1287                 }
1288                 limit = dwidth;
1289             }
1290             alpha -= dwidth*4;
1291         }
1292     }
1293 };
1294
1295
1296 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1297 struct VResizeCubic
1298 {
1299     typedef T value_type;
1300     typedef WT buf_type;
1301     typedef AT alpha_type;
1302
1303     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1304     {
1305         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1306         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1307         CastOp castOp;
1308         VecOp vecOp;
1309
1310         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1311         for( ; x < width; x++ )
1312             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
1313     }
1314 };
1315
1316
1317 template<typename T, typename WT, typename AT>
1318 struct HResizeLanczos4
1319 {
1320     typedef T value_type;
1321     typedef WT buf_type;
1322     typedef AT alpha_type;
1323
1324     void operator()(const T** src, WT** dst, int count,
1325                     const int* xofs, const AT* alpha,
1326                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1327     {
1328         for( int k = 0; k < count; k++ )
1329         {
1330             const T *S = src[k];
1331             WT *D = dst[k];
1332             int dx = 0, limit = xmin;
1333             for(;;)
1334             {
1335                 for( ; dx < limit; dx++, alpha += 8 )
1336                 {
1337                     int j, sx = xofs[dx] - cn*3;
1338                     WT v = 0;
1339                     for( j = 0; j < 8; j++ )
1340                     {
1341                         int sxj = sx + j*cn;
1342                         if( (unsigned)sxj >= (unsigned)swidth )
1343                         {
1344                             while( sxj < 0 )
1345                                 sxj += cn;
1346                             while( sxj >= swidth )
1347                                 sxj -= cn;
1348                         }
1349                         v += S[sxj]*alpha[j];
1350                     }
1351                     D[dx] = v;
1352                 }
1353                 if( limit == dwidth )
1354                     break;
1355                 for( ; dx < xmax; dx++, alpha += 8 )
1356                 {
1357                     int sx = xofs[dx];
1358                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
1359                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
1360                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
1361                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
1362                 }
1363                 limit = dwidth;
1364             }
1365             alpha -= dwidth*8;
1366         }
1367     }
1368 };
1369
1370
1371 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1372 struct VResizeLanczos4
1373 {
1374     typedef T value_type;
1375     typedef WT buf_type;
1376     typedef AT alpha_type;
1377
1378     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1379     {
1380         CastOp castOp;
1381         VecOp vecOp;
1382         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1383         #if CV_ENABLE_UNROLLED
1384         for( ; x <= width - 4; x += 4 )
1385         {
1386             WT b = beta[0];
1387             const WT* S = src[0];
1388             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1389
1390             for( k = 1; k < 8; k++ )
1391             {
1392                 b = beta[k]; S = src[k];
1393                 s0 += S[x]*b; s1 += S[x+1]*b;
1394                 s2 += S[x+2]*b; s3 += S[x+3]*b;
1395             }
1396
1397             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1398             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1399         }
1400         #endif
1401         for( ; x < width; x++ )
1402         {
1403             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1404                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1405                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
1406         }
1407     }
1408 };
1409
1410
1411 static inline int clip(int x, int a, int b)
1412 {
1413     return x >= a ? (x < b ? x : b-1) : a;
1414 }
1415
1416 static const int MAX_ESIZE=16;
1417
1418 template <typename HResize, typename VResize>
1419 class resizeGeneric_Invoker :
1420     public ParallelLoopBody
1421 {
1422 public:
1423     typedef typename HResize::value_type T;
1424     typedef typename HResize::buf_type WT;
1425     typedef typename HResize::alpha_type AT;
1426
1427     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1428         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1429         int _ksize, int _xmin, int _xmax) :
1430         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1431         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1432         ksize(_ksize), xmin(_xmin), xmax(_xmax)
1433     {
1434         CV_Assert(ksize <= MAX_ESIZE);
1435     }
1436
1437 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1438 # pragma GCC diagnostic push
1439 # pragma GCC diagnostic ignored "-Warray-bounds"
1440 #endif
1441     virtual void operator() (const Range& range) const
1442     {
1443         int dy, cn = src.channels();
1444         HResize hresize;
1445         VResize vresize;
1446
1447         int bufstep = (int)alignSize(dsize.width, 16);
1448         AutoBuffer<WT> _buffer(bufstep*ksize);
1449         const T* srows[MAX_ESIZE]={0};
1450         WT* rows[MAX_ESIZE]={0};
1451         int prev_sy[MAX_ESIZE];
1452
1453         for(int k = 0; k < ksize; k++ )
1454         {
1455             prev_sy[k] = -1;
1456             rows[k] = (WT*)_buffer + bufstep*k;
1457         }
1458
1459         const AT* beta = _beta + ksize * range.start;
1460
1461         for( dy = range.start; dy < range.end; dy++, beta += ksize )
1462         {
1463             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1464
1465             for(int k = 0; k < ksize; k++ )
1466             {
1467                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1468                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1469                 {
1470                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1471                     {
1472                         if( k1 > k )
1473                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1474                         break;
1475                     }
1476                 }
1477                 if( k1 == ksize )
1478                     k0 = std::min(k0, k); // remember the first row that needs to be computed
1479                 srows[k] = src.template ptr<T>(sy);
1480                 prev_sy[k] = sy;
1481             }
1482
1483             if( k0 < ksize )
1484                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1485                         ssize.width, dsize.width, cn, xmin, xmax );
1486             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1487         }
1488     }
1489 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1490 # pragma GCC diagnostic pop
1491 #endif
1492
1493 private:
1494     Mat src;
1495     Mat dst;
1496     const int* xofs, *yofs;
1497     const AT* alpha, *_beta;
1498     Size ssize, dsize;
1499     const int ksize, xmin, xmax;
1500
1501     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
1502 };
1503
1504 template<class HResize, class VResize>
1505 static void resizeGeneric_( const Mat& src, Mat& dst,
1506                             const int* xofs, const void* _alpha,
1507                             const int* yofs, const void* _beta,
1508                             int xmin, int xmax, int ksize )
1509 {
1510     typedef typename HResize::alpha_type AT;
1511
1512     const AT* beta = (const AT*)_beta;
1513     Size ssize = src.size(), dsize = dst.size();
1514     int cn = src.channels();
1515     ssize.width *= cn;
1516     dsize.width *= cn;
1517     xmin *= cn;
1518     xmax *= cn;
1519     // image resize is a separable operation. In case of not too strong
1520
1521     Range range(0, dsize.height);
1522     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1523         ssize, dsize, ksize, xmin, xmax);
1524     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1525 }
1526
1527 template <typename T, typename WT>
1528 struct ResizeAreaFastNoVec
1529 {
1530     ResizeAreaFastNoVec(int, int) { }
1531     ResizeAreaFastNoVec(int, int, int, int) { }
1532     int operator() (const T*, T*, int) const
1533     { return 0; }
1534 };
1535
1536 #if CV_NEON
1537
1538 class ResizeAreaFastVec_SIMD_8u
1539 {
1540 public:
1541     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1542         cn(_cn), step(_step)
1543     {
1544     }
1545
1546     int operator() (const uchar* S, uchar* D, int w) const
1547     {
1548         int dx = 0;
1549         const uchar* S0 = S, * S1 = S0 + step;
1550
1551         uint16x8_t v_2 = vdupq_n_u16(2);
1552
1553         if (cn == 1)
1554         {
1555             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
1556             {
1557                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
1558
1559                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
1560                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
1561                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
1562
1563                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
1564                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
1565                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
1566
1567                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
1568             }
1569         }
1570         else if (cn == 4)
1571         {
1572             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1573             {
1574                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
1575
1576                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
1577                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
1578                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
1579                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
1580
1581                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
1582                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
1583                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
1584                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
1585                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
1586
1587                 vst1_u8(D, vmovn_u16(v_dst));
1588             }
1589         }
1590
1591         return dx;
1592     }
1593
1594 private:
1595     int cn, step;
1596 };
1597
1598 class ResizeAreaFastVec_SIMD_16u
1599 {
1600 public:
1601     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1602         cn(_cn), step(_step)
1603     {
1604     }
1605
1606     int operator() (const ushort * S, ushort * D, int w) const
1607     {
1608         int dx = 0;
1609         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
1610
1611         uint32x4_t v_2 = vdupq_n_u32(2);
1612
1613         if (cn == 1)
1614         {
1615             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1616             {
1617                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
1618
1619                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
1620                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
1621                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
1622
1623                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
1624                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
1625                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
1626
1627                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
1628             }
1629         }
1630         else if (cn == 4)
1631         {
1632             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1633             {
1634                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
1635                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
1636                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
1637                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
1638             }
1639         }
1640
1641         return dx;
1642     }
1643
1644 private:
1645     int cn, step;
1646 };
1647
1648 #elif CV_SSE2
1649
1650 class ResizeAreaFastVec_SIMD_8u
1651 {
1652 public:
1653     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1654         cn(_cn), step(_step)
1655     {
1656         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1657     }
1658
1659     int operator() (const uchar* S, uchar* D, int w) const
1660     {
1661         if (!use_simd)
1662             return 0;
1663
1664         int dx = 0;
1665         const uchar* S0 = S;
1666         const uchar* S1 = S0 + step;
1667         __m128i zero = _mm_setzero_si128();
1668         __m128i delta2 = _mm_set1_epi16(2);
1669
1670         if (cn == 1)
1671         {
1672             __m128i masklow = _mm_set1_epi16(0x00ff);
1673             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1674             {
1675                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1676                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1677
1678                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
1679                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
1680                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
1681                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1682
1683                 _mm_storel_epi64((__m128i*)D, s0);
1684             }
1685         }
1686         else if (cn == 3)
1687             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
1688             {
1689                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1690                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1691
1692                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1693                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
1694                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1695                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
1696
1697                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
1698                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
1699                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1700                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1701                 _mm_storel_epi64((__m128i*)D, s0);
1702
1703                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
1704                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
1705                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1706                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1707                 _mm_storel_epi64((__m128i*)(D+3), s0);
1708             }
1709         else
1710         {
1711             CV_Assert(cn == 4);
1712             int v[] = { 0, 0, -1, -1 };
1713             __m128i mask = _mm_loadu_si128((const __m128i*)v);
1714
1715             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1716             {
1717                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1718                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1719
1720                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1721                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
1722                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1723                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
1724
1725                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
1726                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
1727                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1728                 __m128i res0 = _mm_srli_epi16(s0, 2);
1729
1730                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
1731                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
1732                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1733                 __m128i res1 = _mm_srli_epi16(s0, 2);
1734                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
1735                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
1736                 _mm_storel_epi64((__m128i*)(D), s0);
1737             }
1738         }
1739
1740         return dx;
1741     }
1742
1743 private:
1744     int cn;
1745     bool use_simd;
1746     int step;
1747 };
1748
1749 class ResizeAreaFastVec_SIMD_16u
1750 {
1751 public:
1752     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1753         cn(_cn), step(_step)
1754     {
1755         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1756     }
1757
1758     int operator() (const ushort* S, ushort* D, int w) const
1759     {
1760         if (!use_simd)
1761             return 0;
1762
1763         int dx = 0;
1764         const ushort* S0 = (const ushort*)S;
1765         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
1766         __m128i masklow = _mm_set1_epi32(0x0000ffff);
1767         __m128i zero = _mm_setzero_si128();
1768         __m128i delta2 = _mm_set1_epi32(2);
1769
1770 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
1771
1772         if (cn == 1)
1773         {
1774             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1775             {
1776                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1777                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1778
1779                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
1780                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
1781                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
1782                 s0 = _mm_srli_epi32(s0, 2);
1783                 s0 = _mm_packus_epi32(s0, zero);
1784
1785                 _mm_storel_epi64((__m128i*)D, s0);
1786             }
1787         }
1788         else if (cn == 3)
1789             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
1790             {
1791                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1792                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1793
1794                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
1795                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
1796                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
1797                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
1798
1799                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
1800                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
1801                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
1802                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
1803                 _mm_storel_epi64((__m128i*)D, s0);
1804             }
1805         else
1806         {
1807             CV_Assert(cn == 4);
1808             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1809             {
1810                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1811                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1812
1813                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
1814                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
1815                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
1816                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
1817
1818                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
1819                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
1820                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
1821                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
1822                 _mm_storel_epi64((__m128i*)D, s0);
1823             }
1824         }
1825
1826 #undef _mm_packus_epi32
1827
1828         return dx;
1829     }
1830
1831 private:
1832     int cn;
1833     int step;
1834     bool use_simd;
1835 };
1836
1837 #else
1838 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
1839 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
1840 #endif
1841
1842 template<typename T, typename SIMDVecOp>
1843 struct ResizeAreaFastVec
1844 {
1845     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
1846         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
1847     {
1848         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
1849     }
1850
1851     int operator() (const T* S, T* D, int w) const
1852     {
1853         if (!fast_mode)
1854             return 0;
1855
1856         const T* nextS = (const T*)((const uchar*)S + step);
1857         int dx = vecOp(S, D, w);
1858
1859         if (cn == 1)
1860             for( ; dx < w; ++dx )
1861             {
1862                 int index = dx*2;
1863                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
1864             }
1865         else if (cn == 3)
1866             for( ; dx < w; dx += 3 )
1867             {
1868                 int index = dx*2;
1869                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
1870                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
1871                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
1872             }
1873         else
1874             {
1875                 CV_Assert(cn == 4);
1876                 for( ; dx < w; dx += 4 )
1877                 {
1878                     int index = dx*2;
1879                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
1880                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
1881                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
1882                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
1883                 }
1884             }
1885
1886         return dx;
1887     }
1888
1889 private:
1890     int scale_x, scale_y;
1891     int cn;
1892     bool fast_mode;
1893     int step;
1894     SIMDVecOp vecOp;
1895 };
1896
1897 template <typename T, typename WT, typename VecOp>
1898 class resizeAreaFast_Invoker :
1899     public ParallelLoopBody
1900 {
1901 public:
1902     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
1903         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
1904         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
1905         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
1906     {
1907     }
1908
1909     virtual void operator() (const Range& range) const
1910     {
1911         Size ssize = src.size(), dsize = dst.size();
1912         int cn = src.channels();
1913         int area = scale_x*scale_y;
1914         float scale = 1.f/(area);
1915         int dwidth1 = (ssize.width/scale_x)*cn;
1916         dsize.width *= cn;
1917         ssize.width *= cn;
1918         int dy, dx, k = 0;
1919
1920         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
1921
1922         for( dy = range.start; dy < range.end; dy++ )
1923         {
1924             T* D = (T*)(dst.data + dst.step*dy);
1925             int sy0 = dy*scale_y;
1926             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
1927
1928             if( sy0 >= ssize.height )
1929             {
1930                 for( dx = 0; dx < dsize.width; dx++ )
1931                     D[dx] = 0;
1932                 continue;
1933             }
1934
1935             dx = vop(src.template ptr<T>(sy0), D, w);
1936             for( ; dx < w; dx++ )
1937             {
1938                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
1939                 WT sum = 0;
1940                 k = 0;
1941                 #if CV_ENABLE_UNROLLED
1942                 for( ; k <= area - 4; k += 4 )
1943                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
1944                 #endif
1945                 for( ; k < area; k++ )
1946                     sum += S[ofs[k]];
1947
1948                 D[dx] = saturate_cast<T>(sum * scale);
1949             }
1950
1951             for( ; dx < dsize.width; dx++ )
1952             {
1953                 WT sum = 0;
1954                 int count = 0, sx0 = xofs[dx];
1955                 if( sx0 >= ssize.width )
1956                     D[dx] = 0;
1957
1958                 for( int sy = 0; sy < scale_y; sy++ )
1959                 {
1960                     if( sy0 + sy >= ssize.height )
1961                         break;
1962                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
1963                     for( int sx = 0; sx < scale_x*cn; sx += cn )
1964                     {
1965                         if( sx0 + sx >= ssize.width )
1966                             break;
1967                         sum += S[sx];
1968                         count++;
1969                     }
1970                 }
1971
1972                 D[dx] = saturate_cast<T>((float)sum/count);
1973             }
1974         }
1975     }
1976
1977 private:
1978     Mat src;
1979     Mat dst;
1980     int scale_x, scale_y;
1981     const int *ofs, *xofs;
1982 };
1983
1984 template<typename T, typename WT, typename VecOp>
1985 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
1986                              int scale_x, int scale_y )
1987 {
1988     Range range(0, dst.rows);
1989     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
1990         scale_y, ofs, xofs);
1991     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1992 }
1993
1994 struct DecimateAlpha
1995 {
1996     int si, di;
1997     float alpha;
1998 };
1999
2000
2001 template<typename T, typename WT> class ResizeArea_Invoker :
2002     public ParallelLoopBody
2003 {
2004 public:
2005     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
2006                         const DecimateAlpha* _xtab, int _xtab_size,
2007                         const DecimateAlpha* _ytab, int _ytab_size,
2008                         const int* _tabofs )
2009     {
2010         src = &_src;
2011         dst = &_dst;
2012         xtab0 = _xtab;
2013         xtab_size0 = _xtab_size;
2014         ytab = _ytab;
2015         ytab_size = _ytab_size;
2016         tabofs = _tabofs;
2017     }
2018
2019     virtual void operator() (const Range& range) const
2020     {
2021         Size dsize = dst->size();
2022         int cn = dst->channels();
2023         dsize.width *= cn;
2024         AutoBuffer<WT> _buffer(dsize.width*2);
2025         const DecimateAlpha* xtab = xtab0;
2026         int xtab_size = xtab_size0;
2027         WT *buf = _buffer, *sum = buf + dsize.width;
2028         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
2029
2030         for( dx = 0; dx < dsize.width; dx++ )
2031             sum[dx] = (WT)0;
2032
2033         for( j = j_start; j < j_end; j++ )
2034         {
2035             WT beta = ytab[j].alpha;
2036             int dy = ytab[j].di;
2037             int sy = ytab[j].si;
2038
2039             {
2040                 const T* S = src->template ptr<T>(sy);
2041                 for( dx = 0; dx < dsize.width; dx++ )
2042                     buf[dx] = (WT)0;
2043
2044                 if( cn == 1 )
2045                     for( k = 0; k < xtab_size; k++ )
2046                     {
2047                         int dxn = xtab[k].di;
2048                         WT alpha = xtab[k].alpha;
2049                         buf[dxn] += S[xtab[k].si]*alpha;
2050                     }
2051                 else if( cn == 2 )
2052                     for( k = 0; k < xtab_size; k++ )
2053                     {
2054                         int sxn = xtab[k].si;
2055                         int dxn = xtab[k].di;
2056                         WT alpha = xtab[k].alpha;
2057                         WT t0 = buf[dxn] + S[sxn]*alpha;
2058                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2059                         buf[dxn] = t0; buf[dxn+1] = t1;
2060                     }
2061                 else if( cn == 3 )
2062                     for( k = 0; k < xtab_size; k++ )
2063                     {
2064                         int sxn = xtab[k].si;
2065                         int dxn = xtab[k].di;
2066                         WT alpha = xtab[k].alpha;
2067                         WT t0 = buf[dxn] + S[sxn]*alpha;
2068                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2069                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
2070                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
2071                     }
2072                 else if( cn == 4 )
2073                 {
2074                     for( k = 0; k < xtab_size; k++ )
2075                     {
2076                         int sxn = xtab[k].si;
2077                         int dxn = xtab[k].di;
2078                         WT alpha = xtab[k].alpha;
2079                         WT t0 = buf[dxn] + S[sxn]*alpha;
2080                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2081                         buf[dxn] = t0; buf[dxn+1] = t1;
2082                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
2083                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
2084                         buf[dxn+2] = t0; buf[dxn+3] = t1;
2085                     }
2086                 }
2087                 else
2088                 {
2089                     for( k = 0; k < xtab_size; k++ )
2090                     {
2091                         int sxn = xtab[k].si;
2092                         int dxn = xtab[k].di;
2093                         WT alpha = xtab[k].alpha;
2094                         for( int c = 0; c < cn; c++ )
2095                             buf[dxn + c] += S[sxn + c]*alpha;
2096                     }
2097                 }
2098             }
2099
2100             if( dy != prev_dy )
2101             {
2102                 T* D = dst->template ptr<T>(prev_dy);
2103
2104                 for( dx = 0; dx < dsize.width; dx++ )
2105                 {
2106                     D[dx] = saturate_cast<T>(sum[dx]);
2107                     sum[dx] = beta*buf[dx];
2108                 }
2109                 prev_dy = dy;
2110             }
2111             else
2112             {
2113                 for( dx = 0; dx < dsize.width; dx++ )
2114                     sum[dx] += beta*buf[dx];
2115             }
2116         }
2117
2118         {
2119         T* D = dst->template ptr<T>(prev_dy);
2120         for( dx = 0; dx < dsize.width; dx++ )
2121             D[dx] = saturate_cast<T>(sum[dx]);
2122         }
2123     }
2124
2125 private:
2126     const Mat* src;
2127     Mat* dst;
2128     const DecimateAlpha* xtab0;
2129     const DecimateAlpha* ytab;
2130     int xtab_size0, ytab_size;
2131     const int* tabofs;
2132 };
2133
2134
2135 template <typename T, typename WT>
2136 static void resizeArea_( const Mat& src, Mat& dst,
2137                          const DecimateAlpha* xtab, int xtab_size,
2138                          const DecimateAlpha* ytab, int ytab_size,
2139                          const int* tabofs )
2140 {
2141     parallel_for_(Range(0, dst.rows),
2142                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
2143                  dst.total()/((double)(1 << 16)));
2144 }
2145
2146
2147 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
2148                             const int* xofs, const void* alpha,
2149                             const int* yofs, const void* beta,
2150                             int xmin, int xmax, int ksize );
2151
2152 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
2153                                     const int* ofs, const int *xofs,
2154                                     int scale_x, int scale_y );
2155
2156 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
2157                                 const DecimateAlpha* xtab, int xtab_size,
2158                                 const DecimateAlpha* ytab, int ytab_size,
2159                                 const int* yofs);
2160
2161
2162 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
2163 {
2164     int k = 0;
2165     for(int dx = 0; dx < dsize; dx++ )
2166     {
2167         double fsx1 = dx * scale;
2168         double fsx2 = fsx1 + scale;
2169         double cellWidth = std::min(scale, ssize - fsx1);
2170
2171         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2172
2173         sx2 = std::min(sx2, ssize - 1);
2174         sx1 = std::min(sx1, sx2);
2175
2176         if( sx1 - fsx1 > 1e-3 )
2177         {
2178             assert( k < ssize*2 );
2179             tab[k].di = dx * cn;
2180             tab[k].si = (sx1 - 1) * cn;
2181             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
2182         }
2183
2184         for(int sx = sx1; sx < sx2; sx++ )
2185         {
2186             assert( k < ssize*2 );
2187             tab[k].di = dx * cn;
2188             tab[k].si = sx * cn;
2189             tab[k++].alpha = float(1.0 / cellWidth);
2190         }
2191
2192         if( fsx2 - sx2 > 1e-3 )
2193         {
2194             assert( k < ssize*2 );
2195             tab[k].di = dx * cn;
2196             tab[k].si = sx2 * cn;
2197             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2198         }
2199     }
2200     return k;
2201 }
2202
2203 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
2204
2205 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
2206     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2207     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2208     specBuf.allocate(specSize);\
2209     pSpec = (uchar*)specBuf;\
2210     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
2211
2212 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
2213     if (mode == (int)ippCubic) { *ok = false; return; } \
2214     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2215     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2216     specBuf.allocate(specSize);\
2217     pSpec = (uchar*)specBuf;\
2218     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
2219     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
2220     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
2221
2222 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
2223     func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
2224     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2225     specBuf.allocate(specSize);\
2226     pSpec = (uchar*)specBuf;\
2227     AutoBuffer<uchar> buf(initSize);\
2228     uchar* pInit = (uchar*)buf;\
2229     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
2230
2231 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
2232     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
2233     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
2234     else { *ok = false; return; } \
2235     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
2236     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
2237
2238 #if IPP_VERSION_X100 >= 701
2239 class IPPresizeInvoker :
2240     public ParallelLoopBody
2241 {
2242 public:
2243     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
2244         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
2245         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
2246         func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
2247     {
2248         *ok = true;
2249         IppiSize srcSize, dstSize;
2250         int type = src.type(), specSize = 0, initSize = 0;
2251         srcSize.width  = src.cols;
2252         srcSize.height = src.rows;
2253         dstSize.width  = dst.cols;
2254         dstSize.height = dst.rows;
2255
2256         switch (type)
2257         {
2258 #if 0 // disabled since it breaks tests for CascadeClassifier
2259             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
2260             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
2261             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
2262 #endif
2263             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
2264             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
2265             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
2266             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
2267             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
2268             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
2269             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
2270             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
2271             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
2272             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
2273             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
2274             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
2275             default: { *ok = false; return; } break;
2276         }
2277     }
2278
2279     ~IPPresizeInvoker()
2280     {
2281     }
2282
2283     virtual void operator() (const Range& range) const
2284     {
2285         if (*ok == false)
2286             return;
2287
2288         int cn = src.channels();
2289         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
2290         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
2291         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
2292
2293         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
2294         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
2295         int bufsize = 0, itemSize = (int)src.elemSize1();
2296
2297         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
2298         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
2299
2300         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
2301         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
2302
2303         AutoBuffer<uchar> buf(bufsize + 64);
2304         uchar* bufptr = alignPtr((uchar*)buf, 32);
2305
2306         if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
2307             *ok = false;
2308         else
2309         {
2310             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2311         }
2312     }
2313 private:
2314     const Mat & src;
2315     Mat & dst;
2316     double inv_scale_x;
2317     double inv_scale_y;
2318     void *pSpec;
2319     AutoBuffer<uchar> specBuf;
2320     int mode;
2321     ippiResizeFunc func;
2322     ippiResizeGetBufferSize getBufferSizeFunc;
2323     ippiResizeGetSrcOffset getSrcOffsetFunc;
2324     bool *ok;
2325     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
2326 };
2327
2328 #endif
2329
2330 #ifdef HAVE_OPENCL
2331
2332 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
2333                                       float * const alpha_tab, int * const ofs_tab)
2334 {
2335     int k = 0, dx = 0;
2336     for ( ; dx < dsize; dx++)
2337     {
2338         ofs_tab[dx] = k;
2339
2340         double fsx1 = dx * scale;
2341         double fsx2 = fsx1 + scale;
2342         double cellWidth = std::min(scale, ssize - fsx1);
2343
2344         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2345
2346         sx2 = std::min(sx2, ssize - 1);
2347         sx1 = std::min(sx1, sx2);
2348
2349         if (sx1 - fsx1 > 1e-3)
2350         {
2351             map_tab[k] = sx1 - 1;
2352             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
2353         }
2354
2355         for (int sx = sx1; sx < sx2; sx++)
2356         {
2357             map_tab[k] = sx;
2358             alpha_tab[k++] = float(1.0 / cellWidth);
2359         }
2360
2361         if (fsx2 - sx2 > 1e-3)
2362         {
2363             map_tab[k] = sx2;
2364             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
2365         }
2366     }
2367     ofs_tab[dx] = k;
2368 }
2369
2370 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
2371                         double fx, double fy, int interpolation)
2372 {
2373     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2374
2375     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
2376     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
2377     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fx);
2378     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
2379         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
2380
2381     // in case of scale_x && scale_y is equal to 2
2382     // INTER_AREA (fast) also is equal to INTER_LINEAR
2383     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2384         /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
2385
2386     if( !(cn <= 4 &&
2387            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
2388             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
2389         return false;
2390
2391     UMat src = _src.getUMat();
2392     _dst.create(dsize, type);
2393     UMat dst = _dst.getUMat();
2394
2395     Size ssize = src.size();
2396     ocl::Kernel k;
2397     size_t globalsize[] = { dst.cols, dst.rows };
2398
2399     ocl::Image2D srcImage;
2400
2401     // See if this could be done with a sampler.  We stick with integer
2402     // datatypes because the observed error is low.
2403     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
2404                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
2405                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
2406                        src.offset==0);
2407     if (useSampler)
2408     {
2409         int wdepth = std::max(depth, CV_32S);
2410         char buf[2][32];
2411         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
2412                         "-D convertToDT=%s -D cn=%d",
2413                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2414                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2415                         cn);
2416         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
2417
2418         if (k.empty())
2419             useSampler = false;
2420         else
2421         {
2422             // Convert the input into an OpenCL image type, using normalized channel data types
2423             // and aliasing the UMat.
2424             srcImage = ocl::Image2D(src, true, true);
2425             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
2426                    (float)inv_fx, (float)inv_fy);
2427         }
2428     }
2429
2430     if (interpolation == INTER_LINEAR && !useSampler)
2431     {
2432         char buf[2][32];
2433
2434         // integer path is slower because of CPU part, so it's disabled
2435         if (depth == CV_8U && ((void)0, 0))
2436         {
2437             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
2438             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
2439             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
2440             float fxx, fyy;
2441             int sx, sy;
2442
2443             for (int dx = 0; dx < dsize.width; dx++)
2444             {
2445                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
2446                 sx = cvFloor(fxx);
2447                 fxx -= sx;
2448
2449                 if (sx < 0)
2450                     fxx = 0, sx = 0;
2451
2452                 if (sx >= ssize.width-1)
2453                     fxx = 0, sx = ssize.width-1;
2454
2455                 xofs[dx] = sx;
2456                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
2457                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
2458             }
2459
2460             for (int dy = 0; dy < dsize.height; dy++)
2461             {
2462                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
2463                 sy = cvFloor(fyy);
2464                 fyy -= sy;
2465
2466                 yofs[dy] = sy;
2467                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
2468                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
2469             }
2470
2471             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2472             UMat coeffs;
2473             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
2474
2475             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2476                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
2477                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2478                             "-D INTER_RESIZE_COEF_BITS=%d",
2479                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2480                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2481                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2482                             cn, INTER_RESIZE_COEF_BITS));
2483             if (k.empty())
2484                 return false;
2485
2486             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2487                    ocl::KernelArg::PtrReadOnly(coeffs));
2488         }
2489         else
2490         {
2491             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2492             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2493                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
2494                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2495                             "-D INTER_RESIZE_COEF_BITS=%d",
2496                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2497                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2498                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2499                             cn, INTER_RESIZE_COEF_BITS));
2500             if (k.empty())
2501                 return false;
2502
2503             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2504                    (float)inv_fx, (float)inv_fy);
2505         }
2506     }
2507     else if (interpolation == INTER_NEAREST)
2508     {
2509         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
2510                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
2511                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
2512         if (k.empty())
2513             return false;
2514
2515         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2516                (float)inv_fx, (float)inv_fy);
2517     }
2518     else if (interpolation == INTER_AREA)
2519     {
2520         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
2521         int wtype = CV_MAKE_TYPE(wdepth, cn);
2522
2523         char cvt[2][40];
2524         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
2525                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2526                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
2527
2528         UMat alphaOcl, tabofsOcl, mapOcl;
2529         UMat dmap, smap;
2530
2531         if (is_area_fast)
2532         {
2533             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
2534             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
2535                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
2536                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
2537                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
2538                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
2539
2540             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
2541             if (k.empty())
2542                 return false;
2543         }
2544         else
2545         {
2546             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
2547             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
2548             if (k.empty())
2549                 return false;
2550
2551             int xytab_size = (ssize.width + ssize.height) << 1;
2552             int tabofs_size = dsize.height + dsize.width + 2;
2553
2554             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
2555             AutoBuffer<float> _xyalpha_tab(xytab_size);
2556             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
2557             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
2558             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
2559
2560             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
2561             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
2562
2563             // loading precomputed arrays to GPU
2564             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
2565             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
2566             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
2567         }
2568
2569         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
2570
2571         if (is_area_fast)
2572             k.args(srcarg, dstarg);
2573         else
2574             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
2575                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
2576
2577         return k.run(2, globalsize, NULL, false);
2578     }
2579
2580     return k.run(2, globalsize, 0, false);
2581 }
2582
2583 #endif
2584
2585 }
2586
2587 //////////////////////////////////////////////////////////////////////////////////////////
2588
2589 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
2590                  double inv_scale_x, double inv_scale_y, int interpolation )
2591 {
2592     static ResizeFunc linear_tab[] =
2593     {
2594         resizeGeneric_<
2595             HResizeLinear<uchar, int, short,
2596                 INTER_RESIZE_COEF_SCALE,
2597                 HResizeLinearVec_8u32s>,
2598             VResizeLinear<uchar, int, short,
2599                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2600                 VResizeLinearVec_32s8u> >,
2601         0,
2602         resizeGeneric_<
2603             HResizeLinear<ushort, float, float, 1,
2604                 HResizeLinearVec_16u32f>,
2605             VResizeLinear<ushort, float, float, Cast<float, ushort>,
2606                 VResizeLinearVec_32f16u> >,
2607         resizeGeneric_<
2608             HResizeLinear<short, float, float, 1,
2609                 HResizeLinearVec_16s32f>,
2610             VResizeLinear<short, float, float, Cast<float, short>,
2611                 VResizeLinearVec_32f16s> >,
2612         0,
2613         resizeGeneric_<
2614             HResizeLinear<float, float, float, 1,
2615                 HResizeLinearVec_32f>,
2616             VResizeLinear<float, float, float, Cast<float, float>,
2617                 VResizeLinearVec_32f> >,
2618         resizeGeneric_<
2619             HResizeLinear<double, double, float, 1,
2620                 HResizeNoVec>,
2621             VResizeLinear<double, double, float, Cast<double, double>,
2622                 VResizeNoVec> >,
2623         0
2624     };
2625
2626     static ResizeFunc cubic_tab[] =
2627     {
2628         resizeGeneric_<
2629             HResizeCubic<uchar, int, short>,
2630             VResizeCubic<uchar, int, short,
2631                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2632                 VResizeCubicVec_32s8u> >,
2633         0,
2634         resizeGeneric_<
2635             HResizeCubic<ushort, float, float>,
2636             VResizeCubic<ushort, float, float, Cast<float, ushort>,
2637             VResizeCubicVec_32f16u> >,
2638         resizeGeneric_<
2639             HResizeCubic<short, float, float>,
2640             VResizeCubic<short, float, float, Cast<float, short>,
2641             VResizeCubicVec_32f16s> >,
2642         0,
2643         resizeGeneric_<
2644             HResizeCubic<float, float, float>,
2645             VResizeCubic<float, float, float, Cast<float, float>,
2646             VResizeCubicVec_32f> >,
2647         resizeGeneric_<
2648             HResizeCubic<double, double, float>,
2649             VResizeCubic<double, double, float, Cast<double, double>,
2650             VResizeNoVec> >,
2651         0
2652     };
2653
2654     static ResizeFunc lanczos4_tab[] =
2655     {
2656         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
2657             VResizeLanczos4<uchar, int, short,
2658             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2659             VResizeNoVec> >,
2660         0,
2661         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
2662             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
2663             VResizeNoVec> >,
2664         resizeGeneric_<HResizeLanczos4<short, float, float>,
2665             VResizeLanczos4<short, float, float, Cast<float, short>,
2666             VResizeNoVec> >,
2667         0,
2668         resizeGeneric_<HResizeLanczos4<float, float, float>,
2669             VResizeLanczos4<float, float, float, Cast<float, float>,
2670             VResizeNoVec> >,
2671         resizeGeneric_<HResizeLanczos4<double, double, float>,
2672             VResizeLanczos4<double, double, float, Cast<double, double>,
2673             VResizeNoVec> >,
2674         0
2675     };
2676
2677     static ResizeAreaFastFunc areafast_tab[] =
2678     {
2679         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
2680         0,
2681         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
2682         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
2683         0,
2684         resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
2685         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
2686         0
2687     };
2688
2689     static ResizeAreaFunc area_tab[] =
2690     {
2691         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
2692         resizeArea_<short, float>, 0, resizeArea_<float, float>,
2693         resizeArea_<double, double>, 0
2694     };
2695
2696     Size ssize = _src.size();
2697
2698     CV_Assert( ssize.area() > 0 );
2699     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
2700     if( dsize.area() == 0 )
2701     {
2702         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
2703                      saturate_cast<int>(ssize.height*inv_scale_y));
2704         CV_Assert( dsize.area() > 0 );
2705     }
2706     else
2707     {
2708         inv_scale_x = (double)dsize.width/ssize.width;
2709         inv_scale_y = (double)dsize.height/ssize.height;
2710     }
2711
2712     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
2713                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
2714
2715     Mat src = _src.getMat();
2716     _dst.create(dsize, src.type());
2717     Mat dst = _dst.getMat();
2718
2719 #ifdef HAVE_TEGRA_OPTIMIZATION
2720     if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
2721         return;
2722 #endif
2723
2724     int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2725     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
2726     int k, sx, sy, dx, dy;
2727
2728     int iscale_x = saturate_cast<int>(scale_x);
2729     int iscale_y = saturate_cast<int>(scale_y);
2730
2731     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
2732             std::abs(scale_y - iscale_y) < DBL_EPSILON;
2733
2734 #if IPP_VERSION_X100 >= 701
2735     CV_IPP_CHECK()
2736     {
2737 #define IPP_RESIZE_EPS 1e-10
2738
2739         double ex = fabs((double)dsize.width / src.cols  - inv_scale_x) / inv_scale_x;
2740         double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y;
2741
2742         if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
2743              (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2744              !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U))
2745         {
2746             int mode = -1;
2747             if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
2748                 mode = ippLinear;
2749             else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
2750                 mode = ippCubic;
2751
2752             if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) &&
2753                 (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
2754                 (depth == CV_64F && mode == ippLinear)))
2755             {
2756                 bool ok = true;
2757                 Range range(0, src.rows);
2758                 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
2759                 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2760                 if( ok )
2761                 {
2762                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2763                     return;
2764                 }
2765                 setIppErrorStatus();
2766             }
2767         }
2768 #undef IPP_RESIZE_EPS
2769     }
2770 #endif
2771
2772     if( interpolation == INTER_NEAREST )
2773     {
2774         resizeNN( src, dst, inv_scale_x, inv_scale_y );
2775         return;
2776     }
2777
2778     {
2779         // in case of scale_x && scale_y is equal to 2
2780         // INTER_AREA (fast) also is equal to INTER_LINEAR
2781         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2782             interpolation = INTER_AREA;
2783
2784         // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
2785         // In other cases it is emulated using some variant of bilinear interpolation
2786         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
2787         {
2788             if( is_area_fast )
2789             {
2790                 int area = iscale_x*iscale_y;
2791                 size_t srcstep = src.step / src.elemSize1();
2792                 AutoBuffer<int> _ofs(area + dsize.width*cn);
2793                 int* ofs = _ofs;
2794                 int* xofs = ofs + area;
2795                 ResizeAreaFastFunc func = areafast_tab[depth];
2796                 CV_Assert( func != 0 );
2797
2798                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
2799                     for( sx = 0; sx < iscale_x; sx++ )
2800                         ofs[k++] = (int)(sy*srcstep + sx*cn);
2801
2802                 for( dx = 0; dx < dsize.width; dx++ )
2803                 {
2804                     int j = dx * cn;
2805                     sx = iscale_x * j;
2806                     for( k = 0; k < cn; k++ )
2807                         xofs[j + k] = sx + k;
2808                 }
2809
2810                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
2811                 return;
2812             }
2813
2814             ResizeAreaFunc func = area_tab[depth];
2815             CV_Assert( func != 0 && cn <= 4 );
2816
2817             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
2818             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
2819
2820             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
2821             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
2822
2823             AutoBuffer<int> _tabofs(dsize.height + 1);
2824             int* tabofs = _tabofs;
2825             for( k = 0, dy = 0; k < ytab_size; k++ )
2826             {
2827                 if( k == 0 || ytab[k].di != ytab[k-1].di )
2828                 {
2829                     assert( ytab[k].di == dy );
2830                     tabofs[dy++] = k;
2831                 }
2832             }
2833             tabofs[dy] = ytab_size;
2834
2835             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
2836             return;
2837         }
2838     }
2839
2840     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
2841     bool area_mode = interpolation == INTER_AREA;
2842     bool fixpt = depth == CV_8U;
2843     float fx, fy;
2844     ResizeFunc func=0;
2845     int ksize=0, ksize2;
2846     if( interpolation == INTER_CUBIC )
2847         ksize = 4, func = cubic_tab[depth];
2848     else if( interpolation == INTER_LANCZOS4 )
2849         ksize = 8, func = lanczos4_tab[depth];
2850     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
2851         ksize = 2, func = linear_tab[depth];
2852     else
2853         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
2854     ksize2 = ksize/2;
2855
2856     CV_Assert( func != 0 );
2857
2858     AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
2859     int* xofs = (int*)(uchar*)_buffer;
2860     int* yofs = xofs + width;
2861     float* alpha = (float*)(yofs + dsize.height);
2862     short* ialpha = (short*)alpha;
2863     float* beta = alpha + width*ksize;
2864     short* ibeta = ialpha + width*ksize;
2865     float cbuf[MAX_ESIZE];
2866
2867     for( dx = 0; dx < dsize.width; dx++ )
2868     {
2869         if( !area_mode )
2870         {
2871             fx = (float)((dx+0.5)*scale_x - 0.5);
2872             sx = cvFloor(fx);
2873             fx -= sx;
2874         }
2875         else
2876         {
2877             sx = cvFloor(dx*scale_x);
2878             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
2879             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
2880         }
2881
2882         if( sx < ksize2-1 )
2883         {
2884             xmin = dx+1;
2885             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
2886                 fx = 0, sx = 0;
2887         }
2888
2889         if( sx + ksize2 >= ssize.width )
2890         {
2891             xmax = std::min( xmax, dx );
2892             if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
2893                 fx = 0, sx = ssize.width-1;
2894         }
2895
2896         for( k = 0, sx *= cn; k < cn; k++ )
2897             xofs[dx*cn + k] = sx + k;
2898
2899         if( interpolation == INTER_CUBIC )
2900             interpolateCubic( fx, cbuf );
2901         else if( interpolation == INTER_LANCZOS4 )
2902             interpolateLanczos4( fx, cbuf );
2903         else
2904         {
2905             cbuf[0] = 1.f - fx;
2906             cbuf[1] = fx;
2907         }
2908         if( fixpt )
2909         {
2910             for( k = 0; k < ksize; k++ )
2911                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2912             for( ; k < cn*ksize; k++ )
2913                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
2914         }
2915         else
2916         {
2917             for( k = 0; k < ksize; k++ )
2918                 alpha[dx*cn*ksize + k] = cbuf[k];
2919             for( ; k < cn*ksize; k++ )
2920                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
2921         }
2922     }
2923
2924     for( dy = 0; dy < dsize.height; dy++ )
2925     {
2926         if( !area_mode )
2927         {
2928             fy = (float)((dy+0.5)*scale_y - 0.5);
2929             sy = cvFloor(fy);
2930             fy -= sy;
2931         }
2932         else
2933         {
2934             sy = cvFloor(dy*scale_y);
2935             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
2936             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
2937         }
2938
2939         yofs[dy] = sy;
2940         if( interpolation == INTER_CUBIC )
2941             interpolateCubic( fy, cbuf );
2942         else if( interpolation == INTER_LANCZOS4 )
2943             interpolateLanczos4( fy, cbuf );
2944         else
2945         {
2946             cbuf[0] = 1.f - fy;
2947             cbuf[1] = fy;
2948         }
2949
2950         if( fixpt )
2951         {
2952             for( k = 0; k < ksize; k++ )
2953                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2954         }
2955         else
2956         {
2957             for( k = 0; k < ksize; k++ )
2958                 beta[dy*ksize + k] = cbuf[k];
2959         }
2960     }
2961
2962     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
2963           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
2964 }
2965
2966
2967 /****************************************************************************************\
2968 *                       General warping (affine, perspective, remap)                     *
2969 \****************************************************************************************/
2970
2971 namespace cv
2972 {
2973
2974 template<typename T>
2975 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
2976                           int borderType, const Scalar& _borderValue )
2977 {
2978     Size ssize = _src.size(), dsize = _dst.size();
2979     int cn = _src.channels();
2980     const T* S0 = _src.ptr<T>();
2981     size_t sstep = _src.step/sizeof(S0[0]);
2982     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2983         saturate_cast<T>(_borderValue[1]),
2984         saturate_cast<T>(_borderValue[2]),
2985         saturate_cast<T>(_borderValue[3]));
2986     int dx, dy;
2987
2988     unsigned width1 = ssize.width, height1 = ssize.height;
2989
2990     if( _dst.isContinuous() && _xy.isContinuous() )
2991     {
2992         dsize.width *= dsize.height;
2993         dsize.height = 1;
2994     }
2995
2996     for( dy = 0; dy < dsize.height; dy++ )
2997     {
2998         T* D = _dst.ptr<T>(dy);
2999         const short* XY = _xy.ptr<short>(dy);
3000
3001         if( cn == 1 )
3002         {
3003             for( dx = 0; dx < dsize.width; dx++ )
3004             {
3005                 int sx = XY[dx*2], sy = XY[dx*2+1];
3006                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3007                     D[dx] = S0[sy*sstep + sx];
3008                 else
3009                 {
3010                     if( borderType == BORDER_REPLICATE )
3011                     {
3012                         sx = clip(sx, 0, ssize.width);
3013                         sy = clip(sy, 0, ssize.height);
3014                         D[dx] = S0[sy*sstep + sx];
3015                     }
3016                     else if( borderType == BORDER_CONSTANT )
3017                         D[dx] = cval[0];
3018                     else if( borderType != BORDER_TRANSPARENT )
3019                     {
3020                         sx = borderInterpolate(sx, ssize.width, borderType);
3021                         sy = borderInterpolate(sy, ssize.height, borderType);
3022                         D[dx] = S0[sy*sstep + sx];
3023                     }
3024                 }
3025             }
3026         }
3027         else
3028         {
3029             for( dx = 0; dx < dsize.width; dx++, D += cn )
3030             {
3031                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3032                 const T *S;
3033                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3034                 {
3035                     if( cn == 3 )
3036                     {
3037                         S = S0 + sy*sstep + sx*3;
3038                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
3039                     }
3040                     else if( cn == 4 )
3041                     {
3042                         S = S0 + sy*sstep + sx*4;
3043                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
3044                     }
3045                     else
3046                     {
3047                         S = S0 + sy*sstep + sx*cn;
3048                         for( k = 0; k < cn; k++ )
3049                             D[k] = S[k];
3050                     }
3051                 }
3052                 else if( borderType != BORDER_TRANSPARENT )
3053                 {
3054                     if( borderType == BORDER_REPLICATE )
3055                     {
3056                         sx = clip(sx, 0, ssize.width);
3057                         sy = clip(sy, 0, ssize.height);
3058                         S = S0 + sy*sstep + sx*cn;
3059                     }
3060                     else if( borderType == BORDER_CONSTANT )
3061                         S = &cval[0];
3062                     else
3063                     {
3064                         sx = borderInterpolate(sx, ssize.width, borderType);
3065                         sy = borderInterpolate(sy, ssize.height, borderType);
3066                         S = S0 + sy*sstep + sx*cn;
3067                     }
3068                     for( k = 0; k < cn; k++ )
3069                         D[k] = S[k];
3070                 }
3071             }
3072         }
3073     }
3074 }
3075
3076
3077 struct RemapNoVec
3078 {
3079     int operator()( const Mat&, void*, const short*, const ushort*,
3080                     const void*, int ) const { return 0; }
3081 };
3082
3083 #if CV_SSE2
3084
3085 struct RemapVec_8u
3086 {
3087     int operator()( const Mat& _src, void* _dst, const short* XY,
3088                     const ushort* FXY, const void* _wtab, int width ) const
3089     {
3090         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
3091
3092         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
3093             sstep > 0x8000 )
3094             return 0;
3095
3096         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
3097         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
3098         uchar* D = (uchar*)_dst;
3099         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
3100         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
3101         __m128i z = _mm_setzero_si128();
3102         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
3103
3104         if( cn == 1 )
3105         {
3106             for( ; x <= width - 8; x += 8 )
3107             {
3108                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3109                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
3110                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
3111                 unsigned i0, i1;
3112
3113                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3114                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
3115                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3116                 _mm_store_si128( (__m128i*)iofs1, xy1 );
3117
3118                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
3119                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
3120                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3121                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
3122                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
3123                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3124                 v0 = _mm_unpacklo_epi8(v0, z);
3125                 v1 = _mm_unpacklo_epi8(v1, z);
3126
3127                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
3128                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
3129                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
3130                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
3131                 b0 = _mm_unpacklo_epi64(a0, a1);
3132                 b1 = _mm_unpackhi_epi64(a0, a1);
3133                 v0 = _mm_madd_epi16(v0, b0);
3134                 v1 = _mm_madd_epi16(v1, b1);
3135                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
3136
3137                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
3138                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
3139                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3140                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
3141                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
3142                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3143                 v2 = _mm_unpacklo_epi8(v2, z);
3144                 v3 = _mm_unpacklo_epi8(v3, z);
3145
3146                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
3147                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
3148                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
3149                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
3150                 b0 = _mm_unpacklo_epi64(a0, a1);
3151                 b1 = _mm_unpackhi_epi64(a0, a1);
3152                 v2 = _mm_madd_epi16(v2, b0);
3153                 v3 = _mm_madd_epi16(v3, b1);
3154                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
3155
3156                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
3157                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
3158                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
3159                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
3160             }
3161         }
3162         else if( cn == 3 )
3163         {
3164             for( ; x <= width - 5; x += 4, D += 12 )
3165             {
3166                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3167                 __m128i u0, v0, u1, v1;
3168
3169                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3170                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3171                 const __m128i *w0, *w1;
3172                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3173                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3174
3175                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3176                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
3177                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3178                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
3179                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3180                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
3181                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3182                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
3183                 u0 = _mm_unpacklo_epi8(u0, z);
3184                 v0 = _mm_unpacklo_epi8(v0, z);
3185                 u1 = _mm_unpacklo_epi8(u1, z);
3186                 v1 = _mm_unpacklo_epi8(v1, z);
3187                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3188                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3189                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3190                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3191                 u0 = _mm_slli_si128(u0, 4);
3192                 u0 = _mm_packs_epi32(u0, u1);
3193                 u0 = _mm_packus_epi16(u0, u0);
3194                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
3195
3196                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3197                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3198
3199                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3200                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
3201                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3202                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
3203                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3204                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
3205                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3206                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
3207                 u0 = _mm_unpacklo_epi8(u0, z);
3208                 v0 = _mm_unpacklo_epi8(v0, z);
3209                 u1 = _mm_unpacklo_epi8(u1, z);
3210                 v1 = _mm_unpacklo_epi8(v1, z);
3211                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3212                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3213                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3214                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3215                 u0 = _mm_slli_si128(u0, 4);
3216                 u0 = _mm_packs_epi32(u0, u1);
3217                 u0 = _mm_packus_epi16(u0, u0);
3218                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
3219             }
3220         }
3221         else if( cn == 4 )
3222         {
3223             for( ; x <= width - 4; x += 4, D += 16 )
3224             {
3225                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3226                 __m128i u0, v0, u1, v1;
3227
3228                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3229                 _mm_store_si128( (__m128i*)iofs0, xy0 );
3230                 const __m128i *w0, *w1;
3231                 w0 = (const __m128i*)(wtab + FXY[x]*16);
3232                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3233
3234                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3235                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
3236                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3237                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
3238                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3239                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
3240                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3241                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
3242                 u0 = _mm_unpacklo_epi8(u0, z);
3243                 v0 = _mm_unpacklo_epi8(v0, z);
3244                 u1 = _mm_unpacklo_epi8(u1, z);
3245                 v1 = _mm_unpacklo_epi8(v1, z);
3246                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3247                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3248                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3249                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3250                 u0 = _mm_packs_epi32(u0, u1);
3251                 u0 = _mm_packus_epi16(u0, u0);
3252                 _mm_storel_epi64((__m128i*)D, u0);
3253
3254                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3255                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3256
3257                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3258                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
3259                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3260                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
3261                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3262                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
3263                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3264                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
3265                 u0 = _mm_unpacklo_epi8(u0, z);
3266                 v0 = _mm_unpacklo_epi8(v0, z);
3267                 u1 = _mm_unpacklo_epi8(u1, z);
3268                 v1 = _mm_unpacklo_epi8(v1, z);
3269                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3270                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3271                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3272                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3273                 u0 = _mm_packs_epi32(u0, u1);
3274                 u0 = _mm_packus_epi16(u0, u0);
3275                 _mm_storel_epi64((__m128i*)(D + 8), u0);
3276             }
3277         }
3278
3279         return x;
3280     }
3281 };
3282
3283 #else
3284
3285 typedef RemapNoVec RemapVec_8u;
3286
3287 #endif
3288
3289
3290 template<class CastOp, class VecOp, typename AT>
3291 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3292                            const Mat& _fxy, const void* _wtab,
3293                            int borderType, const Scalar& _borderValue )
3294 {
3295     typedef typename CastOp::rtype T;
3296     typedef typename CastOp::type1 WT;
3297     Size ssize = _src.size(), dsize = _dst.size();
3298     int cn = _src.channels();
3299     const AT* wtab = (const AT*)_wtab;
3300     const T* S0 = _src.ptr<T>();
3301     size_t sstep = _src.step/sizeof(S0[0]);
3302     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3303         saturate_cast<T>(_borderValue[1]),
3304         saturate_cast<T>(_borderValue[2]),
3305         saturate_cast<T>(_borderValue[3]));
3306     int dx, dy;
3307     CastOp castOp;
3308     VecOp vecOp;
3309
3310     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3311     CV_Assert( cn <= 4 && ssize.area() > 0 );
3312 #if CV_SSE2
3313     if( _src.type() == CV_8UC3 )
3314         width1 = std::max(ssize.width-2, 0);
3315 #endif
3316
3317     for( dy = 0; dy < dsize.height; dy++ )
3318     {
3319         T* D = _dst.ptr<T>(dy);
3320         const short* XY = _xy.ptr<short>(dy);
3321         const ushort* FXY = _fxy.ptr<ushort>(dy);
3322         int X0 = 0;
3323         bool prevInlier = false;
3324
3325         for( dx = 0; dx <= dsize.width; dx++ )
3326         {
3327             bool curInlier = dx < dsize.width ?
3328                 (unsigned)XY[dx*2] < width1 &&
3329                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3330             if( curInlier == prevInlier )
3331                 continue;
3332
3333             int X1 = dx;
3334             dx = X0;
3335             X0 = X1;
3336             prevInlier = curInlier;
3337
3338             if( !curInlier )
3339             {
3340                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
3341                 D += len*cn;
3342                 dx += len;
3343
3344                 if( cn == 1 )
3345                 {
3346                     for( ; dx < X1; dx++, D++ )
3347                     {
3348                         int sx = XY[dx*2], sy = XY[dx*2+1];
3349                         const AT* w = wtab + FXY[dx]*4;
3350                         const T* S = S0 + sy*sstep + sx;
3351                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
3352                     }
3353                 }
3354                 else if( cn == 2 )
3355                     for( ; dx < X1; dx++, D += 2 )
3356                     {
3357                         int sx = XY[dx*2], sy = XY[dx*2+1];
3358                         const AT* w = wtab + FXY[dx]*4;
3359                         const T* S = S0 + sy*sstep + sx*2;
3360                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3361                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3362                         D[0] = castOp(t0); D[1] = castOp(t1);
3363                     }
3364                 else if( cn == 3 )
3365                     for( ; dx < X1; dx++, D += 3 )
3366                     {
3367                         int sx = XY[dx*2], sy = XY[dx*2+1];
3368                         const AT* w = wtab + FXY[dx]*4;
3369                         const T* S = S0 + sy*sstep + sx*3;
3370                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3371                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3372                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3373                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
3374                     }
3375                 else
3376                     for( ; dx < X1; dx++, D += 4 )
3377                     {
3378                         int sx = XY[dx*2], sy = XY[dx*2+1];
3379                         const AT* w = wtab + FXY[dx]*4;
3380                         const T* S = S0 + sy*sstep + sx*4;
3381                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3382                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3383                         D[0] = castOp(t0); D[1] = castOp(t1);
3384                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3385                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3386                         D[2] = castOp(t0); D[3] = castOp(t1);
3387                     }
3388             }
3389             else
3390             {
3391                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
3392                 {
3393                     D += (X1 - dx)*cn;
3394                     dx = X1;
3395                     continue;
3396                 }
3397
3398                 if( cn == 1 )
3399                     for( ; dx < X1; dx++, D++ )
3400                     {
3401                         int sx = XY[dx*2], sy = XY[dx*2+1];
3402                         if( borderType == BORDER_CONSTANT &&
3403                             (sx >= ssize.width || sx+1 < 0 ||
3404                              sy >= ssize.height || sy+1 < 0) )
3405                         {
3406                             D[0] = cval[0];
3407                         }
3408                         else
3409                         {
3410                             int sx0, sx1, sy0, sy1;
3411                             T v0, v1, v2, v3;
3412                             const AT* w = wtab + FXY[dx]*4;
3413                             if( borderType == BORDER_REPLICATE )
3414                             {
3415                                 sx0 = clip(sx, 0, ssize.width);
3416                                 sx1 = clip(sx+1, 0, ssize.width);
3417                                 sy0 = clip(sy, 0, ssize.height);
3418                                 sy1 = clip(sy+1, 0, ssize.height);
3419                                 v0 = S0[sy0*sstep + sx0];
3420                                 v1 = S0[sy0*sstep + sx1];
3421                                 v2 = S0[sy1*sstep + sx0];
3422                                 v3 = S0[sy1*sstep + sx1];
3423                             }
3424                             else
3425                             {
3426                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
3427                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3428                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
3429                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3430                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3431                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3432                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
3433                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
3434                             }
3435                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
3436                         }
3437                     }
3438                 else
3439                     for( ; dx < X1; dx++, D += cn )
3440                     {
3441                         int sx = XY[dx*2], sy = XY[dx*2+1], k;
3442                         if( borderType == BORDER_CONSTANT &&
3443                             (sx >= ssize.width || sx+1 < 0 ||
3444                              sy >= ssize.height || sy+1 < 0) )
3445                         {
3446                             for( k = 0; k < cn; k++ )
3447                                 D[k] = cval[k];
3448                         }
3449                         else
3450                         {
3451                             int sx0, sx1, sy0, sy1;
3452                             const T *v0, *v1, *v2, *v3;
3453                             const AT* w = wtab + FXY[dx]*4;
3454                             if( borderType == BORDER_REPLICATE )
3455                             {
3456                                 sx0 = clip(sx, 0, ssize.width);
3457                                 sx1 = clip(sx+1, 0, ssize.width);
3458                                 sy0 = clip(sy, 0, ssize.height);
3459                                 sy1 = clip(sy+1, 0, ssize.height);
3460                                 v0 = S0 + sy0*sstep + sx0*cn;
3461                                 v1 = S0 + sy0*sstep + sx1*cn;
3462                                 v2 = S0 + sy1*sstep + sx0*cn;
3463                                 v3 = S0 + sy1*sstep + sx1*cn;
3464                             }
3465                             else if( borderType == BORDER_TRANSPARENT &&
3466                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
3467                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
3468                                 continue;
3469                             else
3470                             {
3471                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
3472                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3473                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
3474                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3475                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
3476                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
3477                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
3478                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
3479                             }
3480                             for( k = 0; k < cn; k++ )
3481                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
3482                         }
3483                     }
3484             }
3485         }
3486     }
3487 }
3488
3489
3490 template<class CastOp, typename AT, int ONE>
3491 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
3492                           const Mat& _fxy, const void* _wtab,
3493                           int borderType, const Scalar& _borderValue )
3494 {
3495     typedef typename CastOp::rtype T;
3496     typedef typename CastOp::type1 WT;
3497     Size ssize = _src.size(), dsize = _dst.size();
3498     int cn = _src.channels();
3499     const AT* wtab = (const AT*)_wtab;
3500     const T* S0 = _src.ptr<T>();
3501     size_t sstep = _src.step/sizeof(S0[0]);
3502     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3503         saturate_cast<T>(_borderValue[1]),
3504         saturate_cast<T>(_borderValue[2]),
3505         saturate_cast<T>(_borderValue[3]));
3506     int dx, dy;
3507     CastOp castOp;
3508     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
3509
3510     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
3511
3512     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3513     {
3514         dsize.width *= dsize.height;
3515         dsize.height = 1;
3516     }
3517
3518     for( dy = 0; dy < dsize.height; dy++ )
3519     {
3520         T* D = _dst.ptr<T>(dy);
3521         const short* XY = _xy.ptr<short>(dy);
3522         const ushort* FXY = _fxy.ptr<ushort>(dy);
3523
3524         for( dx = 0; dx < dsize.width; dx++, D += cn )
3525         {
3526             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
3527             const AT* w = wtab + FXY[dx]*16;
3528             int i, k;
3529             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3530             {
3531                 const T* S = S0 + sy*sstep + sx*cn;
3532                 for( k = 0; k < cn; k++ )
3533                 {
3534                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
3535                     S += sstep;
3536                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
3537                     S += sstep;
3538                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
3539                     S += sstep;
3540                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
3541                     S += 1 - sstep*3;
3542                     D[k] = castOp(sum);
3543                 }
3544             }
3545             else
3546             {
3547                 int x[4], y[4];
3548                 if( borderType == BORDER_TRANSPARENT &&
3549                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
3550                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
3551                     continue;
3552
3553                 if( borderType1 == BORDER_CONSTANT &&
3554                     (sx >= ssize.width || sx+4 <= 0 ||
3555                     sy >= ssize.height || sy+4 <= 0))
3556                 {
3557                     for( k = 0; k < cn; k++ )
3558                         D[k] = cval[k];
3559                     continue;
3560                 }
3561
3562                 for( i = 0; i < 4; i++ )
3563                 {
3564                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3565                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
3566                 }
3567
3568                 for( k = 0; k < cn; k++, S0++, w -= 16 )
3569                 {
3570                     WT cv = cval[k], sum = cv*ONE;
3571                     for( i = 0; i < 4; i++, w += 4 )
3572                     {
3573                         int yi = y[i];
3574                         const T* S = S0 + yi*sstep;
3575                         if( yi < 0 )
3576                             continue;
3577                         if( x[0] >= 0 )
3578                             sum += (S[x[0]] - cv)*w[0];
3579                         if( x[1] >= 0 )
3580                             sum += (S[x[1]] - cv)*w[1];
3581                         if( x[2] >= 0 )
3582                             sum += (S[x[2]] - cv)*w[2];
3583                         if( x[3] >= 0 )
3584                             sum += (S[x[3]] - cv)*w[3];
3585                     }
3586                     D[k] = castOp(sum);
3587                 }
3588                 S0 -= cn;
3589             }
3590         }
3591     }
3592 }
3593
3594
3595 template<class CastOp, typename AT, int ONE>
3596 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
3597                            const Mat& _fxy, const void* _wtab,
3598                            int borderType, const Scalar& _borderValue )
3599 {
3600     typedef typename CastOp::rtype T;
3601     typedef typename CastOp::type1 WT;
3602     Size ssize = _src.size(), dsize = _dst.size();
3603     int cn = _src.channels();
3604     const AT* wtab = (const AT*)_wtab;
3605     const T* S0 = _src.ptr<T>();
3606     size_t sstep = _src.step/sizeof(S0[0]);
3607     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3608         saturate_cast<T>(_borderValue[1]),
3609         saturate_cast<T>(_borderValue[2]),
3610         saturate_cast<T>(_borderValue[3]));
3611     int dx, dy;
3612     CastOp castOp;
3613     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
3614
3615     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
3616
3617     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3618     {
3619         dsize.width *= dsize.height;
3620         dsize.height = 1;
3621     }
3622
3623     for( dy = 0; dy < dsize.height; dy++ )
3624     {
3625         T* D = _dst.ptr<T>(dy);
3626         const short* XY = _xy.ptr<short>(dy);
3627         const ushort* FXY = _fxy.ptr<ushort>(dy);
3628
3629         for( dx = 0; dx < dsize.width; dx++, D += cn )
3630         {
3631             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
3632             const AT* w = wtab + FXY[dx]*64;
3633             const T* S = S0 + sy*sstep + sx*cn;
3634             int i, k;
3635             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3636             {
3637                 for( k = 0; k < cn; k++ )
3638                 {
3639                     WT sum = 0;
3640                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
3641                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
3642                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
3643                     w -= 64;
3644                     S -= sstep*8 - 1;
3645                     D[k] = castOp(sum);
3646                 }
3647             }
3648             else
3649             {
3650                 int x[8], y[8];
3651                 if( borderType == BORDER_TRANSPARENT &&
3652                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
3653                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
3654                     continue;
3655
3656                 if( borderType1 == BORDER_CONSTANT &&
3657                     (sx >= ssize.width || sx+8 <= 0 ||
3658                     sy >= ssize.height || sy+8 <= 0))
3659                 {
3660                     for( k = 0; k < cn; k++ )
3661                         D[k] = cval[k];
3662                     continue;
3663                 }
3664
3665                 for( i = 0; i < 8; i++ )
3666                 {
3667                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3668                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
3669                 }
3670
3671                 for( k = 0; k < cn; k++, S0++, w -= 64 )
3672                 {
3673                     WT cv = cval[k], sum = cv*ONE;
3674                     for( i = 0; i < 8; i++, w += 8 )
3675                     {
3676                         int yi = y[i];
3677                         const T* S1 = S0 + yi*sstep;
3678                         if( yi < 0 )
3679                             continue;
3680                         if( x[0] >= 0 )
3681                             sum += (S1[x[0]] - cv)*w[0];
3682                         if( x[1] >= 0 )
3683                             sum += (S1[x[1]] - cv)*w[1];
3684                         if( x[2] >= 0 )
3685                             sum += (S1[x[2]] - cv)*w[2];
3686                         if( x[3] >= 0 )
3687                             sum += (S1[x[3]] - cv)*w[3];
3688                         if( x[4] >= 0 )
3689                             sum += (S1[x[4]] - cv)*w[4];
3690                         if( x[5] >= 0 )
3691                             sum += (S1[x[5]] - cv)*w[5];
3692                         if( x[6] >= 0 )
3693                             sum += (S1[x[6]] - cv)*w[6];
3694                         if( x[7] >= 0 )
3695                             sum += (S1[x[7]] - cv)*w[7];
3696                     }
3697                     D[k] = castOp(sum);
3698                 }
3699                 S0 -= cn;
3700             }
3701         }
3702     }
3703 }
3704
3705
3706 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3707                             int borderType, const Scalar& _borderValue );
3708
3709 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3710                           const Mat& _fxy, const void* _wtab,
3711                           int borderType, const Scalar& _borderValue);
3712
3713 class RemapInvoker :
3714     public ParallelLoopBody
3715 {
3716 public:
3717     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
3718                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
3719                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
3720         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
3721         borderType(_borderType), borderValue(_borderValue),
3722         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
3723     {
3724     }
3725
3726     virtual void operator() (const Range& range) const
3727     {
3728         int x, y, x1, y1;
3729         const int buf_size = 1 << 14;
3730         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
3731         int bcols0 = std::min(buf_size/brows0, dst->cols);
3732         brows0 = std::min(buf_size/bcols0, dst->rows);
3733     #if CV_SSE2
3734         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
3735     #endif
3736
3737         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
3738         if( !nnfunc )
3739             _bufa.create(brows0, bcols0, CV_16UC1);
3740
3741         for( y = range.start; y < range.end; y += brows0 )
3742         {
3743             for( x = 0; x < dst->cols; x += bcols0 )
3744             {
3745                 int brows = std::min(brows0, range.end - y);
3746                 int bcols = std::min(bcols0, dst->cols - x);
3747                 Mat dpart(*dst, Rect(x, y, bcols, brows));
3748                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
3749
3750                 if( nnfunc )
3751                 {
3752                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
3753                         bufxy = (*m1)(Rect(x, y, bcols, brows));
3754                     else if( map_depth != CV_32F )
3755                     {
3756                         for( y1 = 0; y1 < brows; y1++ )
3757                         {
3758                             short* XY = bufxy.ptr<short>(y1);
3759                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
3760                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
3761
3762                             for( x1 = 0; x1 < bcols; x1++ )
3763                             {
3764                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
3765                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
3766                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
3767                             }
3768                         }
3769                     }
3770                     else if( !planar_input )
3771                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
3772                     else
3773                     {
3774                         for( y1 = 0; y1 < brows; y1++ )
3775                         {
3776                             short* XY = bufxy.ptr<short>(y1);
3777                             const float* sX = m1->ptr<float>(y+y1) + x;
3778                             const float* sY = m2->ptr<float>(y+y1) + x;
3779                             x1 = 0;
3780
3781                         #if CV_SSE2
3782                             if( useSIMD )
3783                             {
3784                                 for( ; x1 <= bcols - 8; x1 += 8 )
3785                                 {
3786                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
3787                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
3788                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
3789                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
3790                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
3791                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
3792                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
3793                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
3794                                     ix0 = _mm_packs_epi32(ix0, ix1);
3795                                     iy0 = _mm_packs_epi32(iy0, iy1);
3796                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
3797                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
3798                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
3799                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
3800                                 }
3801                             }
3802                         #endif
3803
3804                             for( ; x1 < bcols; x1++ )
3805                             {
3806                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
3807                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
3808                             }
3809                         }
3810                     }
3811                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
3812                     continue;
3813                 }
3814
3815                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
3816                 for( y1 = 0; y1 < brows; y1++ )
3817                 {
3818                     short* XY = bufxy.ptr<short>(y1);
3819                     ushort* A = bufa.ptr<ushort>(y1);
3820
3821                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
3822                     {
3823                         bufxy = (*m1)(Rect(x, y, bcols, brows));
3824
3825                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
3826                         x1 = 0;
3827
3828                     #if CV_NEON
3829                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
3830                         for ( ; x1 <= bcols - 8; x1 += 8)
3831                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
3832                     #endif
3833
3834                         for( ; x1 < bcols; x1++ )
3835                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
3836                     }
3837                     else if( planar_input )
3838                     {
3839                         const float* sX = m1->ptr<float>(y+y1) + x;
3840                         const float* sY = m2->ptr<float>(y+y1) + x;
3841
3842                         x1 = 0;
3843                     #if CV_SSE2
3844                         if( useSIMD )
3845                         {
3846                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
3847                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
3848                             for( ; x1 <= bcols - 8; x1 += 8 )
3849                             {
3850                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
3851                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
3852                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
3853                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
3854                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
3855                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
3856                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
3857                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
3858                                 __m128i mx0 = _mm_and_si128(ix0, mask);
3859                                 __m128i mx1 = _mm_and_si128(ix1, mask);
3860                                 __m128i my0 = _mm_and_si128(iy0, mask);
3861                                 __m128i my1 = _mm_and_si128(iy1, mask);
3862                                 mx0 = _mm_packs_epi32(mx0, mx1);
3863                                 my0 = _mm_packs_epi32(my0, my1);
3864                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
3865                                 mx0 = _mm_or_si128(mx0, my0);
3866                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
3867                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
3868                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
3869                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
3870                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
3871                                 ix0 = _mm_packs_epi32(ix0, ix1);
3872                                 iy0 = _mm_packs_epi32(iy0, iy1);
3873                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
3874                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
3875                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
3876                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
3877                             }
3878                         }
3879                     #elif CV_NEON
3880                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
3881                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
3882
3883                         for( ; x1 <= bcols - 4; x1 += 4 )
3884                         {
3885                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
3886                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
3887                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
3888                                                       vandq_s32(v_sy, v_scale2));
3889                             vst1_u16(A + x1, vqmovun_s32(v_v));
3890
3891                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
3892                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
3893                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
3894                         }
3895                     #endif
3896
3897                         for( ; x1 < bcols; x1++ )
3898                         {
3899                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
3900                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
3901                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
3902                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
3903                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
3904                             A[x1] = (ushort)v;
3905                         }
3906                     }
3907                     else
3908                     {
3909                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
3910                         x1 = 0;
3911
3912                     #if CV_NEON
3913                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
3914                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
3915
3916                         for( ; x1 <= bcols - 4; x1 += 4 )
3917                         {
3918                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
3919                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
3920                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
3921                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
3922                                                       vandq_s32(v_sy, v_scale2));
3923                             vst1_u16(A + x1, vqmovun_s32(v_v));
3924
3925                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
3926                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
3927                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
3928                         }
3929                     #endif
3930
3931                         for( x1 = 0; x1 < bcols; x1++ )
3932                         {
3933                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
3934                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
3935                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
3936                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
3937                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
3938                             A[x1] = (ushort)v;
3939                         }
3940                     }
3941                 }
3942                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
3943             }
3944         }
3945     }
3946
3947 private:
3948     const Mat* src;
3949     Mat* dst;
3950     const Mat *m1, *m2;
3951     int borderType;
3952     Scalar borderValue;
3953     int planar_input;
3954     RemapNNFunc nnfunc;
3955     RemapFunc ifunc;
3956     const void *ctab;
3957 };
3958
3959 #ifdef HAVE_OPENCL
3960
3961 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
3962                       int interpolation, int borderType, const Scalar& borderValue)
3963 {
3964     const ocl::Device & dev = ocl::Device::getDefault();
3965     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
3966             rowsPerWI = dev.isIntel() ? 4 : 1;
3967
3968     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
3969             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
3970         return false;
3971
3972     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
3973
3974     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
3975         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
3976     {
3977         if (map1.type() != CV_16SC2)
3978             std::swap(map1, map2);
3979     }
3980     else
3981         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
3982
3983     _dst.create(map1.size(), type);
3984     UMat dst = _dst.getUMat();
3985
3986     String kernelName = "remap";
3987     if (map1.type() == CV_32FC2 && map2.empty())
3988         kernelName += "_32FC2";
3989     else if (map1.type() == CV_16SC2)
3990     {
3991         kernelName += "_16SC2";
3992         if (!map2.empty())
3993             kernelName += "_16UC1";
3994     }
3995     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
3996         kernelName += "_2_32FC1";
3997     else
3998         CV_Error(Error::StsBadArg, "Unsupported map types");
3999
4000     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4001     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4002                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4003     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4004                                  interMap[interpolation], borderMap[borderType],
4005                                  ocl::typeToStr(type), rowsPerWI);
4006
4007     if (interpolation != INTER_NEAREST)
4008     {
4009         char cvt[3][40];
4010         int wdepth = std::max(CV_32F, depth);
4011         buildOptions = buildOptions
4012                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4013                                " -D convertToWT2=%s -D WT2=%s",
4014                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4015                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4016                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4017                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4018                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
4019     }
4020     int scalarcn = cn == 3 ? 4 : cn;
4021     int sctype = CV_MAKETYPE(depth, scalarcn);
4022     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4023                            ocl::typeToStr(type), ocl::typeToStr(depth),
4024                            cn, ocl::typeToStr(sctype), depth);
4025
4026     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
4027
4028     Mat scalar(1, 1, sctype, borderValue);
4029     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4030             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4031             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
4032
4033     if (map2.empty())
4034         k.args(srcarg, dstarg, map1arg, scalararg);
4035     else
4036         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
4037
4038     size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4039     return k.run(2, globalThreads, NULL, false);
4040 }
4041
4042 #endif
4043
4044 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4045
4046 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4047                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4048                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
4049
4050 class IPPRemapInvoker :
4051         public ParallelLoopBody
4052 {
4053 public:
4054     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4055                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4056         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4057         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
4058     {
4059         *ok = true;
4060     }
4061
4062     virtual void operator() (const Range & range) const
4063     {
4064         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4065         Mat dstRoi = dst.rowRange(range);
4066         IppiSize dstRoiSize = ippiSize(dstRoi.size());
4067         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4068
4069         if (borderType == BORDER_CONSTANT &&
4070                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
4071         {
4072             *ok = false;
4073             return;
4074         }
4075
4076         if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4077                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4078                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
4079             *ok = false;
4080         else
4081         {
4082             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4083         }
4084     }
4085
4086 private:
4087     Mat & src, & dst, & map1, & map2;
4088     ippiRemap ippFunc;
4089     int ippInterpolation, borderType;
4090     Scalar borderValue;
4091     bool * ok;
4092 };
4093
4094 #endif
4095
4096 }
4097
4098 void cv::remap( InputArray _src, OutputArray _dst,
4099                 InputArray _map1, InputArray _map2,
4100                 int interpolation, int borderType, const Scalar& borderValue )
4101 {
4102     static RemapNNFunc nn_tab[] =
4103     {
4104         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4105         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4106     };
4107
4108     static RemapFunc linear_tab[] =
4109     {
4110         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4111         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4112         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4113         remapBilinear<Cast<float, float>, RemapNoVec, float>,
4114         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4115     };
4116
4117     static RemapFunc cubic_tab[] =
4118     {
4119         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4120         remapBicubic<Cast<float, ushort>, float, 1>,
4121         remapBicubic<Cast<float, short>, float, 1>, 0,
4122         remapBicubic<Cast<float, float>, float, 1>,
4123         remapBicubic<Cast<double, double>, float, 1>, 0
4124     };
4125
4126     static RemapFunc lanczos4_tab[] =
4127     {
4128         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4129         remapLanczos4<Cast<float, ushort>, float, 1>,
4130         remapLanczos4<Cast<float, short>, float, 1>, 0,
4131         remapLanczos4<Cast<float, float>, float, 1>,
4132         remapLanczos4<Cast<double, double>, float, 1>, 0
4133     };
4134
4135     CV_Assert( _map1.size().area() > 0 );
4136     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
4137
4138     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4139                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4140
4141     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
4142     _dst.create( map1.size(), src.type() );
4143     Mat dst = _dst.getMat();
4144     if( dst.data == src.data )
4145         src = src.clone();
4146
4147     if( interpolation == INTER_AREA )
4148         interpolation = INTER_LINEAR;
4149
4150     int type = src.type(), depth = CV_MAT_DEPTH(type);
4151
4152 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4153     CV_IPP_CHECK()
4154     {
4155         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4156                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4157                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4158         {
4159             int ippInterpolation =
4160                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4161                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
4162
4163             ippiRemap ippFunc =
4164                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4165                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4166                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4167                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4168                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4169                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4170                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4171                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4172                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4173
4174             if (ippFunc)
4175             {
4176                 bool ok;
4177                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4178                                         borderType, borderValue, &ok);
4179                 Range range(0, dst.rows);
4180                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4181
4182                 if (ok)
4183                 {
4184                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4185                     return;
4186                 }
4187                 setIppErrorStatus();
4188             }
4189         }
4190     }
4191 #endif
4192
4193     RemapNNFunc nnfunc = 0;
4194     RemapFunc ifunc = 0;
4195     const void* ctab = 0;
4196     bool fixpt = depth == CV_8U;
4197     bool planar_input = false;
4198
4199     if( interpolation == INTER_NEAREST )
4200     {
4201         nnfunc = nn_tab[depth];
4202         CV_Assert( nnfunc != 0 );
4203     }
4204     else
4205     {
4206         if( interpolation == INTER_LINEAR )
4207             ifunc = linear_tab[depth];
4208         else if( interpolation == INTER_CUBIC )
4209             ifunc = cubic_tab[depth];
4210         else if( interpolation == INTER_LANCZOS4 )
4211             ifunc = lanczos4_tab[depth];
4212         else
4213             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4214         CV_Assert( ifunc != 0 );
4215         ctab = initInterTab2D( interpolation, fixpt );
4216     }
4217
4218     const Mat *m1 = &map1, *m2 = &map2;
4219
4220     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4221         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4222     {
4223         if( map1.type() != CV_16SC2 )
4224             std::swap(m1, m2);
4225     }
4226     else
4227     {
4228         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4229             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4230         planar_input = map1.channels() == 1;
4231     }
4232
4233     RemapInvoker invoker(src, dst, m1, m2,
4234                          borderType, borderValue, planar_input, nnfunc, ifunc,
4235                          ctab);
4236     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
4237 }
4238
4239
4240 void cv::convertMaps( InputArray _map1, InputArray _map2,
4241                       OutputArray _dstmap1, OutputArray _dstmap2,
4242                       int dstm1type, bool nninterpolate )
4243 {
4244     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4245     Size size = map1.size();
4246     const Mat *m1 = &map1, *m2 = &map2;
4247     int m1type = m1->type(), m2type = m2->type();
4248
4249     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4250                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4251                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4252                (m1type == CV_32FC2 && m2->empty()) );
4253
4254     if( m2type == CV_16SC2 )
4255     {
4256         std::swap( m1, m2 );
4257         std::swap( m1type, m2type );
4258     }
4259
4260     if( dstm1type <= 0 )
4261         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4262     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4263     _dstmap1.create( size, dstm1type );
4264     dstmap1 = _dstmap1.getMat();
4265
4266     if( !nninterpolate && dstm1type != CV_32FC2 )
4267     {
4268         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4269         dstmap2 = _dstmap2.getMat();
4270     }
4271     else
4272         _dstmap2.release();
4273
4274     if( m1type == dstm1type || (nninterpolate &&
4275         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4276         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4277     {
4278         m1->convertTo( dstmap1, dstmap1.type() );
4279         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4280             m2->copyTo( dstmap2 );
4281         return;
4282     }
4283
4284     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
4285     {
4286         Mat vdata[] = { *m1, *m2 };
4287         merge( vdata, 2, dstmap1 );
4288         return;
4289     }
4290
4291     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
4292     {
4293         Mat mv[] = { dstmap1, dstmap2 };
4294         split( *m1, mv );
4295         return;
4296     }
4297
4298     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
4299         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
4300     {
4301         size.width *= size.height;
4302         size.height = 1;
4303     }
4304
4305     const float scale = 1.f/INTER_TAB_SIZE;
4306     int x, y;
4307     for( y = 0; y < size.height; y++ )
4308     {
4309         const float* src1f = m1->ptr<float>(y);
4310         const float* src2f = m2->ptr<float>(y);
4311         const short* src1 = (const short*)src1f;
4312         const ushort* src2 = (const ushort*)src2f;
4313
4314         float* dst1f = dstmap1.ptr<float>(y);
4315         float* dst2f = dstmap2.ptr<float>(y);
4316         short* dst1 = (short*)dst1f;
4317         ushort* dst2 = (ushort*)dst2f;
4318
4319         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
4320         {
4321             if( nninterpolate )
4322                 for( x = 0; x < size.width; x++ )
4323                 {
4324                     dst1[x*2] = saturate_cast<short>(src1f[x]);
4325                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
4326                 }
4327             else
4328                 for( x = 0; x < size.width; x++ )
4329                 {
4330                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
4331                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
4332                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4333                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4334                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
4335                 }
4336         }
4337         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
4338         {
4339             if( nninterpolate )
4340                 for( x = 0; x < size.width; x++ )
4341                 {
4342                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
4343                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
4344                 }
4345             else
4346                 for( x = 0; x < size.width; x++ )
4347                 {
4348                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
4349                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
4350                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4351                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4352                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
4353                 }
4354         }
4355         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
4356         {
4357             for( x = 0; x < size.width; x++ )
4358             {
4359                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
4360                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4361                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4362             }
4363         }
4364         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
4365         {
4366             for( x = 0; x < size.width; x++ )
4367             {
4368                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
4369                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4370                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4371             }
4372         }
4373         else
4374             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
4375     }
4376 }
4377
4378
4379 namespace cv
4380 {
4381
4382 class WarpAffineInvoker :
4383     public ParallelLoopBody
4384 {
4385 public:
4386     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
4387                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
4388         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
4389         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
4390         M(_M)
4391     {
4392     }
4393
4394     virtual void operator() (const Range& range) const
4395     {
4396         const int BLOCK_SZ = 64;
4397         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
4398         const int AB_BITS = MAX(10, (int)INTER_BITS);
4399         const int AB_SCALE = 1 << AB_BITS;
4400         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
4401     #if CV_SSE2
4402         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
4403     #endif
4404
4405         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
4406         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
4407         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
4408
4409         for( y = range.start; y < range.end; y += bh0 )
4410         {
4411             for( x = 0; x < dst.cols; x += bw0 )
4412             {
4413                 int bw = std::min( bw0, dst.cols - x);
4414                 int bh = std::min( bh0, range.end - y);
4415
4416                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4417                 Mat dpart(dst, Rect(x, y, bw, bh));
4418
4419                 for( y1 = 0; y1 < bh; y1++ )
4420                 {
4421                     short* xy = XY + y1*bw*2;
4422                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
4423                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
4424
4425                     if( interpolation == INTER_NEAREST )
4426                         for( x1 = 0; x1 < bw; x1++ )
4427                         {
4428                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
4429                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
4430                             xy[x1*2] = saturate_cast<short>(X);
4431                             xy[x1*2+1] = saturate_cast<short>(Y);
4432                         }
4433                     else
4434                     {
4435                         short* alpha = A + y1*bw;
4436                         x1 = 0;
4437                     #if CV_SSE2
4438                         if( useSIMD )
4439                         {
4440                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
4441                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
4442                             for( ; x1 <= bw - 8; x1 += 8 )
4443                             {
4444                                 __m128i tx0, tx1, ty0, ty1;
4445                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
4446                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
4447                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
4448                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
4449
4450                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
4451                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
4452                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
4453                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
4454
4455                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
4456                                                             _mm_and_si128(tx1, fxy_mask));
4457                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
4458                                                             _mm_and_si128(ty1, fxy_mask));
4459                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
4460                                                             _mm_srai_epi32(tx1, INTER_BITS));
4461                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
4462                                                     _mm_srai_epi32(ty1, INTER_BITS));
4463                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
4464
4465                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
4466                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
4467                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
4468                             }
4469                         }
4470                     #endif
4471                         for( ; x1 < bw; x1++ )
4472                         {
4473                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
4474                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
4475                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4476                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4477                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4478                                     (X & (INTER_TAB_SIZE-1)));
4479                         }
4480                     }
4481                 }
4482
4483                 if( interpolation == INTER_NEAREST )
4484                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
4485                 else
4486                 {
4487                     Mat _matA(bh, bw, CV_16U, A);
4488                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
4489                 }
4490             }
4491         }
4492     }
4493
4494 private:
4495     Mat src;
4496     Mat dst;
4497     int interpolation, borderType;
4498     Scalar borderValue;
4499     int *adelta, *bdelta;
4500     double *M;
4501 };
4502
4503
4504 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4505 class IPPWarpAffineInvoker :
4506     public ParallelLoopBody
4507 {
4508 public:
4509     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
4510                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
4511         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4512         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
4513     {
4514         *ok = true;
4515     }
4516
4517     virtual void operator() (const Range& range) const
4518     {
4519         IppiSize srcsize = { src.cols, src.rows };
4520         IppiRect srcroi = { 0, 0, src.cols, src.rows };
4521         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
4522         int cnn = src.channels();
4523         if( borderType == BORDER_CONSTANT )
4524         {
4525             IppiSize setSize = { dst.cols, range.end - range.start };
4526             void *dataPointer = dst.ptr(range.start);
4527             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
4528             {
4529                 *ok = false;
4530                 return;
4531             }
4532         }
4533
4534         // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
4535         IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
4536                                 (int)dst.step[0], dstroi, coeffs, mode );
4537         if( status < 0)
4538             *ok = false;
4539         else
4540         {
4541             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4542         }
4543     }
4544 private:
4545     Mat &src;
4546     Mat &dst;
4547     int mode;
4548     double (&coeffs)[2][3];
4549     int borderType;
4550     Scalar borderValue;
4551     ippiWarpAffineBackFunc func;
4552     bool *ok;
4553     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
4554 };
4555 #endif
4556
4557 #ifdef HAVE_OPENCL
4558
4559 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
4560
4561 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
4562                               Size dsize, int flags, int borderType, const Scalar& borderValue,
4563                               int op_type)
4564 {
4565     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
4566     const ocl::Device & dev = ocl::Device::getDefault();
4567
4568     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4569     double doubleSupport = dev.doubleFPConfig() > 0;
4570
4571     int interpolation = flags & INTER_MAX;
4572     if( interpolation == INTER_AREA )
4573         interpolation = INTER_LINEAR;
4574     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
4575
4576     if ( !(borderType == cv::BORDER_CONSTANT &&
4577            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
4578          (!doubleSupport && depth == CV_64F) || cn > 4)
4579         return false;
4580
4581     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
4582     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
4583                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
4584     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
4585
4586     int scalarcn = cn == 3 ? 4 : cn;
4587     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
4588     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
4589     int sctype = CV_MAKETYPE(wdepth, scalarcn);
4590
4591     ocl::Kernel k;
4592     String opts;
4593     if (interpolation == INTER_NEAREST)
4594     {
4595         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
4596                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
4597                       ocl::typeToStr(CV_MAT_DEPTH(type)),
4598                       ocl::typeToStr(sctype), cn, rowsPerWI);
4599     }
4600     else
4601     {
4602         char cvt[2][50];
4603         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
4604                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
4605                       interpolationMap[interpolation], ocl::typeToStr(type),
4606                       ocl::typeToStr(CV_MAT_DEPTH(type)),
4607                       ocl::typeToStr(sctype),
4608                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
4609                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
4610                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
4611                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
4612     }
4613
4614     k.create(kernelName, program, opts);
4615     if (k.empty())
4616         return false;
4617
4618     double borderBuf[] = { 0, 0, 0, 0 };
4619     scalarToRawData(borderValue, borderBuf, sctype);
4620
4621     UMat src = _src.getUMat(), M0;
4622     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4623     UMat dst = _dst.getUMat();
4624
4625     double M[9];
4626     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
4627     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
4628     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
4629                M1.rows == matRows && M1.cols == 3 );
4630     M1.convertTo(matM, matM.type());
4631
4632     if( !(flags & WARP_INVERSE_MAP) )
4633     {
4634         if (op_type == OCL_OP_PERSPECTIVE)
4635             invert(matM, matM);
4636         else
4637         {
4638             double D = M[0]*M[4] - M[1]*M[3];
4639             D = D != 0 ? 1./D : 0;
4640             double A11 = M[4]*D, A22=M[0]*D;
4641             M[0] = A11; M[1] *= -D;
4642             M[3] *= -D; M[4] = A22;
4643             double b1 = -M[0]*M[2] - M[1]*M[5];
4644             double b2 = -M[3]*M[2] - M[4]*M[5];
4645             M[2] = b1; M[5] = b2;
4646         }
4647     }
4648     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
4649
4650     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
4651            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
4652
4653     size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4654     return k.run(2, globalThreads, NULL, false);
4655 }
4656
4657 #endif
4658
4659 }
4660
4661
4662 void cv::warpAffine( InputArray _src, OutputArray _dst,
4663                      InputArray _M0, Size dsize,
4664                      int flags, int borderType, const Scalar& borderValue )
4665 {
4666     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4667                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
4668                                  borderValue, OCL_OP_AFFINE))
4669
4670     Mat src = _src.getMat(), M0 = _M0.getMat();
4671     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4672     Mat dst = _dst.getMat();
4673     CV_Assert( src.cols > 0 && src.rows > 0 );
4674     if( dst.data == src.data )
4675         src = src.clone();
4676
4677     double M[6];
4678     Mat matM(2, 3, CV_64F, M);
4679     int interpolation = flags & INTER_MAX;
4680     if( interpolation == INTER_AREA )
4681         interpolation = INTER_LINEAR;
4682
4683     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
4684     M0.convertTo(matM, matM.type());
4685
4686 #ifdef HAVE_TEGRA_OPTIMIZATION
4687     if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
4688         return;
4689 #endif
4690
4691     if( !(flags & WARP_INVERSE_MAP) )
4692     {
4693         double D = M[0]*M[4] - M[1]*M[3];
4694         D = D != 0 ? 1./D : 0;
4695         double A11 = M[4]*D, A22=M[0]*D;
4696         M[0] = A11; M[1] *= -D;
4697         M[3] *= -D; M[4] = A22;
4698         double b1 = -M[0]*M[2] - M[1]*M[5];
4699         double b2 = -M[3]*M[2] - M[4]*M[5];
4700         M[2] = b1; M[5] = b2;
4701     }
4702
4703     int x;
4704     AutoBuffer<int> _abdelta(dst.cols*2);
4705     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
4706     const int AB_BITS = MAX(10, (int)INTER_BITS);
4707     const int AB_SCALE = 1 << AB_BITS;
4708
4709 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4710     CV_IPP_CHECK()
4711     {
4712         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4713         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
4714            ( cn == 1 || cn == 3 || cn == 4 ) &&
4715            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
4716            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
4717         {
4718             ippiWarpAffineBackFunc ippFunc = 0;
4719             if ((flags & WARP_INVERSE_MAP) != 0)
4720             {
4721                 ippFunc =
4722                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
4723                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
4724                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
4725                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
4726                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
4727                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
4728                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
4729                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
4730                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
4731                 0;
4732             }
4733             else
4734             {
4735                 ippFunc =
4736                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
4737                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
4738                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
4739                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
4740                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
4741                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
4742                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
4743                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
4744                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
4745                 0;
4746             }
4747             int mode =
4748             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
4749             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4750             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
4751             0;
4752             CV_Assert(mode && ippFunc);
4753
4754             double coeffs[2][3];
4755             for( int i = 0; i < 2; i++ )
4756                 for( int j = 0; j < 3; j++ )
4757                     coeffs[i][j] = matM.at<double>(i, j);
4758
4759             bool ok;
4760             Range range(0, dst.rows);
4761             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
4762             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
4763             if( ok )
4764             {
4765                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4766                 return;
4767             }
4768             setIppErrorStatus();
4769         }
4770     }
4771 #endif
4772
4773     for( x = 0; x < dst.cols; x++ )
4774     {
4775         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
4776         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
4777     }
4778
4779     Range range(0, dst.rows);
4780     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
4781                               borderValue, adelta, bdelta, M);
4782     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
4783 }
4784
4785
4786 namespace cv
4787 {
4788
4789 class WarpPerspectiveInvoker :
4790     public ParallelLoopBody
4791 {
4792 public:
4793     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
4794                            int _borderType, const Scalar &_borderValue) :
4795         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
4796         borderType(_borderType), borderValue(_borderValue)
4797     {
4798     }
4799
4800     virtual void operator() (const Range& range) const
4801     {
4802         const int BLOCK_SZ = 32;
4803         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
4804         int x, y, x1, y1, width = dst.cols, height = dst.rows;
4805
4806         int bh0 = std::min(BLOCK_SZ/2, height);
4807         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
4808         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
4809
4810         for( y = range.start; y < range.end; y += bh0 )
4811         {
4812             for( x = 0; x < width; x += bw0 )
4813             {
4814                 int bw = std::min( bw0, width - x);
4815                 int bh = std::min( bh0, range.end - y); // height
4816
4817                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4818                 Mat dpart(dst, Rect(x, y, bw, bh));
4819
4820                 for( y1 = 0; y1 < bh; y1++ )
4821                 {
4822                     short* xy = XY + y1*bw*2;
4823                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
4824                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
4825                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
4826
4827                     if( interpolation == INTER_NEAREST )
4828                         for( x1 = 0; x1 < bw; x1++ )
4829                         {
4830                             double W = W0 + M[6]*x1;
4831                             W = W ? 1./W : 0;
4832                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
4833                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
4834                             int X = saturate_cast<int>(fX);
4835                             int Y = saturate_cast<int>(fY);
4836
4837                             xy[x1*2] = saturate_cast<short>(X);
4838                             xy[x1*2+1] = saturate_cast<short>(Y);
4839                         }
4840                     else
4841                     {
4842                         short* alpha = A + y1*bw;
4843                         for( x1 = 0; x1 < bw; x1++ )
4844                         {
4845                             double W = W0 + M[6]*x1;
4846                             W = W ? INTER_TAB_SIZE/W : 0;
4847                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
4848                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
4849                             int X = saturate_cast<int>(fX);
4850                             int Y = saturate_cast<int>(fY);
4851
4852                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4853                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4854                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4855                                                 (X & (INTER_TAB_SIZE-1)));
4856                         }
4857                     }
4858                 }
4859
4860                 if( interpolation == INTER_NEAREST )
4861                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
4862                 else
4863                 {
4864                     Mat _matA(bh, bw, CV_16U, A);
4865                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
4866                 }
4867             }
4868         }
4869     }
4870
4871 private:
4872     Mat src;
4873     Mat dst;
4874     double* M;
4875     int interpolation, borderType;
4876     Scalar borderValue;
4877 };
4878
4879
4880 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4881 class IPPWarpPerspectiveInvoker :
4882     public ParallelLoopBody
4883 {
4884 public:
4885     IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
4886                               int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
4887         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4888         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
4889     {
4890         *ok = true;
4891     }
4892
4893     virtual void operator() (const Range& range) const
4894     {
4895         IppiSize srcsize = {src.cols, src.rows};
4896         IppiRect srcroi = {0, 0, src.cols, src.rows};
4897         IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
4898         int cnn = src.channels();
4899
4900         if( borderType == BORDER_CONSTANT )
4901         {
4902             IppiSize setSize = {dst.cols, range.end - range.start};
4903             void *dataPointer = dst.ptr(range.start);
4904             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
4905             {
4906                 *ok = false;
4907                 return;
4908             }
4909         }
4910
4911         IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
4912         if (status != ippStsNoErr)
4913             *ok = false;
4914         else
4915         {
4916             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4917         }
4918     }
4919 private:
4920     Mat &src;
4921     Mat &dst;
4922     int mode;
4923     double (&coeffs)[3][3];
4924     int borderType;
4925     const Scalar borderValue;
4926     ippiWarpPerspectiveFunc func;
4927     bool *ok;
4928
4929     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
4930 };
4931 #endif
4932 }
4933
4934 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
4935                           Size dsize, int flags, int borderType, const Scalar& borderValue )
4936 {
4937     CV_Assert( _src.total() > 0 );
4938
4939     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4940                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
4941                               OCL_OP_PERSPECTIVE))
4942
4943     Mat src = _src.getMat(), M0 = _M0.getMat();
4944     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4945     Mat dst = _dst.getMat();
4946
4947     if( dst.data == src.data )
4948         src = src.clone();
4949
4950     double M[9];
4951     Mat matM(3, 3, CV_64F, M);
4952     int interpolation = flags & INTER_MAX;
4953     if( interpolation == INTER_AREA )
4954         interpolation = INTER_LINEAR;
4955
4956     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
4957     M0.convertTo(matM, matM.type());
4958
4959 #ifdef HAVE_TEGRA_OPTIMIZATION
4960     if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
4961         return;
4962 #endif
4963
4964
4965 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4966     CV_IPP_CHECK()
4967     {
4968         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4969         if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
4970            (cn == 1 || cn == 3 || cn == 4) &&
4971            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
4972            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
4973         {
4974             ippiWarpPerspectiveFunc ippFunc = 0;
4975             if ((flags & WARP_INVERSE_MAP) != 0)
4976             {
4977                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
4978                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
4979                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
4980                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
4981                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
4982                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
4983                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
4984                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
4985                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
4986             }
4987             else
4988             {
4989                 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
4990                 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
4991                 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
4992                 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
4993                 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
4994                 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
4995                 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
4996                 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
4997                 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
4998             }
4999             int mode =
5000             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5001             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5002             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
5003             CV_Assert(mode && ippFunc);
5004
5005             double coeffs[3][3];
5006             for( int i = 0; i < 3; i++ )
5007                 for( int j = 0; j < 3; j++ )
5008                     coeffs[i][j] = matM.at<double>(i, j);
5009
5010             bool ok;
5011             Range range(0, dst.rows);
5012             IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5013             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5014             if( ok )
5015             {
5016                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5017                 return;
5018             }
5019             setIppErrorStatus();
5020         }
5021     }
5022 #endif
5023
5024     if( !(flags & WARP_INVERSE_MAP) )
5025         invert(matM, matM);
5026
5027     Range range(0, dst.rows);
5028     WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
5029     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5030 }
5031
5032
5033 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
5034 {
5035     angle *= CV_PI/180;
5036     double alpha = cos(angle)*scale;
5037     double beta = sin(angle)*scale;
5038
5039     Mat M(2, 3, CV_64F);
5040     double* m = M.ptr<double>();
5041
5042     m[0] = alpha;
5043     m[1] = beta;
5044     m[2] = (1-alpha)*center.x - beta*center.y;
5045     m[3] = -beta;
5046     m[4] = alpha;
5047     m[5] = beta*center.x + (1-alpha)*center.y;
5048
5049     return M;
5050 }
5051
5052 /* Calculates coefficients of perspective transformation
5053  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
5054  *
5055  *      c00*xi + c01*yi + c02
5056  * ui = ---------------------
5057  *      c20*xi + c21*yi + c22
5058  *
5059  *      c10*xi + c11*yi + c12
5060  * vi = ---------------------
5061  *      c20*xi + c21*yi + c22
5062  *
5063  * Coefficients are calculated by solving linear system:
5064  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
5065  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
5066  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
5067  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
5068  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
5069  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
5070  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
5071  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
5072  *
5073  * where:
5074  *   cij - matrix coefficients, c22 = 1
5075  */
5076 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
5077 {
5078     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
5079     double a[8][8], b[8];
5080     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
5081
5082     for( int i = 0; i < 4; ++i )
5083     {
5084         a[i][0] = a[i+4][3] = src[i].x;
5085         a[i][1] = a[i+4][4] = src[i].y;
5086         a[i][2] = a[i+4][5] = 1;
5087         a[i][3] = a[i][4] = a[i][5] =
5088         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
5089         a[i][6] = -src[i].x*dst[i].x;
5090         a[i][7] = -src[i].y*dst[i].x;
5091         a[i+4][6] = -src[i].x*dst[i].y;
5092         a[i+4][7] = -src[i].y*dst[i].y;
5093         b[i] = dst[i].x;
5094         b[i+4] = dst[i].y;
5095     }
5096
5097     solve( A, B, X, DECOMP_SVD );
5098     M.ptr<double>()[8] = 1.;
5099
5100     return M;
5101 }
5102
5103 /* Calculates coefficients of affine transformation
5104  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
5105  *
5106  * ui = c00*xi + c01*yi + c02
5107  *
5108  * vi = c10*xi + c11*yi + c12
5109  *
5110  * Coefficients are calculated by solving linear system:
5111  * / x0 y0  1  0  0  0 \ /c00\ /u0\
5112  * | x1 y1  1  0  0  0 | |c01| |u1|
5113  * | x2 y2  1  0  0  0 | |c02| |u2|
5114  * |  0  0  0 x0 y0  1 | |c10| |v0|
5115  * |  0  0  0 x1 y1  1 | |c11| |v1|
5116  * \  0  0  0 x2 y2  1 / |c12| |v2|
5117  *
5118  * where:
5119  *   cij - matrix coefficients
5120  */
5121
5122 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
5123 {
5124     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
5125     double a[6*6], b[6];
5126     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
5127
5128     for( int i = 0; i < 3; i++ )
5129     {
5130         int j = i*12;
5131         int k = i*12+6;
5132         a[j] = a[k+3] = src[i].x;
5133         a[j+1] = a[k+4] = src[i].y;
5134         a[j+2] = a[k+5] = 1;
5135         a[j+3] = a[j+4] = a[j+5] = 0;
5136         a[k] = a[k+1] = a[k+2] = 0;
5137         b[i*2] = dst[i].x;
5138         b[i*2+1] = dst[i].y;
5139     }
5140
5141     solve( A, B, X );
5142     return M;
5143 }
5144
5145 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
5146 {
5147     Mat matM = _matM.getMat();
5148     CV_Assert(matM.rows == 2 && matM.cols == 3);
5149     __iM.create(2, 3, matM.type());
5150     Mat _iM = __iM.getMat();
5151
5152     if( matM.type() == CV_32F )
5153     {
5154         const float* M = matM.ptr<float>();
5155         float* iM = _iM.ptr<float>();
5156         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5157
5158         double D = M[0]*M[step+1] - M[1]*M[step];
5159         D = D != 0 ? 1./D : 0;
5160         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5161         double b1 = -A11*M[2] - A12*M[step+2];
5162         double b2 = -A21*M[2] - A22*M[step+2];
5163
5164         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
5165         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
5166     }
5167     else if( matM.type() == CV_64F )
5168     {
5169         const double* M = matM.ptr<double>();
5170         double* iM = _iM.ptr<double>();
5171         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5172
5173         double D = M[0]*M[step+1] - M[1]*M[step];
5174         D = D != 0 ? 1./D : 0;
5175         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5176         double b1 = -A11*M[2] - A12*M[step+2];
5177         double b2 = -A21*M[2] - A22*M[step+2];
5178
5179         iM[0] = A11; iM[1] = A12; iM[2] = b1;
5180         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
5181     }
5182     else
5183         CV_Error( CV_StsUnsupportedFormat, "" );
5184 }
5185
5186 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
5187 {
5188     Mat src = _src.getMat(), dst = _dst.getMat();
5189     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
5190     return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5191 }
5192
5193 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
5194 {
5195     Mat src = _src.getMat(), dst = _dst.getMat();
5196     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
5197     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5198 }
5199
5200 CV_IMPL void
5201 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
5202 {
5203     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5204     CV_Assert( src.type() == dst.type() );
5205     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
5206         (double)dst.rows/src.rows, method );
5207 }
5208
5209
5210 CV_IMPL void
5211 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5212               int flags, CvScalar fillval )
5213 {
5214     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5215     cv::Mat matrix = cv::cvarrToMat(marr);
5216     CV_Assert( src.type() == dst.type() );
5217     cv::warpAffine( src, dst, matrix, dst.size(), flags,
5218         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5219         fillval );
5220 }
5221
5222 CV_IMPL void
5223 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5224                    int flags, CvScalar fillval )
5225 {
5226     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5227     cv::Mat matrix = cv::cvarrToMat(marr);
5228     CV_Assert( src.type() == dst.type() );
5229     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
5230         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5231         fillval );
5232 }
5233
5234 CV_IMPL void
5235 cvRemap( const CvArr* srcarr, CvArr* dstarr,
5236          const CvArr* _mapx, const CvArr* _mapy,
5237          int flags, CvScalar fillval )
5238 {
5239     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
5240     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
5241     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
5242     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
5243         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5244         fillval );
5245     CV_Assert( dst0.data == dst.data );
5246 }
5247
5248
5249 CV_IMPL CvMat*
5250 cv2DRotationMatrix( CvPoint2D32f center, double angle,
5251                     double scale, CvMat* matrix )
5252 {
5253     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
5254     CV_Assert( M.size() == M0.size() );
5255     M.convertTo(M0, M0.type());
5256     return matrix;
5257 }
5258
5259
5260 CV_IMPL CvMat*
5261 cvGetPerspectiveTransform( const CvPoint2D32f* src,
5262                           const CvPoint2D32f* dst,
5263                           CvMat* matrix )
5264 {
5265     cv::Mat M0 = cv::cvarrToMat(matrix),
5266         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5267     CV_Assert( M.size() == M0.size() );
5268     M.convertTo(M0, M0.type());
5269     return matrix;
5270 }
5271
5272
5273 CV_IMPL CvMat*
5274 cvGetAffineTransform( const CvPoint2D32f* src,
5275                           const CvPoint2D32f* dst,
5276                           CvMat* matrix )
5277 {
5278     cv::Mat M0 = cv::cvarrToMat(matrix),
5279         M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5280     CV_Assert( M.size() == M0.size() );
5281     M.convertTo(M0, M0.type());
5282     return matrix;
5283 }
5284
5285
5286 CV_IMPL void
5287 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
5288 {
5289     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
5290     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
5291
5292     if( arr2 )
5293         map2 = cv::cvarrToMat(arr2);
5294     if( dstarr2 )
5295     {
5296         dstmap2 = cv::cvarrToMat(dstarr2);
5297         if( dstmap2.type() == CV_16SC1 )
5298             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
5299     }
5300
5301     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
5302 }
5303
5304 /****************************************************************************************\
5305 *                                   Log-Polar Transform                                  *
5306 \****************************************************************************************/
5307
5308 /* now it is done via Remap; more correct implementation should use
5309    some super-sampling technique outside of the "fovea" circle */
5310 CV_IMPL void
5311 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
5312             CvPoint2D32f center, double M, int flags )
5313 {
5314     cv::Ptr<CvMat> mapx, mapy;
5315
5316     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
5317     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
5318     CvSize ssize, dsize;
5319
5320     if( !CV_ARE_TYPES_EQ( src, dst ))
5321         CV_Error( CV_StsUnmatchedFormats, "" );
5322
5323     if( M <= 0 )
5324         CV_Error( CV_StsOutOfRange, "M should be >0" );
5325
5326     ssize = cvGetMatSize(src);
5327     dsize = cvGetMatSize(dst);
5328
5329     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5330     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5331
5332     if( !(flags & CV_WARP_INVERSE_MAP) )
5333     {
5334         int phi, rho;
5335         cv::AutoBuffer<double> _exp_tab(dsize.width);
5336         double* exp_tab = _exp_tab;
5337
5338         for( rho = 0; rho < dst->width; rho++ )
5339             exp_tab[rho] = std::exp(rho/M);
5340
5341         for( phi = 0; phi < dsize.height; phi++ )
5342         {
5343             double cp = cos(phi*2*CV_PI/dsize.height);
5344             double sp = sin(phi*2*CV_PI/dsize.height);
5345             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5346             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5347
5348             for( rho = 0; rho < dsize.width; rho++ )
5349             {
5350                 double r = exp_tab[rho];
5351                 double x = r*cp + center.x;
5352                 double y = r*sp + center.y;
5353
5354                 mx[rho] = (float)x;
5355                 my[rho] = (float)y;
5356             }
5357         }
5358     }
5359     else
5360     {
5361         int x, y;
5362         CvMat bufx, bufy, bufp, bufa;
5363         double ascale = ssize.height/(2*CV_PI);
5364         cv::AutoBuffer<float> _buf(4*dsize.width);
5365         float* buf = _buf;
5366
5367         bufx = cvMat( 1, dsize.width, CV_32F, buf );
5368         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5369         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5370         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5371
5372         for( x = 0; x < dsize.width; x++ )
5373             bufx.data.fl[x] = (float)x - center.x;
5374
5375         for( y = 0; y < dsize.height; y++ )
5376         {
5377             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5378             float* my = (float*)(mapy->data.ptr + y*mapy->step);
5379
5380             for( x = 0; x < dsize.width; x++ )
5381                 bufy.data.fl[x] = (float)y - center.y;
5382
5383 #if 1
5384             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
5385
5386             for( x = 0; x < dsize.width; x++ )
5387                 bufp.data.fl[x] += 1.f;
5388
5389             cvLog( &bufp, &bufp );
5390
5391             for( x = 0; x < dsize.width; x++ )
5392             {
5393                 double rho = bufp.data.fl[x]*M;
5394                 double phi = bufa.data.fl[x]*ascale;
5395
5396                 mx[x] = (float)rho;
5397                 my[x] = (float)phi;
5398             }
5399 #else
5400             for( x = 0; x < dsize.width; x++ )
5401             {
5402                 double xx = bufx.data.fl[x];
5403                 double yy = bufy.data.fl[x];
5404
5405                 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
5406                 double a = atan2(yy,xx);
5407                 if( a < 0 )
5408                     a = 2*CV_PI + a;
5409                 a *= ascale;
5410
5411                 mx[x] = (float)p;
5412                 my[x] = (float)a;
5413             }
5414 #endif
5415         }
5416     }
5417
5418     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5419 }
5420
5421 void cv::logPolar( InputArray _src, OutputArray _dst,
5422                    Point2f center, double M, int flags )
5423 {
5424     Mat src = _src.getMat();
5425     _dst.create( src.size(), src.type() );
5426     CvMat c_src = src, c_dst = _dst.getMat();
5427     cvLogPolar( &c_src, &c_dst, center, M, flags );
5428 }
5429
5430 /****************************************************************************************
5431                                    Linear-Polar Transform
5432   J.L. Blanco, Apr 2009
5433  ****************************************************************************************/
5434 CV_IMPL
5435 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
5436             CvPoint2D32f center, double maxRadius, int flags )
5437 {
5438     cv::Ptr<CvMat> mapx, mapy;
5439
5440     CvMat srcstub, *src = (CvMat*)srcarr;
5441     CvMat dststub, *dst = (CvMat*)dstarr;
5442     CvSize ssize, dsize;
5443
5444     src = cvGetMat( srcarr, &srcstub,0,0 );
5445     dst = cvGetMat( dstarr, &dststub,0,0 );
5446
5447     if( !CV_ARE_TYPES_EQ( src, dst ))
5448         CV_Error( CV_StsUnmatchedFormats, "" );
5449
5450     ssize.width = src->cols;
5451     ssize.height = src->rows;
5452     dsize.width = dst->cols;
5453     dsize.height = dst->rows;
5454
5455     mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5456     mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5457
5458     if( !(flags & CV_WARP_INVERSE_MAP) )
5459     {
5460         int phi, rho;
5461
5462         for( phi = 0; phi < dsize.height; phi++ )
5463         {
5464             double cp = cos(phi*2*CV_PI/dsize.height);
5465             double sp = sin(phi*2*CV_PI/dsize.height);
5466             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5467             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5468
5469             for( rho = 0; rho < dsize.width; rho++ )
5470             {
5471                 double r = maxRadius*(rho+1)/dsize.width;
5472                 double x = r*cp + center.x;
5473                 double y = r*sp + center.y;
5474
5475                 mx[rho] = (float)x;
5476                 my[rho] = (float)y;
5477             }
5478         }
5479     }
5480     else
5481     {
5482         int x, y;
5483         CvMat bufx, bufy, bufp, bufa;
5484         const double ascale = ssize.height/(2*CV_PI);
5485         const double pscale = ssize.width/maxRadius;
5486
5487         cv::AutoBuffer<float> _buf(4*dsize.width);
5488         float* buf = _buf;
5489
5490         bufx = cvMat( 1, dsize.width, CV_32F, buf );
5491         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5492         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5493         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5494
5495         for( x = 0; x < dsize.width; x++ )
5496             bufx.data.fl[x] = (float)x - center.x;
5497
5498         for( y = 0; y < dsize.height; y++ )
5499         {
5500             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5501             float* my = (float*)(mapy->data.ptr + y*mapy->step);
5502
5503             for( x = 0; x < dsize.width; x++ )
5504                 bufy.data.fl[x] = (float)y - center.y;
5505
5506             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
5507
5508             for( x = 0; x < dsize.width; x++ )
5509                 bufp.data.fl[x] += 1.f;
5510
5511             for( x = 0; x < dsize.width; x++ )
5512             {
5513                 double rho = bufp.data.fl[x]*pscale;
5514                 double phi = bufa.data.fl[x]*ascale;
5515                 mx[x] = (float)rho;
5516                 my[x] = (float)phi;
5517             }
5518         }
5519     }
5520
5521     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5522 }
5523
5524 void cv::linearPolar( InputArray _src, OutputArray _dst,
5525                       Point2f center, double maxRadius, int flags )
5526 {
5527     Mat src = _src.getMat();
5528     _dst.create( src.size(), src.type() );
5529     CvMat c_src = src, c_dst = _dst.getMat();
5530     cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
5531 }
5532
5533 /* End of file. */