modules/imgproc/src/imgwarp.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /* ////////////////////////////////////////////////////////////////////
  44 //
  45 //  Geometrical transforms on images and matrices: rotation, zoom etc.
  46 //
  47 // */
  48
  49 #include "precomp.hpp"
  50 #include <iostream>
  51 #include <vector>
  52
  53 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
  54 static IppStatus sts = ippInit();
  55 #endif
  56
  57 namespace cv
  58 {
  59
  60 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
  61     typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
  62     typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
  63     typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
  64     typedef IppStatus (CV_STDCALL* ippiResizeSqrPixelFunc)(const void*, IppiSize, int, IppiRect, void*, int, IppiRect, double, double, double, double, int, Ipp8u *);
  65
  66     template <int channels, typename Type>
  67     bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
  68     {
  69         Type values[channels];
  70         for( int i = 0; i < channels; i++ )
  71             values[i] = (Type)value[i];
  72         return func(values, dataPointer, step, size) >= 0;
  73     }
  74
  75     bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
  76     {
  77         if( channels == 1 )
  78         {
  79             switch( depth )
  80             {
  81             case CV_8U:
  82                 return ippiSet_8u_C1R((Ipp8u)value[0], (Ipp8u *)dataPointer, step, size) >= 0;
  83             case CV_16U:
  84                 return ippiSet_16u_C1R((Ipp16u)value[0], (Ipp16u *)dataPointer, step, size) >= 0;
  85             case CV_32F:
  86                 return ippiSet_32f_C1R((Ipp32f)value[0], (Ipp32f *)dataPointer, step, size) >= 0;
  87             }
  88         }
  89         else
  90         {
  91             if( channels == 3 )
  92             {
  93                 switch( depth )
  94                 {
  95                 case CV_8U:
  96                     return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
  97                 case CV_16U:
  98                     return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
  99                 case CV_32F:
 100                     return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
 101                 }
 102             }
 103             else if( channels == 4 )
 104             {
 105                 switch( depth )
 106                 {
 107                 case CV_8U:
 108                     return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
 109                 case CV_16U:
 110                     return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
 111                 case CV_32F:
 112                     return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
 113                 }
 114             }
 115         }
 116         return false;
 117     }
 118 #endif
 119
 120 /************** interpolation formulas and tables ***************/
 121
 122 const int INTER_RESIZE_COEF_BITS=11;
 123 const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
 124
 125 const int INTER_REMAP_COEF_BITS=15;
 126 const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
 127
 128 static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
 129
 130 static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
 131 static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
 132
 133 #if CV_SSE2
 134 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
 135 static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
 136 #endif
 137
 138 static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
 139 static short BicubicTab_i[INTER_TAB_SIZE2][4][4];
 140
 141 static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
 142 static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
 143
 144 static inline void interpolateLinear( float x, float* coeffs )
 145 {
 146     coeffs[0] = 1.f - x;
 147     coeffs[1] = x;
 148 }
 149
 150 static inline void interpolateCubic( float x, float* coeffs )
 151 {
 152     const float A = -0.75f;
 153
 154     coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
 155     coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
 156     coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
 157     coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
 158 }
 159
 160 static inline void interpolateLanczos4( float x, float* coeffs )
 161 {
 162     static const double s45 = 0.70710678118654752440084436210485;
 163     static const double cs[][2]=
 164     {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
 165
 166     if( x < FLT_EPSILON )
 167     {
 168         for( int i = 0; i < 8; i++ )
 169             coeffs[i] = 0;
 170         coeffs[3] = 1;
 171         return;
 172     }
 173
 174     float sum = 0;
 175     double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
 176     for(int i = 0; i < 8; i++ )
 177     {
 178         double y = -(x+3-i)*CV_PI*0.25;
 179         coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
 180         sum += coeffs[i];
 181     }
 182
 183     sum = 1.f/sum;
 184     for(int i = 0; i < 8; i++ )
 185         coeffs[i] *= sum;
 186 }
 187
 188 static void initInterTab1D(int method, float* tab, int tabsz)
 189 {
 190     float scale = 1.f/tabsz;
 191     if( method == INTER_LINEAR )
 192     {
 193         for( int i = 0; i < tabsz; i++, tab += 2 )
 194             interpolateLinear( i*scale, tab );
 195     }
 196     else if( method == INTER_CUBIC )
 197     {
 198         for( int i = 0; i < tabsz; i++, tab += 4 )
 199             interpolateCubic( i*scale, tab );
 200     }
 201     else if( method == INTER_LANCZOS4 )
 202     {
 203         for( int i = 0; i < tabsz; i++, tab += 8 )
 204             interpolateLanczos4( i*scale, tab );
 205     }
 206     else
 207         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
 208 }
 209
 210
 211 static const void* initInterTab2D( int method, bool fixpt )
 212 {
 213     static bool inittab[INTER_MAX+1] = {false};
 214     float* tab = 0;
 215     short* itab = 0;
 216     int ksize = 0;
 217     if( method == INTER_LINEAR )
 218         tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
 219     else if( method == INTER_CUBIC )
 220         tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
 221     else if( method == INTER_LANCZOS4 )
 222         tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
 223     else
 224         CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
 225
 226     if( !inittab[method] )
 227     {
 228         AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
 229         int i, j, k1, k2;
 230         initInterTab1D(method, _tab, INTER_TAB_SIZE);
 231         for( i = 0; i < INTER_TAB_SIZE; i++ )
 232             for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
 233             {
 234                 int isum = 0;
 235                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
 236                 NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
 237
 238                 for( k1 = 0; k1 < ksize; k1++ )
 239                 {
 240                     float vy = _tab[i*ksize + k1];
 241                     for( k2 = 0; k2 < ksize; k2++ )
 242                     {
 243                         float v = vy*_tab[j*ksize + k2];
 244                         tab[k1*ksize + k2] = v;
 245                         isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
 246                     }
 247                 }
 248
 249                 if( isum != INTER_REMAP_COEF_SCALE )
 250                 {
 251                     int diff = isum - INTER_REMAP_COEF_SCALE;
 252                     int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
 253                     for( k1 = ksize2; k1 < ksize2+2; k1++ )
 254                         for( k2 = ksize2; k2 < ksize2+2; k2++ )
 255                         {
 256                             if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
 257                                 mk1 = k1, mk2 = k2;
 258                             else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
 259                                 Mk1 = k1, Mk2 = k2;
 260                         }
 261                     if( diff < 0 )
 262                         itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
 263                     else
 264                         itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
 265                 }
 266             }
 267         tab -= INTER_TAB_SIZE2*ksize*ksize;
 268         itab -= INTER_TAB_SIZE2*ksize*ksize;
 269 #if CV_SSE2
 270         if( method == INTER_LINEAR )
 271         {
 272             for( i = 0; i < INTER_TAB_SIZE2; i++ )
 273                 for( j = 0; j < 4; j++ )
 274                 {
 275                     BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
 276                     BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
 277                     BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
 278                     BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
 279                 }
 280         }
 281 #endif
 282         inittab[method] = true;
 283     }
 284     return fixpt ? (const void*)itab : (const void*)tab;
 285 }
 286
 287 #ifndef __MINGW32__
 288 static bool initAllInterTab2D()
 289 {
 290     return  initInterTab2D( INTER_LINEAR, false ) &&
 291             initInterTab2D( INTER_LINEAR, true ) &&
 292             initInterTab2D( INTER_CUBIC, false ) &&
 293             initInterTab2D( INTER_CUBIC, true ) &&
 294             initInterTab2D( INTER_LANCZOS4, false ) &&
 295             initInterTab2D( INTER_LANCZOS4, true );
 296 }
 297
 298 static volatile bool doInitAllInterTab2D = initAllInterTab2D();
 299 #endif
 300
 301 template<typename ST, typename DT> struct Cast
 302 {
 303     typedef ST type1;
 304     typedef DT rtype;
 305
 306     DT operator()(ST val) const { return saturate_cast<DT>(val); }
 307 };
 308
 309 template<typename ST, typename DT, int bits> struct FixedPtCast
 310 {
 311     typedef ST type1;
 312     typedef DT rtype;
 313     enum { SHIFT = bits, DELTA = 1 << (bits-1) };
 314
 315     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
 316 };
 317
 318 /****************************************************************************************\
 319 *                                         Resize                                         *
 320 \****************************************************************************************/
 321
 322 class resizeNNInvoker :
 323     public ParallelLoopBody
 324 {
 325 public:
 326     resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
 327         ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
 328         ify(_ify)
 329     {
 330     }
 331
 332     virtual void operator() (const Range& range) const
 333     {
 334         Size ssize = src.size(), dsize = dst.size();
 335         int y, x, pix_size = (int)src.elemSize();
 336
 337         for( y = range.start; y < range.end; y++ )
 338         {
 339             uchar* D = dst.data + dst.step*y;
 340             int sy = std::min(cvFloor(y*ify), ssize.height-1);
 341             const uchar* S = src.data + src.step*sy;
 342
 343             switch( pix_size )
 344             {
 345             case 1:
 346                 for( x = 0; x <= dsize.width - 2; x += 2 )
 347                 {
 348                     uchar t0 = S[x_ofs[x]];
 349                     uchar t1 = S[x_ofs[x+1]];
 350                     D[x] = t0;
 351                     D[x+1] = t1;
 352                 }
 353
 354                 for( ; x < dsize.width; x++ )
 355                     D[x] = S[x_ofs[x]];
 356                 break;
 357             case 2:
 358                 for( x = 0; x < dsize.width; x++ )
 359                     *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
 360                 break;
 361             case 3:
 362                 for( x = 0; x < dsize.width; x++, D += 3 )
 363                 {
 364                     const uchar* _tS = S + x_ofs[x];
 365                     D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
 366                 }
 367                 break;
 368             case 4:
 369                 for( x = 0; x < dsize.width; x++ )
 370                     *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
 371                 break;
 372             case 6:
 373                 for( x = 0; x < dsize.width; x++, D += 6 )
 374                 {
 375                     const ushort* _tS = (const ushort*)(S + x_ofs[x]);
 376                     ushort* _tD = (ushort*)D;
 377                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
 378                 }
 379                 break;
 380             case 8:
 381                 for( x = 0; x < dsize.width; x++, D += 8 )
 382                 {
 383                     const int* _tS = (const int*)(S + x_ofs[x]);
 384                     int* _tD = (int*)D;
 385                     _tD[0] = _tS[0]; _tD[1] = _tS[1];
 386                 }
 387                 break;
 388             case 12:
 389                 for( x = 0; x < dsize.width; x++, D += 12 )
 390                 {
 391                     const int* _tS = (const int*)(S + x_ofs[x]);
 392                     int* _tD = (int*)D;
 393                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
 394                 }
 395                 break;
 396             default:
 397                 for( x = 0; x < dsize.width; x++, D += pix_size )
 398                 {
 399                     const int* _tS = (const int*)(S + x_ofs[x]);
 400                     int* _tD = (int*)D;
 401                     for( int k = 0; k < pix_size4; k++ )
 402                         _tD[k] = _tS[k];
 403                 }
 404             }
 405         }
 406     }
 407
 408 private:
 409     const Mat src;
 410     Mat dst;
 411     int* x_ofs, pix_size4;
 412     double ify;
 413
 414     resizeNNInvoker(const resizeNNInvoker&);
 415     resizeNNInvoker& operator=(const resizeNNInvoker&);
 416 };
 417
 418 static void
 419 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
 420 {
 421     Size ssize = src.size(), dsize = dst.size();
 422     AutoBuffer<int> _x_ofs(dsize.width);
 423     int* x_ofs = _x_ofs;
 424     int pix_size = (int)src.elemSize();
 425     int pix_size4 = (int)(pix_size / sizeof(int));
 426     double ifx = 1./fx, ify = 1./fy;
 427     int x;
 428
 429     for( x = 0; x < dsize.width; x++ )
 430     {
 431         int sx = cvFloor(x*ifx);
 432         x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
 433     }
 434
 435     Range range(0, dsize.height);
 436     resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
 437     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 438 }
 439
 440
 441 struct VResizeNoVec
 442 {
 443     int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
 444 };
 445
 446 struct HResizeNoVec
 447 {
 448     int operator()(const uchar**, uchar**, int, const int*,
 449         const uchar*, int, int, int, int, int) const { return 0; }
 450 };
 451
 452 #if CV_SSE2
 453
 454 struct VResizeLinearVec_32s8u
 455 {
 456     int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
 457     {
 458         if( !checkHardwareSupport(CV_CPU_SSE2) )
 459             return 0;
 460
 461         const int** src = (const int**)_src;
 462         const short* beta = (const short*)_beta;
 463         const int *S0 = src[0], *S1 = src[1];
 464         int x = 0;
 465         __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
 466         __m128i delta = _mm_set1_epi16(2);
 467
 468         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 469             for( ; x <= width - 16; x += 16 )
 470             {
 471                 __m128i x0, x1, x2, y0, y1, y2;
 472                 x0 = _mm_load_si128((const __m128i*)(S0 + x));
 473                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
 474                 y0 = _mm_load_si128((const __m128i*)(S1 + x));
 475                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
 476                 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
 477                 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
 478
 479                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
 480                 x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
 481                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
 482                 y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
 483                 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
 484                 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
 485
 486                 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
 487                 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
 488
 489                 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 490                 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
 491                 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
 492             }
 493         else
 494             for( ; x <= width - 16; x += 16 )
 495             {
 496                 __m128i x0, x1, x2, y0, y1, y2;
 497                 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
 498                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
 499                 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
 500                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
 501                 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
 502                 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
 503
 504                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
 505                 x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
 506                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
 507                 y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
 508                 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
 509                 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
 510
 511                 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
 512                 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
 513
 514                 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 515                 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
 516                 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
 517             }
 518
 519         for( ; x < width - 4; x += 4 )
 520         {
 521             __m128i x0, y0;
 522             x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
 523             y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
 524             x0 = _mm_packs_epi32(x0, x0);
 525             y0 = _mm_packs_epi32(y0, y0);
 526             x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
 527             x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
 528             x0 = _mm_packus_epi16(x0, x0);
 529             *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
 530         }
 531
 532         return x;
 533     }
 534 };
 535
 536
 537 template<int shiftval> struct VResizeLinearVec_32f16
 538 {
 539     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 540     {
 541         if( !checkHardwareSupport(CV_CPU_SSE2) )
 542             return 0;
 543
 544         const float** src = (const float**)_src;
 545         const float* beta = (const float*)_beta;
 546         const float *S0 = src[0], *S1 = src[1];
 547         ushort* dst = (ushort*)_dst;
 548         int x = 0;
 549
 550         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
 551         __m128i preshift = _mm_set1_epi32(shiftval);
 552         __m128i postshift = _mm_set1_epi16((short)shiftval);
 553
 554         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 555             for( ; x <= width - 16; x += 16 )
 556             {
 557                 __m128 x0, x1, y0, y1;
 558                 __m128i t0, t1, t2;
 559                 x0 = _mm_load_ps(S0 + x);
 560                 x1 = _mm_load_ps(S0 + x + 4);
 561                 y0 = _mm_load_ps(S1 + x);
 562                 y1 = _mm_load_ps(S1 + x + 4);
 563
 564                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 565                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 566                 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 567                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 568                 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
 569
 570                 x0 = _mm_load_ps(S0 + x + 8);
 571                 x1 = _mm_load_ps(S0 + x + 12);
 572                 y0 = _mm_load_ps(S1 + x + 8);
 573                 y1 = _mm_load_ps(S1 + x + 12);
 574
 575                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 576                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 577                 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 578                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 579                 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
 580
 581                 _mm_storeu_si128( (__m128i*)(dst + x), t0);
 582                 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
 583             }
 584         else
 585             for( ; x <= width - 16; x += 16 )
 586             {
 587                 __m128 x0, x1, y0, y1;
 588                 __m128i t0, t1, t2;
 589                 x0 = _mm_loadu_ps(S0 + x);
 590                 x1 = _mm_loadu_ps(S0 + x + 4);
 591                 y0 = _mm_loadu_ps(S1 + x);
 592                 y1 = _mm_loadu_ps(S1 + x + 4);
 593
 594                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 595                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 596                 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 597                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 598                 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
 599
 600                 x0 = _mm_loadu_ps(S0 + x + 8);
 601                 x1 = _mm_loadu_ps(S0 + x + 12);
 602                 y0 = _mm_loadu_ps(S1 + x + 8);
 603                 y1 = _mm_loadu_ps(S1 + x + 12);
 604
 605                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 606                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 607                 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 608                 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
 609                 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
 610
 611                 _mm_storeu_si128( (__m128i*)(dst + x), t0);
 612                 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
 613             }
 614
 615         for( ; x < width - 4; x += 4 )
 616         {
 617             __m128 x0, y0;
 618             __m128i t0;
 619             x0 = _mm_loadu_ps(S0 + x);
 620             y0 = _mm_loadu_ps(S1 + x);
 621
 622             x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 623             t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
 624             t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
 625             _mm_storel_epi64( (__m128i*)(dst + x), t0);
 626         }
 627
 628         return x;
 629     }
 630 };
 631
 632 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
 633 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
 634
 635 struct VResizeLinearVec_32f
 636 {
 637     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 638     {
 639         if( !checkHardwareSupport(CV_CPU_SSE) )
 640             return 0;
 641
 642         const float** src = (const float**)_src;
 643         const float* beta = (const float*)_beta;
 644         const float *S0 = src[0], *S1 = src[1];
 645         float* dst = (float*)_dst;
 646         int x = 0;
 647
 648         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
 649
 650         if( (((size_t)S0|(size_t)S1)&15) == 0 )
 651             for( ; x <= width - 8; x += 8 )
 652             {
 653                 __m128 x0, x1, y0, y1;
 654                 x0 = _mm_load_ps(S0 + x);
 655                 x1 = _mm_load_ps(S0 + x + 4);
 656                 y0 = _mm_load_ps(S1 + x);
 657                 y1 = _mm_load_ps(S1 + x + 4);
 658
 659                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 660                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 661
 662                 _mm_storeu_ps( dst + x, x0);
 663                 _mm_storeu_ps( dst + x + 4, x1);
 664             }
 665         else
 666             for( ; x <= width - 8; x += 8 )
 667             {
 668                 __m128 x0, x1, y0, y1;
 669                 x0 = _mm_loadu_ps(S0 + x);
 670                 x1 = _mm_loadu_ps(S0 + x + 4);
 671                 y0 = _mm_loadu_ps(S1 + x);
 672                 y1 = _mm_loadu_ps(S1 + x + 4);
 673
 674                 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
 675                 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
 676
 677                 _mm_storeu_ps( dst + x, x0);
 678                 _mm_storeu_ps( dst + x + 4, x1);
 679             }
 680
 681         return x;
 682     }
 683 };
 684
 685
 686 struct VResizeCubicVec_32s8u
 687 {
 688     int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
 689     {
 690         if( !checkHardwareSupport(CV_CPU_SSE2) )
 691             return 0;
 692
 693         const int** src = (const int**)_src;
 694         const short* beta = (const short*)_beta;
 695         const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 696         int x = 0;
 697         float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
 698         __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
 699             b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
 700
 701         if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
 702             for( ; x <= width - 8; x += 8 )
 703             {
 704                 __m128i x0, x1, y0, y1;
 705                 __m128 s0, s1, f0, f1;
 706                 x0 = _mm_load_si128((const __m128i*)(S0 + x));
 707                 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
 708                 y0 = _mm_load_si128((const __m128i*)(S1 + x));
 709                 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
 710
 711                 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
 712                 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
 713                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
 714                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
 715                 s0 = _mm_add_ps(s0, f0);
 716                 s1 = _mm_add_ps(s1, f1);
 717
 718                 x0 = _mm_load_si128((const __m128i*)(S2 + x));
 719                 x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
 720                 y0 = _mm_load_si128((const __m128i*)(S3 + x));
 721                 y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
 722
 723                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
 724                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
 725                 s0 = _mm_add_ps(s0, f0);
 726                 s1 = _mm_add_ps(s1, f1);
 727                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
 728                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
 729                 s0 = _mm_add_ps(s0, f0);
 730                 s1 = _mm_add_ps(s1, f1);
 731
 732                 x0 = _mm_cvtps_epi32(s0);
 733                 x1 = _mm_cvtps_epi32(s1);
 734
 735                 x0 = _mm_packs_epi32(x0, x1);
 736                 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
 737             }
 738         else
 739             for( ; x <= width - 8; x += 8 )
 740             {
 741                 __m128i x0, x1, y0, y1;
 742                 __m128 s0, s1, f0, f1;
 743                 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
 744                 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
 745                 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
 746                 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
 747
 748                 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
 749                 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
 750                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
 751                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
 752                 s0 = _mm_add_ps(s0, f0);
 753                 s1 = _mm_add_ps(s1, f1);
 754
 755                 x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
 756                 x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
 757                 y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
 758                 y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
 759
 760                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
 761                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
 762                 s0 = _mm_add_ps(s0, f0);
 763                 s1 = _mm_add_ps(s1, f1);
 764                 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
 765                 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
 766                 s0 = _mm_add_ps(s0, f0);
 767                 s1 = _mm_add_ps(s1, f1);
 768
 769                 x0 = _mm_cvtps_epi32(s0);
 770                 x1 = _mm_cvtps_epi32(s1);
 771
 772                 x0 = _mm_packs_epi32(x0, x1);
 773                 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
 774             }
 775
 776         return x;
 777     }
 778 };
 779
 780
 781 template<int shiftval> struct VResizeCubicVec_32f16
 782 {
 783     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 784     {
 785         if( !checkHardwareSupport(CV_CPU_SSE2) )
 786             return 0;
 787
 788         const float** src = (const float**)_src;
 789         const float* beta = (const float*)_beta;
 790         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 791         ushort* dst = (ushort*)_dst;
 792         int x = 0;
 793         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
 794             b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
 795         __m128i preshift = _mm_set1_epi32(shiftval);
 796         __m128i postshift = _mm_set1_epi16((short)shiftval);
 797
 798         for( ; x <= width - 8; x += 8 )
 799         {
 800             __m128 x0, x1, y0, y1, s0, s1;
 801             __m128i t0, t1;
 802             x0 = _mm_loadu_ps(S0 + x);
 803             x1 = _mm_loadu_ps(S0 + x + 4);
 804             y0 = _mm_loadu_ps(S1 + x);
 805             y1 = _mm_loadu_ps(S1 + x + 4);
 806
 807             s0 = _mm_mul_ps(x0, b0);
 808             s1 = _mm_mul_ps(x1, b0);
 809             y0 = _mm_mul_ps(y0, b1);
 810             y1 = _mm_mul_ps(y1, b1);
 811             s0 = _mm_add_ps(s0, y0);
 812             s1 = _mm_add_ps(s1, y1);
 813
 814             x0 = _mm_loadu_ps(S2 + x);
 815             x1 = _mm_loadu_ps(S2 + x + 4);
 816             y0 = _mm_loadu_ps(S3 + x);
 817             y1 = _mm_loadu_ps(S3 + x + 4);
 818
 819             x0 = _mm_mul_ps(x0, b2);
 820             x1 = _mm_mul_ps(x1, b2);
 821             y0 = _mm_mul_ps(y0, b3);
 822             y1 = _mm_mul_ps(y1, b3);
 823             s0 = _mm_add_ps(s0, x0);
 824             s1 = _mm_add_ps(s1, x1);
 825             s0 = _mm_add_ps(s0, y0);
 826             s1 = _mm_add_ps(s1, y1);
 827
 828             t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
 829             t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
 830
 831             t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
 832             _mm_storeu_si128( (__m128i*)(dst + x), t0);
 833         }
 834
 835         return x;
 836     }
 837 };
 838
 839 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
 840 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
 841
 842 struct VResizeCubicVec_32f
 843 {
 844     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
 845     {
 846         if( !checkHardwareSupport(CV_CPU_SSE) )
 847             return 0;
 848
 849         const float** src = (const float**)_src;
 850         const float* beta = (const float*)_beta;
 851         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
 852         float* dst = (float*)_dst;
 853         int x = 0;
 854         __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
 855             b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
 856
 857         for( ; x <= width - 8; x += 8 )
 858         {
 859             __m128 x0, x1, y0, y1, s0, s1;
 860             x0 = _mm_loadu_ps(S0 + x);
 861             x1 = _mm_loadu_ps(S0 + x + 4);
 862             y0 = _mm_loadu_ps(S1 + x);
 863             y1 = _mm_loadu_ps(S1 + x + 4);
 864
 865             s0 = _mm_mul_ps(x0, b0);
 866             s1 = _mm_mul_ps(x1, b0);
 867             y0 = _mm_mul_ps(y0, b1);
 868             y1 = _mm_mul_ps(y1, b1);
 869             s0 = _mm_add_ps(s0, y0);
 870             s1 = _mm_add_ps(s1, y1);
 871
 872             x0 = _mm_loadu_ps(S2 + x);
 873             x1 = _mm_loadu_ps(S2 + x + 4);
 874             y0 = _mm_loadu_ps(S3 + x);
 875             y1 = _mm_loadu_ps(S3 + x + 4);
 876
 877             x0 = _mm_mul_ps(x0, b2);
 878             x1 = _mm_mul_ps(x1, b2);
 879             y0 = _mm_mul_ps(y0, b3);
 880             y1 = _mm_mul_ps(y1, b3);
 881             s0 = _mm_add_ps(s0, x0);
 882             s1 = _mm_add_ps(s1, x1);
 883             s0 = _mm_add_ps(s0, y0);
 884             s1 = _mm_add_ps(s1, y1);
 885
 886             _mm_storeu_ps( dst + x, s0);
 887             _mm_storeu_ps( dst + x + 4, s1);
 888         }
 889
 890         return x;
 891     }
 892 };
 893
 894 #else
 895
 896 typedef VResizeNoVec VResizeLinearVec_32s8u;
 897 typedef VResizeNoVec VResizeLinearVec_32f16u;
 898 typedef VResizeNoVec VResizeLinearVec_32f16s;
 899 typedef VResizeNoVec VResizeLinearVec_32f;
 900
 901 typedef VResizeNoVec VResizeCubicVec_32s8u;
 902 typedef VResizeNoVec VResizeCubicVec_32f16u;
 903 typedef VResizeNoVec VResizeCubicVec_32f16s;
 904 typedef VResizeNoVec VResizeCubicVec_32f;
 905
 906 #endif
 907
 908 typedef HResizeNoVec HResizeLinearVec_8u32s;
 909 typedef HResizeNoVec HResizeLinearVec_16u32f;
 910 typedef HResizeNoVec HResizeLinearVec_16s32f;
 911 typedef HResizeNoVec HResizeLinearVec_32f;
 912 typedef HResizeNoVec HResizeLinearVec_64f;
 913
 914
 915 template<typename T, typename WT, typename AT, int ONE, class VecOp>
 916 struct HResizeLinear
 917 {
 918     typedef T value_type;
 919     typedef WT buf_type;
 920     typedef AT alpha_type;
 921
 922     void operator()(const T** src, WT** dst, int count,
 923                     const int* xofs, const AT* alpha,
 924                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
 925     {
 926         int dx, k;
 927         VecOp vecOp;
 928
 929         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
 930             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
 931
 932         for( k = 0; k <= count - 2; k++ )
 933         {
 934             const T *S0 = src[k], *S1 = src[k+1];
 935             WT *D0 = dst[k], *D1 = dst[k+1];
 936             for( dx = dx0; dx < xmax; dx++ )
 937             {
 938                 int sx = xofs[dx];
 939                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
 940                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
 941                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
 942                 D0[dx] = t0; D1[dx] = t1;
 943             }
 944
 945             for( ; dx < dwidth; dx++ )
 946             {
 947                 int sx = xofs[dx];
 948                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
 949             }
 950         }
 951
 952         for( ; k < count; k++ )
 953         {
 954             const T *S = src[k];
 955             WT *D = dst[k];
 956             for( dx = 0; dx < xmax; dx++ )
 957             {
 958                 int sx = xofs[dx];
 959                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
 960             }
 961
 962             for( ; dx < dwidth; dx++ )
 963                 D[dx] = WT(S[xofs[dx]]*ONE);
 964         }
 965     }
 966 };
 967
 968
 969 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
 970 struct VResizeLinear
 971 {
 972     typedef T value_type;
 973     typedef WT buf_type;
 974     typedef AT alpha_type;
 975
 976     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
 977     {
 978         WT b0 = beta[0], b1 = beta[1];
 979         const WT *S0 = src[0], *S1 = src[1];
 980         CastOp castOp;
 981         VecOp vecOp;
 982
 983         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
 984         #if CV_ENABLE_UNROLLED
 985         for( ; x <= width - 4; x += 4 )
 986         {
 987             WT t0, t1;
 988             t0 = S0[x]*b0 + S1[x]*b1;
 989             t1 = S0[x+1]*b0 + S1[x+1]*b1;
 990             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
 991             t0 = S0[x+2]*b0 + S1[x+2]*b1;
 992             t1 = S0[x+3]*b0 + S1[x+3]*b1;
 993             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
 994         }
 995         #endif
 996         for( ; x < width; x++ )
 997             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
 998     }
 999 };
1000
1001 template<>
1002 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1003 {
1004     typedef uchar value_type;
1005     typedef int buf_type;
1006     typedef short alpha_type;
1007
1008     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1009     {
1010         alpha_type b0 = beta[0], b1 = beta[1];
1011         const buf_type *S0 = src[0], *S1 = src[1];
1012         VResizeLinearVec_32s8u vecOp;
1013
1014         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1015         #if CV_ENABLE_UNROLLED
1016         for( ; x <= width - 4; x += 4 )
1017         {
1018             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1019             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1020             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1021             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1022         }
1023         #endif
1024         for( ; x < width; x++ )
1025             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
1026     }
1027 };
1028
1029
1030 template<typename T, typename WT, typename AT>
1031 struct HResizeCubic
1032 {
1033     typedef T value_type;
1034     typedef WT buf_type;
1035     typedef AT alpha_type;
1036
1037     void operator()(const T** src, WT** dst, int count,
1038                     const int* xofs, const AT* alpha,
1039                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1040     {
1041         for( int k = 0; k < count; k++ )
1042         {
1043             const T *S = src[k];
1044             WT *D = dst[k];
1045             int dx = 0, limit = xmin;
1046             for(;;)
1047             {
1048                 for( ; dx < limit; dx++, alpha += 4 )
1049                 {
1050                     int j, sx = xofs[dx] - cn;
1051                     WT v = 0;
1052                     for( j = 0; j < 4; j++ )
1053                     {
1054                         int sxj = sx + j*cn;
1055                         if( (unsigned)sxj >= (unsigned)swidth )
1056                         {
1057                             while( sxj < 0 )
1058                                 sxj += cn;
1059                             while( sxj >= swidth )
1060                                 sxj -= cn;
1061                         }
1062                         v += S[sxj]*alpha[j];
1063                     }
1064                     D[dx] = v;
1065                 }
1066                 if( limit == dwidth )
1067                     break;
1068                 for( ; dx < xmax; dx++, alpha += 4 )
1069                 {
1070                     int sx = xofs[dx];
1071                     D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1072                         S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
1073                 }
1074                 limit = dwidth;
1075             }
1076             alpha -= dwidth*4;
1077         }
1078     }
1079 };
1080
1081
1082 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1083 struct VResizeCubic
1084 {
1085     typedef T value_type;
1086     typedef WT buf_type;
1087     typedef AT alpha_type;
1088
1089     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1090     {
1091         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1092         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1093         CastOp castOp;
1094         VecOp vecOp;
1095
1096         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1097         for( ; x < width; x++ )
1098             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
1099     }
1100 };
1101
1102
1103 template<typename T, typename WT, typename AT>
1104 struct HResizeLanczos4
1105 {
1106     typedef T value_type;
1107     typedef WT buf_type;
1108     typedef AT alpha_type;
1109
1110     void operator()(const T** src, WT** dst, int count,
1111                     const int* xofs, const AT* alpha,
1112                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1113     {
1114         for( int k = 0; k < count; k++ )
1115         {
1116             const T *S = src[k];
1117             WT *D = dst[k];
1118             int dx = 0, limit = xmin;
1119             for(;;)
1120             {
1121                 for( ; dx < limit; dx++, alpha += 8 )
1122                 {
1123                     int j, sx = xofs[dx] - cn*3;
1124                     WT v = 0;
1125                     for( j = 0; j < 8; j++ )
1126                     {
1127                         int sxj = sx + j*cn;
1128                         if( (unsigned)sxj >= (unsigned)swidth )
1129                         {
1130                             while( sxj < 0 )
1131                                 sxj += cn;
1132                             while( sxj >= swidth )
1133                                 sxj -= cn;
1134                         }
1135                         v += S[sxj]*alpha[j];
1136                     }
1137                     D[dx] = v;
1138                 }
1139                 if( limit == dwidth )
1140                     break;
1141                 for( ; dx < xmax; dx++, alpha += 8 )
1142                 {
1143                     int sx = xofs[dx];
1144                     D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
1145                         S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
1146                         S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
1147                         S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
1148                 }
1149                 limit = dwidth;
1150             }
1151             alpha -= dwidth*8;
1152         }
1153     }
1154 };
1155
1156
1157 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1158 struct VResizeLanczos4
1159 {
1160     typedef T value_type;
1161     typedef WT buf_type;
1162     typedef AT alpha_type;
1163
1164     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1165     {
1166         CastOp castOp;
1167         VecOp vecOp;
1168         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1169         #if CV_ENABLE_UNROLLED
1170         for( ; x <= width - 4; x += 4 )
1171         {
1172             WT b = beta[0];
1173             const WT* S = src[0];
1174             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1175
1176             for( k = 1; k < 8; k++ )
1177             {
1178                 b = beta[k]; S = src[k];
1179                 s0 += S[x]*b; s1 += S[x+1]*b;
1180                 s2 += S[x+2]*b; s3 += S[x+3]*b;
1181             }
1182
1183             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1184             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1185         }
1186         #endif
1187         for( ; x < width; x++ )
1188         {
1189             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1190                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1191                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
1192         }
1193     }
1194 };
1195
1196
1197 static inline int clip(int x, int a, int b)
1198 {
1199     return x >= a ? (x < b ? x : b-1) : a;
1200 }
1201
1202 static const int MAX_ESIZE=16;
1203
1204 template <typename HResize, typename VResize>
1205 class resizeGeneric_Invoker :
1206     public ParallelLoopBody
1207 {
1208 public:
1209     typedef typename HResize::value_type T;
1210     typedef typename HResize::buf_type WT;
1211     typedef typename HResize::alpha_type AT;
1212
1213     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1214         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1215         int _ksize, int _xmin, int _xmax) :
1216         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1217         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1218         ksize(_ksize), xmin(_xmin), xmax(_xmax)
1219     {
1220     }
1221
1222     virtual void operator() (const Range& range) const
1223     {
1224         int dy, cn = src.channels();
1225         HResize hresize;
1226         VResize vresize;
1227
1228         int bufstep = (int)alignSize(dsize.width, 16);
1229         AutoBuffer<WT> _buffer(bufstep*ksize);
1230         const T* srows[MAX_ESIZE]={0};
1231         WT* rows[MAX_ESIZE]={0};
1232         int prev_sy[MAX_ESIZE];
1233
1234         for(int k = 0; k < ksize; k++ )
1235         {
1236             prev_sy[k] = -1;
1237             rows[k] = (WT*)_buffer + bufstep*k;
1238         }
1239
1240         const AT* beta = _beta + ksize * range.start;
1241
1242         for( dy = range.start; dy < range.end; dy++, beta += ksize )
1243         {
1244             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1245
1246             for(int k = 0; k < ksize; k++ )
1247             {
1248                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1249                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1250                 {
1251                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1252                     {
1253                         if( k1 > k )
1254                             memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1255                         break;
1256                     }
1257                 }
1258                 if( k1 == ksize )
1259                     k0 = std::min(k0, k); // remember the first row that needs to be computed
1260                 srows[k] = (T*)(src.data + src.step*sy);
1261                 prev_sy[k] = sy;
1262             }
1263
1264             if( k0 < ksize )
1265                 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1266                         ssize.width, dsize.width, cn, xmin, xmax );
1267             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1268         }
1269     }
1270
1271 private:
1272     Mat src;
1273     Mat dst;
1274     const int* xofs, *yofs;
1275     const AT* alpha, *_beta;
1276     Size ssize, dsize;
1277     int ksize, xmin, xmax;
1278 };
1279
1280 template<class HResize, class VResize>
1281 static void resizeGeneric_( const Mat& src, Mat& dst,
1282                             const int* xofs, const void* _alpha,
1283                             const int* yofs, const void* _beta,
1284                             int xmin, int xmax, int ksize )
1285 {
1286     typedef typename HResize::alpha_type AT;
1287
1288     const AT* beta = (const AT*)_beta;
1289     Size ssize = src.size(), dsize = dst.size();
1290     int cn = src.channels();
1291     ssize.width *= cn;
1292     dsize.width *= cn;
1293     xmin *= cn;
1294     xmax *= cn;
1295     // image resize is a separable operation. In case of not too strong
1296
1297     Range range(0, dsize.height);
1298     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1299         ssize, dsize, ksize, xmin, xmax);
1300     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1301 }
1302
1303 template <typename T, typename WT>
1304 struct ResizeAreaFastNoVec
1305 {
1306     ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
1307         int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
1308     int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
1309 };
1310
1311 template<typename T>
1312 struct ResizeAreaFastVec
1313 {
1314     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
1315         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
1316     {
1317         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
1318     }
1319
1320     int operator() (const T* S, T* D, int w) const
1321     {
1322         if( !fast_mode )
1323             return 0;
1324
1325         const T* nextS = (const T*)((const uchar*)S + step);
1326         int dx = 0;
1327
1328         if (cn == 1)
1329             for( ; dx < w; ++dx )
1330             {
1331                 int index = dx*2;
1332                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
1333             }
1334         else if (cn == 3)
1335             for( ; dx < w; dx += 3 )
1336             {
1337                 int index = dx*2;
1338                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
1339                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
1340                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
1341             }
1342         else
1343             {
1344                 assert(cn == 4);
1345                 for( ; dx < w; dx += 4 )
1346                 {
1347                     int index = dx*2;
1348                     D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
1349                     D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
1350                     D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
1351                     D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
1352                 }
1353             }
1354
1355         return dx;
1356     }
1357
1358 private:
1359     int scale_x, scale_y;
1360     int cn;
1361     bool fast_mode;
1362     int step;
1363 };
1364
1365 template <typename T, typename WT, typename VecOp>
1366 class resizeAreaFast_Invoker :
1367     public ParallelLoopBody
1368 {
1369 public:
1370     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
1371         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
1372         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
1373         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
1374     {
1375     }
1376
1377     virtual void operator() (const Range& range) const
1378     {
1379         Size ssize = src.size(), dsize = dst.size();
1380         int cn = src.channels();
1381         int area = scale_x*scale_y;
1382         float scale = 1.f/(area);
1383         int dwidth1 = (ssize.width/scale_x)*cn;
1384         dsize.width *= cn;
1385         ssize.width *= cn;
1386         int dy, dx, k = 0;
1387
1388         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
1389
1390         for( dy = range.start; dy < range.end; dy++ )
1391         {
1392             T* D = (T*)(dst.data + dst.step*dy);
1393             int sy0 = dy*scale_y;
1394             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
1395
1396             if( sy0 >= ssize.height )
1397             {
1398                 for( dx = 0; dx < dsize.width; dx++ )
1399                     D[dx] = 0;
1400                 continue;
1401             }
1402
1403             dx = vop((const T*)(src.data + src.step * sy0), D, w);
1404             for( ; dx < w; dx++ )
1405             {
1406                 const T* S = (const T*)(src.data + src.step * sy0) + xofs[dx];
1407                 WT sum = 0;
1408                 k = 0;
1409                 #if CV_ENABLE_UNROLLED
1410                 for( ; k <= area - 4; k += 4 )
1411                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
1412                 #endif
1413                 for( ; k < area; k++ )
1414                     sum += S[ofs[k]];
1415
1416                 D[dx] = saturate_cast<T>(sum * scale);
1417             }
1418
1419             for( ; dx < dsize.width; dx++ )
1420             {
1421                 WT sum = 0;
1422                 int count = 0, sx0 = xofs[dx];
1423                 if( sx0 >= ssize.width )
1424                     D[dx] = 0;
1425
1426                 for( int sy = 0; sy < scale_y; sy++ )
1427                 {
1428                     if( sy0 + sy >= ssize.height )
1429                         break;
1430                     const T* S = (const T*)(src.data + src.step*(sy0 + sy)) + sx0;
1431                     for( int sx = 0; sx < scale_x*cn; sx += cn )
1432                     {
1433                         if( sx0 + sx >= ssize.width )
1434                             break;
1435                         sum += S[sx];
1436                         count++;
1437                     }
1438                 }
1439
1440                 D[dx] = saturate_cast<T>((float)sum/count);
1441             }
1442         }
1443     }
1444
1445 private:
1446     Mat src;
1447     Mat dst;
1448     int scale_x, scale_y;
1449     const int *ofs, *xofs;
1450 };
1451
1452 template<typename T, typename WT, typename VecOp>
1453 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
1454                              int scale_x, int scale_y )
1455 {
1456     Range range(0, dst.rows);
1457     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
1458         scale_y, ofs, xofs);
1459     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1460 }
1461
1462 struct DecimateAlpha
1463 {
1464     int si, di;
1465     float alpha;
1466 };
1467
1468
1469 template<typename T, typename WT> class ResizeArea_Invoker :
1470     public ParallelLoopBody
1471 {
1472 public:
1473     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
1474                         const DecimateAlpha* _xtab, int _xtab_size,
1475                         const DecimateAlpha* _ytab, int _ytab_size,
1476                         const int* _tabofs )
1477     {
1478         src = &_src;
1479         dst = &_dst;
1480         xtab0 = _xtab;
1481         xtab_size0 = _xtab_size;
1482         ytab = _ytab;
1483         ytab_size = _ytab_size;
1484         tabofs = _tabofs;
1485     }
1486
1487     virtual void operator() (const Range& range) const
1488     {
1489         Size dsize = dst->size();
1490         int cn = dst->channels();
1491         dsize.width *= cn;
1492         AutoBuffer<WT> _buffer(dsize.width*2);
1493         const DecimateAlpha* xtab = xtab0;
1494         int xtab_size = xtab_size0;
1495         WT *buf = _buffer, *sum = buf + dsize.width;
1496         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
1497
1498         for( dx = 0; dx < dsize.width; dx++ )
1499             sum[dx] = (WT)0;
1500
1501         for( j = j_start; j < j_end; j++ )
1502         {
1503             WT beta = ytab[j].alpha;
1504             int dy = ytab[j].di;
1505             int sy = ytab[j].si;
1506
1507             {
1508                 const T* S = (const T*)(src->data + src->step*sy);
1509                 for( dx = 0; dx < dsize.width; dx++ )
1510                     buf[dx] = (WT)0;
1511
1512                 if( cn == 1 )
1513                     for( k = 0; k < xtab_size; k++ )
1514                     {
1515                         int dxn = xtab[k].di;
1516                         WT alpha = xtab[k].alpha;
1517                         buf[dxn] += S[xtab[k].si]*alpha;
1518                     }
1519                 else if( cn == 2 )
1520                     for( k = 0; k < xtab_size; k++ )
1521                     {
1522                         int sxn = xtab[k].si;
1523                         int dxn = xtab[k].di;
1524                         WT alpha = xtab[k].alpha;
1525                         WT t0 = buf[dxn] + S[sxn]*alpha;
1526                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
1527                         buf[dxn] = t0; buf[dxn+1] = t1;
1528                     }
1529                 else if( cn == 3 )
1530                     for( k = 0; k < xtab_size; k++ )
1531                     {
1532                         int sxn = xtab[k].si;
1533                         int dxn = xtab[k].di;
1534                         WT alpha = xtab[k].alpha;
1535                         WT t0 = buf[dxn] + S[sxn]*alpha;
1536                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
1537                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
1538                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
1539                     }
1540                 else if( cn == 4 )
1541                 {
1542                     for( k = 0; k < xtab_size; k++ )
1543                     {
1544                         int sxn = xtab[k].si;
1545                         int dxn = xtab[k].di;
1546                         WT alpha = xtab[k].alpha;
1547                         WT t0 = buf[dxn] + S[sxn]*alpha;
1548                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
1549                         buf[dxn] = t0; buf[dxn+1] = t1;
1550                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
1551                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
1552                         buf[dxn+2] = t0; buf[dxn+3] = t1;
1553                     }
1554                 }
1555                 else
1556                 {
1557                     for( k = 0; k < xtab_size; k++ )
1558                     {
1559                         int sxn = xtab[k].si;
1560                         int dxn = xtab[k].di;
1561                         WT alpha = xtab[k].alpha;
1562                         for( int c = 0; c < cn; c++ )
1563                             buf[dxn + c] += S[sxn + c]*alpha;
1564                     }
1565                 }
1566             }
1567
1568             if( dy != prev_dy )
1569             {
1570                 T* D = (T*)(dst->data + dst->step*prev_dy);
1571
1572                 for( dx = 0; dx < dsize.width; dx++ )
1573                 {
1574                     D[dx] = saturate_cast<T>(sum[dx]);
1575                     sum[dx] = beta*buf[dx];
1576                 }
1577                 prev_dy = dy;
1578             }
1579             else
1580             {
1581                 for( dx = 0; dx < dsize.width; dx++ )
1582                     sum[dx] += beta*buf[dx];
1583             }
1584         }
1585
1586         {
1587         T* D = (T*)(dst->data + dst->step*prev_dy);
1588         for( dx = 0; dx < dsize.width; dx++ )
1589             D[dx] = saturate_cast<T>(sum[dx]);
1590         }
1591     }
1592
1593 private:
1594     const Mat* src;
1595     Mat* dst;
1596     const DecimateAlpha* xtab0;
1597     const DecimateAlpha* ytab;
1598     int xtab_size0, ytab_size;
1599     const int* tabofs;
1600 };
1601
1602
1603 template <typename T, typename WT>
1604 static void resizeArea_( const Mat& src, Mat& dst,
1605                          const DecimateAlpha* xtab, int xtab_size,
1606                          const DecimateAlpha* ytab, int ytab_size,
1607                          const int* tabofs )
1608 {
1609     parallel_for_(Range(0, dst.rows),
1610                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
1611                  dst.total()/((double)(1 << 16)));
1612 }
1613
1614
1615 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
1616                             const int* xofs, const void* alpha,
1617                             const int* yofs, const void* beta,
1618                             int xmin, int xmax, int ksize );
1619
1620 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
1621                                     const int* ofs, const int *xofs,
1622                                     int scale_x, int scale_y );
1623
1624 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
1625                                 const DecimateAlpha* xtab, int xtab_size,
1626                                 const DecimateAlpha* ytab, int ytab_size,
1627                                 const int* yofs);
1628
1629
1630 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
1631 {
1632     int k = 0;
1633     for(int dx = 0; dx < dsize; dx++ )
1634     {
1635         double fsx1 = dx * scale;
1636         double fsx2 = fsx1 + scale;
1637         double cellWidth = min(scale, ssize - fsx1);
1638
1639         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
1640
1641         sx2 = std::min(sx2, ssize - 1);
1642         sx1 = std::min(sx1, sx2);
1643
1644         if( sx1 - fsx1 > 1e-3 )
1645         {
1646             assert( k < ssize*2 );
1647             tab[k].di = dx * cn;
1648             tab[k].si = (sx1 - 1) * cn;
1649             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
1650         }
1651
1652         for(int sx = sx1; sx < sx2; sx++ )
1653         {
1654             assert( k < ssize*2 );
1655             tab[k].di = dx * cn;
1656             tab[k].si = sx * cn;
1657             tab[k++].alpha = float(1.0 / cellWidth);
1658         }
1659
1660         if( fsx2 - sx2 > 1e-3 )
1661         {
1662             assert( k < ssize*2 );
1663             tab[k].di = dx * cn;
1664             tab[k].si = sx2 * cn;
1665             tab[k++].alpha = (float)(min(min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
1666         }
1667     }
1668     return k;
1669 }
1670
1671 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1672 class IPPresizeInvoker :
1673     public ParallelLoopBody
1674 {
1675 public:
1676     IPPresizeInvoker(Mat &_src, Mat &_dst, double &_inv_scale_x, double &_inv_scale_y, int _mode, ippiResizeSqrPixelFunc _func, bool *_ok) :
1677       ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), func(_func), ok(_ok)
1678       {
1679           *ok = true;
1680       }
1681
1682       virtual void operator() (const Range& range) const
1683       {
1684           int cn = src.channels();
1685           IppiRect srcroi = { 0, range.start, src.cols, range.end - range.start };
1686           int dsty = CV_IMIN(cvRound(range.start * inv_scale_y), dst.rows);
1687           int dstwidth = CV_IMIN(cvRound(src.cols * inv_scale_x), dst.cols);
1688           int dstheight = CV_IMIN(cvRound(range.end * inv_scale_y), dst.rows);
1689           IppiRect dstroi = { 0, dsty, dstwidth, dstheight - dsty };
1690           int bufsize;
1691           ippiResizeGetBufSize( srcroi, dstroi, cn, mode, &bufsize );
1692           AutoBuffer<uchar> buf(bufsize + 64);
1693           uchar* bufptr = alignPtr((uchar*)buf, 32);
1694           if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, bufptr ) < 0 )
1695               *ok = false;
1696       }
1697 private:
1698     Mat &src;
1699     Mat &dst;
1700     double inv_scale_x;
1701     double inv_scale_y;
1702     int mode;
1703     ippiResizeSqrPixelFunc func;
1704     bool *ok;
1705     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
1706 };
1707 #endif
1708
1709 }
1710
1711
1712 //////////////////////////////////////////////////////////////////////////////////////////
1713
1714 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
1715                  double inv_scale_x, double inv_scale_y, int interpolation )
1716 {
1717     static ResizeFunc linear_tab[] =
1718     {
1719         resizeGeneric_<
1720             HResizeLinear<uchar, int, short,
1721                 INTER_RESIZE_COEF_SCALE,
1722                 HResizeLinearVec_8u32s>,
1723             VResizeLinear<uchar, int, short,
1724                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
1725                 VResizeLinearVec_32s8u> >,
1726         0,
1727         resizeGeneric_<
1728             HResizeLinear<ushort, float, float, 1,
1729                 HResizeLinearVec_16u32f>,
1730             VResizeLinear<ushort, float, float, Cast<float, ushort>,
1731                 VResizeLinearVec_32f16u> >,
1732         resizeGeneric_<
1733             HResizeLinear<short, float, float, 1,
1734                 HResizeLinearVec_16s32f>,
1735             VResizeLinear<short, float, float, Cast<float, short>,
1736                 VResizeLinearVec_32f16s> >,
1737         0,
1738         resizeGeneric_<
1739             HResizeLinear<float, float, float, 1,
1740                 HResizeLinearVec_32f>,
1741             VResizeLinear<float, float, float, Cast<float, float>,
1742                 VResizeLinearVec_32f> >,
1743         resizeGeneric_<
1744             HResizeLinear<double, double, float, 1,
1745                 HResizeNoVec>,
1746             VResizeLinear<double, double, float, Cast<double, double>,
1747                 VResizeNoVec> >,
1748         0
1749     };
1750
1751     static ResizeFunc cubic_tab[] =
1752     {
1753         resizeGeneric_<
1754             HResizeCubic<uchar, int, short>,
1755             VResizeCubic<uchar, int, short,
1756                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
1757                 VResizeCubicVec_32s8u> >,
1758         0,
1759         resizeGeneric_<
1760             HResizeCubic<ushort, float, float>,
1761             VResizeCubic<ushort, float, float, Cast<float, ushort>,
1762             VResizeCubicVec_32f16u> >,
1763         resizeGeneric_<
1764             HResizeCubic<short, float, float>,
1765             VResizeCubic<short, float, float, Cast<float, short>,
1766             VResizeCubicVec_32f16s> >,
1767         0,
1768         resizeGeneric_<
1769             HResizeCubic<float, float, float>,
1770             VResizeCubic<float, float, float, Cast<float, float>,
1771             VResizeCubicVec_32f> >,
1772         resizeGeneric_<
1773             HResizeCubic<double, double, float>,
1774             VResizeCubic<double, double, float, Cast<double, double>,
1775             VResizeNoVec> >,
1776         0
1777     };
1778
1779     static ResizeFunc lanczos4_tab[] =
1780     {
1781         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
1782             VResizeLanczos4<uchar, int, short,
1783             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
1784             VResizeNoVec> >,
1785         0,
1786         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
1787             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
1788             VResizeNoVec> >,
1789         resizeGeneric_<HResizeLanczos4<short, float, float>,
1790             VResizeLanczos4<short, float, float, Cast<float, short>,
1791             VResizeNoVec> >,
1792         0,
1793         resizeGeneric_<HResizeLanczos4<float, float, float>,
1794             VResizeLanczos4<float, float, float, Cast<float, float>,
1795             VResizeNoVec> >,
1796         resizeGeneric_<HResizeLanczos4<double, double, float>,
1797             VResizeLanczos4<double, double, float, Cast<double, double>,
1798             VResizeNoVec> >,
1799         0
1800     };
1801
1802     static ResizeAreaFastFunc areafast_tab[] =
1803     {
1804         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar> >,
1805         0,
1806         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort> >,
1807         resizeAreaFast_<short, float, ResizeAreaFastVec<short> >,
1808         0,
1809         resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
1810         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
1811         0
1812     };
1813
1814     static ResizeAreaFunc area_tab[] =
1815     {
1816         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
1817         resizeArea_<short, float>, 0, resizeArea_<float, float>,
1818         resizeArea_<double, double>, 0
1819     };
1820
1821     Mat src = _src.getMat();
1822     Size ssize = src.size();
1823
1824     CV_Assert( ssize.area() > 0 );
1825     CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
1826     if( !dsize.area() )
1827     {
1828         dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
1829             saturate_cast<int>(src.rows*inv_scale_y));
1830         CV_Assert( dsize.area() );
1831     }
1832     else
1833     {
1834         inv_scale_x = (double)dsize.width/src.cols;
1835         inv_scale_y = (double)dsize.height/src.rows;
1836     }
1837     _dst.create(dsize, src.type());
1838     Mat dst = _dst.getMat();
1839
1840
1841 #ifdef HAVE_TEGRA_OPTIMIZATION
1842     if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
1843         return;
1844 #endif
1845
1846     int depth = src.depth(), cn = src.channels();
1847     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
1848     int k, sx, sy, dx, dy;
1849 /*
1850 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1851     int mode = interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 0;
1852     int type = src.type();
1853     ippiResizeSqrPixelFunc ippFunc =
1854         type == CV_8UC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C1R :
1855         type == CV_8UC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C3R :
1856         type == CV_8UC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_8u_C4R :
1857         type == CV_16UC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C1R :
1858         type == CV_16UC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C3R :
1859         type == CV_16UC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16u_C4R :
1860         type == CV_16SC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C1R :
1861         type == CV_16SC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C3R :
1862         type == CV_16SC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_16s_C4R :
1863         type == CV_32FC1 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C1R :
1864         type == CV_32FC3 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C3R :
1865         type == CV_32FC4 ? (ippiResizeSqrPixelFunc)ippiResizeSqrPixel_32f_C4R :
1866         0;
1867     if( ippFunc && mode != 0 )
1868     {
1869         bool ok;
1870         Range range(0, src.rows);
1871         IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, ippFunc, &ok);
1872         parallel_for_(range, invoker, dst.total()/(double)(1<<16));
1873         if( ok )
1874             return;
1875     }
1876 #endif
1877 */
1878     if( interpolation == INTER_NEAREST )
1879     {
1880         resizeNN( src, dst, inv_scale_x, inv_scale_y );
1881         return;
1882     }
1883
1884     {
1885         int iscale_x = saturate_cast<int>(scale_x);
1886         int iscale_y = saturate_cast<int>(scale_y);
1887
1888         bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
1889                 std::abs(scale_y - iscale_y) < DBL_EPSILON;
1890
1891         // in case of scale_x && scale_y is equal to 2
1892         // INTER_AREA (fast) also is equal to INTER_LINEAR
1893         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
1894         {
1895             interpolation = INTER_AREA;
1896         }
1897
1898         // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
1899         // In other cases it is emulated using some variant of bilinear interpolation
1900         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
1901         {
1902             if( is_area_fast )
1903             {
1904                 int area = iscale_x*iscale_y;
1905                 size_t srcstep = src.step / src.elemSize1();
1906                 AutoBuffer<int> _ofs(area + dsize.width*cn);
1907                 int* ofs = _ofs;
1908                 int* xofs = ofs + area;
1909                 ResizeAreaFastFunc func = areafast_tab[depth];
1910                 CV_Assert( func != 0 );
1911
1912                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
1913                     for( sx = 0; sx < iscale_x; sx++ )
1914                         ofs[k++] = (int)(sy*srcstep + sx*cn);
1915
1916                 for( dx = 0; dx < dsize.width; dx++ )
1917                 {
1918                     int j = dx * cn;
1919                     sx = iscale_x * j;
1920                     for( k = 0; k < cn; k++ )
1921                         xofs[j + k] = sx + k;
1922                 }
1923
1924                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
1925                 return;
1926             }
1927
1928             ResizeAreaFunc func = area_tab[depth];
1929             CV_Assert( func != 0 && cn <= 4 );
1930
1931             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
1932             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
1933
1934             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
1935             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
1936
1937             AutoBuffer<int> _tabofs(dsize.height + 1);
1938             int* tabofs = _tabofs;
1939             for( k = 0, dy = 0; k < ytab_size; k++ )
1940             {
1941                 if( k == 0 || ytab[k].di != ytab[k-1].di )
1942                 {
1943                     assert( ytab[k].di == dy );
1944                     tabofs[dy++] = k;
1945                 }
1946             }
1947             tabofs[dy] = ytab_size;
1948
1949             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
1950             return;
1951         }
1952     }
1953
1954     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
1955     bool area_mode = interpolation == INTER_AREA;
1956     bool fixpt = depth == CV_8U;
1957     float fx, fy;
1958     ResizeFunc func=0;
1959     int ksize=0, ksize2;
1960     if( interpolation == INTER_CUBIC )
1961         ksize = 4, func = cubic_tab[depth];
1962     else if( interpolation == INTER_LANCZOS4 )
1963         ksize = 8, func = lanczos4_tab[depth];
1964     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
1965         ksize = 2, func = linear_tab[depth];
1966     else
1967         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
1968     ksize2 = ksize/2;
1969
1970     CV_Assert( func != 0 );
1971
1972     AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
1973     int* xofs = (int*)(uchar*)_buffer;
1974     int* yofs = xofs + width;
1975     float* alpha = (float*)(yofs + dsize.height);
1976     short* ialpha = (short*)alpha;
1977     float* beta = alpha + width*ksize;
1978     short* ibeta = ialpha + width*ksize;
1979     float cbuf[MAX_ESIZE];
1980
1981     for( dx = 0; dx < dsize.width; dx++ )
1982     {
1983         if( !area_mode )
1984         {
1985             fx = (float)((dx+0.5)*scale_x - 0.5);
1986             sx = cvFloor(fx);
1987             fx -= sx;
1988         }
1989         else
1990         {
1991             sx = cvFloor(dx*scale_x);
1992             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
1993             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
1994         }
1995
1996         if( sx < ksize2-1 )
1997         {
1998             xmin = dx+1;
1999             if( sx < 0 )
2000                 fx = 0, sx = 0;
2001         }
2002
2003         if( sx + ksize2 >= ssize.width )
2004         {
2005             xmax = std::min( xmax, dx );
2006             if( sx >= ssize.width-1 )
2007                 fx = 0, sx = ssize.width-1;
2008         }
2009
2010         for( k = 0, sx *= cn; k < cn; k++ )
2011             xofs[dx*cn + k] = sx + k;
2012
2013         if( interpolation == INTER_CUBIC )
2014             interpolateCubic( fx, cbuf );
2015         else if( interpolation == INTER_LANCZOS4 )
2016             interpolateLanczos4( fx, cbuf );
2017         else
2018         {
2019             cbuf[0] = 1.f - fx;
2020             cbuf[1] = fx;
2021         }
2022         if( fixpt )
2023         {
2024             for( k = 0; k < ksize; k++ )
2025                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2026             for( ; k < cn*ksize; k++ )
2027                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
2028         }
2029         else
2030         {
2031             for( k = 0; k < ksize; k++ )
2032                 alpha[dx*cn*ksize + k] = cbuf[k];
2033             for( ; k < cn*ksize; k++ )
2034                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
2035         }
2036     }
2037
2038     for( dy = 0; dy < dsize.height; dy++ )
2039     {
2040         if( !area_mode )
2041         {
2042             fy = (float)((dy+0.5)*scale_y - 0.5);
2043             sy = cvFloor(fy);
2044             fy -= sy;
2045         }
2046         else
2047         {
2048             sy = cvFloor(dy*scale_y);
2049             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
2050             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
2051         }
2052
2053         yofs[dy] = sy;
2054         if( interpolation == INTER_CUBIC )
2055             interpolateCubic( fy, cbuf );
2056         else if( interpolation == INTER_LANCZOS4 )
2057             interpolateLanczos4( fy, cbuf );
2058         else
2059         {
2060             cbuf[0] = 1.f - fy;
2061             cbuf[1] = fy;
2062         }
2063
2064         if( fixpt )
2065         {
2066             for( k = 0; k < ksize; k++ )
2067                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2068         }
2069         else
2070         {
2071             for( k = 0; k < ksize; k++ )
2072                 beta[dy*ksize + k] = cbuf[k];
2073         }
2074     }
2075
2076     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
2077           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
2078 }
2079
2080
2081 /****************************************************************************************\
2082 *                       General warping (affine, perspective, remap)                     *
2083 \****************************************************************************************/
2084
2085 namespace cv
2086 {
2087
2088 template<typename T>
2089 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
2090                           int borderType, const Scalar& _borderValue )
2091 {
2092     Size ssize = _src.size(), dsize = _dst.size();
2093     int cn = _src.channels();
2094     const T* S0 = (const T*)_src.data;
2095     size_t sstep = _src.step/sizeof(S0[0]);
2096     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2097         saturate_cast<T>(_borderValue[1]),
2098         saturate_cast<T>(_borderValue[2]),
2099         saturate_cast<T>(_borderValue[3]));
2100     int dx, dy;
2101
2102     unsigned width1 = ssize.width, height1 = ssize.height;
2103
2104     if( _dst.isContinuous() && _xy.isContinuous() )
2105     {
2106         dsize.width *= dsize.height;
2107         dsize.height = 1;
2108     }
2109
2110     for( dy = 0; dy < dsize.height; dy++ )
2111     {
2112         T* D = (T*)(_dst.data + _dst.step*dy);
2113         const short* XY = (const short*)(_xy.data + _xy.step*dy);
2114
2115         if( cn == 1 )
2116         {
2117             for( dx = 0; dx < dsize.width; dx++ )
2118             {
2119                 int sx = XY[dx*2], sy = XY[dx*2+1];
2120                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
2121                     D[dx] = S0[sy*sstep + sx];
2122                 else
2123                 {
2124                     if( borderType == BORDER_REPLICATE )
2125                     {
2126                         sx = clip(sx, 0, ssize.width);
2127                         sy = clip(sy, 0, ssize.height);
2128                         D[dx] = S0[sy*sstep + sx];
2129                     }
2130                     else if( borderType == BORDER_CONSTANT )
2131                         D[dx] = cval[0];
2132                     else if( borderType != BORDER_TRANSPARENT )
2133                     {
2134                         sx = borderInterpolate(sx, ssize.width, borderType);
2135                         sy = borderInterpolate(sy, ssize.height, borderType);
2136                         D[dx] = S0[sy*sstep + sx];
2137                     }
2138                 }
2139             }
2140         }
2141         else
2142         {
2143             for( dx = 0; dx < dsize.width; dx++, D += cn )
2144             {
2145                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
2146                 const T *S;
2147                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
2148                 {
2149                     if( cn == 3 )
2150                     {
2151                         S = S0 + sy*sstep + sx*3;
2152                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
2153                     }
2154                     else if( cn == 4 )
2155                     {
2156                         S = S0 + sy*sstep + sx*4;
2157                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
2158                     }
2159                     else
2160                     {
2161                         S = S0 + sy*sstep + sx*cn;
2162                         for( k = 0; k < cn; k++ )
2163                             D[k] = S[k];
2164                     }
2165                 }
2166                 else if( borderType != BORDER_TRANSPARENT )
2167                 {
2168                     if( borderType == BORDER_REPLICATE )
2169                     {
2170                         sx = clip(sx, 0, ssize.width);
2171                         sy = clip(sy, 0, ssize.height);
2172                         S = S0 + sy*sstep + sx*cn;
2173                     }
2174                     else if( borderType == BORDER_CONSTANT )
2175                         S = &cval[0];
2176                     else
2177                     {
2178                         sx = borderInterpolate(sx, ssize.width, borderType);
2179                         sy = borderInterpolate(sy, ssize.height, borderType);
2180                         S = S0 + sy*sstep + sx*cn;
2181                     }
2182                     for( k = 0; k < cn; k++ )
2183                         D[k] = S[k];
2184                 }
2185             }
2186         }
2187     }
2188 }
2189
2190
2191 struct RemapNoVec
2192 {
2193     int operator()( const Mat&, void*, const short*, const ushort*,
2194                     const void*, int ) const { return 0; }
2195 };
2196
2197 #if CV_SSE2
2198
2199 struct RemapVec_8u
2200 {
2201     int operator()( const Mat& _src, void* _dst, const short* XY,
2202                     const ushort* FXY, const void* _wtab, int width ) const
2203     {
2204         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
2205
2206         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
2207                 sstep > 0x8000 )
2208             return 0;
2209
2210         const uchar *S0 = _src.data, *S1 = _src.data + _src.step;
2211         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
2212         uchar* D = (uchar*)_dst;
2213         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
2214         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
2215         __m128i z = _mm_setzero_si128();
2216         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
2217
2218         if( cn == 1 )
2219         {
2220             for( ; x <= width - 8; x += 8 )
2221             {
2222                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
2223                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
2224                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
2225                 unsigned i0, i1;
2226
2227                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
2228                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
2229                 _mm_store_si128( (__m128i*)iofs0, xy0 );
2230                 _mm_store_si128( (__m128i*)iofs1, xy1 );
2231
2232                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
2233                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
2234                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
2235                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
2236                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
2237                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
2238                 v0 = _mm_unpacklo_epi8(v0, z);
2239                 v1 = _mm_unpacklo_epi8(v1, z);
2240
2241                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
2242                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
2243                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
2244                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
2245                 b0 = _mm_unpacklo_epi64(a0, a1);
2246                 b1 = _mm_unpackhi_epi64(a0, a1);
2247                 v0 = _mm_madd_epi16(v0, b0);
2248                 v1 = _mm_madd_epi16(v1, b1);
2249                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
2250
2251                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
2252                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
2253                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
2254                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
2255                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
2256                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
2257                 v2 = _mm_unpacklo_epi8(v2, z);
2258                 v3 = _mm_unpacklo_epi8(v3, z);
2259
2260                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
2261                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
2262                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
2263                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
2264                 b0 = _mm_unpacklo_epi64(a0, a1);
2265                 b1 = _mm_unpackhi_epi64(a0, a1);
2266                 v2 = _mm_madd_epi16(v2, b0);
2267                 v3 = _mm_madd_epi16(v3, b1);
2268                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
2269
2270                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
2271                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
2272                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
2273                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
2274             }
2275         }
2276         else if( cn == 3 )
2277         {
2278             for( ; x <= width - 5; x += 4, D += 12 )
2279             {
2280                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
2281                 __m128i u0, v0, u1, v1;
2282
2283                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
2284                 _mm_store_si128( (__m128i*)iofs0, xy0 );
2285                 const __m128i *w0, *w1;
2286                 w0 = (const __m128i*)(wtab + FXY[x]*16);
2287                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
2288
2289                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
2290                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
2291                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
2292                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
2293                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
2294                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
2295                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
2296                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
2297                 u0 = _mm_unpacklo_epi8(u0, z);
2298                 v0 = _mm_unpacklo_epi8(v0, z);
2299                 u1 = _mm_unpacklo_epi8(u1, z);
2300                 v1 = _mm_unpacklo_epi8(v1, z);
2301                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
2302                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
2303                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
2304                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
2305                 u0 = _mm_slli_si128(u0, 4);
2306                 u0 = _mm_packs_epi32(u0, u1);
2307                 u0 = _mm_packus_epi16(u0, u0);
2308                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
2309
2310                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
2311                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
2312
2313                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
2314                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
2315                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
2316                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
2317                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
2318                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
2319                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
2320                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
2321                 u0 = _mm_unpacklo_epi8(u0, z);
2322                 v0 = _mm_unpacklo_epi8(v0, z);
2323                 u1 = _mm_unpacklo_epi8(u1, z);
2324                 v1 = _mm_unpacklo_epi8(v1, z);
2325                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
2326                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
2327                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
2328                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
2329                 u0 = _mm_slli_si128(u0, 4);
2330                 u0 = _mm_packs_epi32(u0, u1);
2331                 u0 = _mm_packus_epi16(u0, u0);
2332                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
2333             }
2334         }
2335         else if( cn == 4 )
2336         {
2337             for( ; x <= width - 4; x += 4, D += 16 )
2338             {
2339                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
2340                 __m128i u0, v0, u1, v1;
2341
2342                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
2343                 _mm_store_si128( (__m128i*)iofs0, xy0 );
2344                 const __m128i *w0, *w1;
2345                 w0 = (const __m128i*)(wtab + FXY[x]*16);
2346                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
2347
2348                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
2349                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
2350                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
2351                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
2352                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
2353                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
2354                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
2355                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
2356                 u0 = _mm_unpacklo_epi8(u0, z);
2357                 v0 = _mm_unpacklo_epi8(v0, z);
2358                 u1 = _mm_unpacklo_epi8(u1, z);
2359                 v1 = _mm_unpacklo_epi8(v1, z);
2360                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
2361                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
2362                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
2363                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
2364                 u0 = _mm_packs_epi32(u0, u1);
2365                 u0 = _mm_packus_epi16(u0, u0);
2366                 _mm_storel_epi64((__m128i*)D, u0);
2367
2368                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
2369                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
2370
2371                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
2372                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
2373                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
2374                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
2375                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
2376                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
2377                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
2378                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
2379                 u0 = _mm_unpacklo_epi8(u0, z);
2380                 v0 = _mm_unpacklo_epi8(v0, z);
2381                 u1 = _mm_unpacklo_epi8(u1, z);
2382                 v1 = _mm_unpacklo_epi8(v1, z);
2383                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
2384                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
2385                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
2386                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
2387                 u0 = _mm_packs_epi32(u0, u1);
2388                 u0 = _mm_packus_epi16(u0, u0);
2389                 _mm_storel_epi64((__m128i*)(D + 8), u0);
2390             }
2391         }
2392
2393         return x;
2394     }
2395 };
2396
2397 #else
2398
2399 typedef RemapNoVec RemapVec_8u;
2400
2401 #endif
2402
2403
2404 template<class CastOp, class VecOp, typename AT>
2405 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
2406                            const Mat& _fxy, const void* _wtab,
2407                            int borderType, const Scalar& _borderValue )
2408 {
2409     typedef typename CastOp::rtype T;
2410     typedef typename CastOp::type1 WT;
2411     Size ssize = _src.size(), dsize = _dst.size();
2412     int cn = _src.channels();
2413     const AT* wtab = (const AT*)_wtab;
2414     const T* S0 = (const T*)_src.data;
2415     size_t sstep = _src.step/sizeof(S0[0]);
2416     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2417         saturate_cast<T>(_borderValue[1]),
2418         saturate_cast<T>(_borderValue[2]),
2419         saturate_cast<T>(_borderValue[3]));
2420     int dx, dy;
2421     CastOp castOp;
2422     VecOp vecOp;
2423
2424     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
2425     CV_Assert( cn <= 4 && ssize.area() > 0 );
2426 #if CV_SSE2
2427     if( _src.type() == CV_8UC3 )
2428         width1 = std::max(ssize.width-2, 0);
2429 #endif
2430
2431     for( dy = 0; dy < dsize.height; dy++ )
2432     {
2433         T* D = (T*)(_dst.data + _dst.step*dy);
2434         const short* XY = (const short*)(_xy.data + _xy.step*dy);
2435         const ushort* FXY = (const ushort*)(_fxy.data + _fxy.step*dy);
2436         int X0 = 0;
2437         bool prevInlier = false;
2438
2439         for( dx = 0; dx <= dsize.width; dx++ )
2440         {
2441             bool curInlier = dx < dsize.width ?
2442                 (unsigned)XY[dx*2] < width1 &&
2443                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
2444             if( curInlier == prevInlier )
2445                 continue;
2446
2447             int X1 = dx;
2448             dx = X0;
2449             X0 = X1;
2450             prevInlier = curInlier;
2451
2452             if( !curInlier )
2453             {
2454                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
2455                 D += len*cn;
2456                 dx += len;
2457
2458                 if( cn == 1 )
2459                 {
2460                     for( ; dx < X1; dx++, D++ )
2461                     {
2462                         int sx = XY[dx*2], sy = XY[dx*2+1];
2463                         const AT* w = wtab + FXY[dx]*4;
2464                         const T* S = S0 + sy*sstep + sx;
2465                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
2466                     }
2467                 }
2468                 else if( cn == 2 )
2469                     for( ; dx < X1; dx++, D += 2 )
2470                     {
2471                         int sx = XY[dx*2], sy = XY[dx*2+1];
2472                         const AT* w = wtab + FXY[dx]*4;
2473                         const T* S = S0 + sy*sstep + sx*2;
2474                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
2475                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
2476                         D[0] = castOp(t0); D[1] = castOp(t1);
2477                     }
2478                 else if( cn == 3 )
2479                     for( ; dx < X1; dx++, D += 3 )
2480                     {
2481                         int sx = XY[dx*2], sy = XY[dx*2+1];
2482                         const AT* w = wtab + FXY[dx]*4;
2483                         const T* S = S0 + sy*sstep + sx*3;
2484                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
2485                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
2486                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
2487                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
2488                     }
2489                 else
2490                     for( ; dx < X1; dx++, D += 4 )
2491                     {
2492                         int sx = XY[dx*2], sy = XY[dx*2+1];
2493                         const AT* w = wtab + FXY[dx]*4;
2494                         const T* S = S0 + sy*sstep + sx*4;
2495                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
2496                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
2497                         D[0] = castOp(t0); D[1] = castOp(t1);
2498                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
2499                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
2500                         D[2] = castOp(t0); D[3] = castOp(t1);
2501                     }
2502             }
2503             else
2504             {
2505                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
2506                 {
2507                     D += (X1 - dx)*cn;
2508                     dx = X1;
2509                     continue;
2510                 }
2511
2512                 if( cn == 1 )
2513                     for( ; dx < X1; dx++, D++ )
2514                     {
2515                         int sx = XY[dx*2], sy = XY[dx*2+1];
2516                         if( borderType == BORDER_CONSTANT &&
2517                             (sx >= ssize.width || sx+1 < 0 ||
2518                              sy >= ssize.height || sy+1 < 0) )
2519                         {
2520                             D[0] = cval[0];
2521                         }
2522                         else
2523                         {
2524                             int sx0, sx1, sy0, sy1;
2525                             T v0, v1, v2, v3;
2526                             const AT* w = wtab + FXY[dx]*4;
2527                             if( borderType == BORDER_REPLICATE )
2528                             {
2529                                 sx0 = clip(sx, 0, ssize.width);
2530                                 sx1 = clip(sx+1, 0, ssize.width);
2531                                 sy0 = clip(sy, 0, ssize.height);
2532                                 sy1 = clip(sy+1, 0, ssize.height);
2533                                 v0 = S0[sy0*sstep + sx0];
2534                                 v1 = S0[sy0*sstep + sx1];
2535                                 v2 = S0[sy1*sstep + sx0];
2536                                 v3 = S0[sy1*sstep + sx1];
2537                             }
2538                             else
2539                             {
2540                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
2541                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
2542                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
2543                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
2544                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
2545                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
2546                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
2547                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
2548                             }
2549                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
2550                         }
2551                     }
2552                 else
2553                     for( ; dx < X1; dx++, D += cn )
2554                     {
2555                         int sx = XY[dx*2], sy = XY[dx*2+1], k;
2556                         if( borderType == BORDER_CONSTANT &&
2557                             (sx >= ssize.width || sx+1 < 0 ||
2558                              sy >= ssize.height || sy+1 < 0) )
2559                         {
2560                             for( k = 0; k < cn; k++ )
2561                                 D[k] = cval[k];
2562                         }
2563                         else
2564                         {
2565                             int sx0, sx1, sy0, sy1;
2566                             const T *v0, *v1, *v2, *v3;
2567                             const AT* w = wtab + FXY[dx]*4;
2568                             if( borderType == BORDER_REPLICATE )
2569                             {
2570                                 sx0 = clip(sx, 0, ssize.width);
2571                                 sx1 = clip(sx+1, 0, ssize.width);
2572                                 sy0 = clip(sy, 0, ssize.height);
2573                                 sy1 = clip(sy+1, 0, ssize.height);
2574                                 v0 = S0 + sy0*sstep + sx0*cn;
2575                                 v1 = S0 + sy0*sstep + sx1*cn;
2576                                 v2 = S0 + sy1*sstep + sx0*cn;
2577                                 v3 = S0 + sy1*sstep + sx1*cn;
2578                             }
2579                             else if( borderType == BORDER_TRANSPARENT &&
2580                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
2581                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
2582                                 continue;
2583                             else
2584                             {
2585                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
2586                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
2587                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
2588                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
2589                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
2590                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
2591                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
2592                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
2593                             }
2594                             for( k = 0; k < cn; k++ )
2595                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
2596                         }
2597                     }
2598             }
2599         }
2600     }
2601 }
2602
2603
2604 template<class CastOp, typename AT, int ONE>
2605 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
2606                           const Mat& _fxy, const void* _wtab,
2607                           int borderType, const Scalar& _borderValue )
2608 {
2609     typedef typename CastOp::rtype T;
2610     typedef typename CastOp::type1 WT;
2611     Size ssize = _src.size(), dsize = _dst.size();
2612     int cn = _src.channels();
2613     const AT* wtab = (const AT*)_wtab;
2614     const T* S0 = (const T*)_src.data;
2615     size_t sstep = _src.step/sizeof(S0[0]);
2616     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2617         saturate_cast<T>(_borderValue[1]),
2618         saturate_cast<T>(_borderValue[2]),
2619         saturate_cast<T>(_borderValue[3]));
2620     int dx, dy;
2621     CastOp castOp;
2622     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
2623
2624     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
2625
2626     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
2627     {
2628         dsize.width *= dsize.height;
2629         dsize.height = 1;
2630     }
2631
2632     for( dy = 0; dy < dsize.height; dy++ )
2633     {
2634         T* D = (T*)(_dst.data + _dst.step*dy);
2635         const short* XY = (const short*)(_xy.data + _xy.step*dy);
2636         const ushort* FXY = (const ushort*)(_fxy.data + _fxy.step*dy);
2637
2638         for( dx = 0; dx < dsize.width; dx++, D += cn )
2639         {
2640             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
2641             const AT* w = wtab + FXY[dx]*16;
2642             int i, k;
2643             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
2644             {
2645                 const T* S = S0 + sy*sstep + sx*cn;
2646                 for( k = 0; k < cn; k++ )
2647                 {
2648                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
2649                     S += sstep;
2650                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
2651                     S += sstep;
2652                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
2653                     S += sstep;
2654                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
2655                     S += 1 - sstep*3;
2656                     D[k] = castOp(sum);
2657                 }
2658             }
2659             else
2660             {
2661                 int x[4], y[4];
2662                 if( borderType == BORDER_TRANSPARENT &&
2663                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
2664                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
2665                     continue;
2666
2667                 if( borderType1 == BORDER_CONSTANT &&
2668                     (sx >= ssize.width || sx+4 <= 0 ||
2669                     sy >= ssize.height || sy+4 <= 0))
2670                 {
2671                     for( k = 0; k < cn; k++ )
2672                         D[k] = cval[k];
2673                     continue;
2674                 }
2675
2676                 for( i = 0; i < 4; i++ )
2677                 {
2678                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
2679                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
2680                 }
2681
2682                 for( k = 0; k < cn; k++, S0++, w -= 16 )
2683                 {
2684                     WT cv = cval[k], sum = cv*ONE;
2685                     for( i = 0; i < 4; i++, w += 4 )
2686                     {
2687                         int yi = y[i];
2688                         const T* S = S0 + yi*sstep;
2689                         if( yi < 0 )
2690                             continue;
2691                         if( x[0] >= 0 )
2692                             sum += (S[x[0]] - cv)*w[0];
2693                         if( x[1] >= 0 )
2694                             sum += (S[x[1]] - cv)*w[1];
2695                         if( x[2] >= 0 )
2696                             sum += (S[x[2]] - cv)*w[2];
2697                         if( x[3] >= 0 )
2698                             sum += (S[x[3]] - cv)*w[3];
2699                     }
2700                     D[k] = castOp(sum);
2701                 }
2702                 S0 -= cn;
2703             }
2704         }
2705     }
2706 }
2707
2708
2709 template<class CastOp, typename AT, int ONE>
2710 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
2711                            const Mat& _fxy, const void* _wtab,
2712                            int borderType, const Scalar& _borderValue )
2713 {
2714     typedef typename CastOp::rtype T;
2715     typedef typename CastOp::type1 WT;
2716     Size ssize = _src.size(), dsize = _dst.size();
2717     int cn = _src.channels();
2718     const AT* wtab = (const AT*)_wtab;
2719     const T* S0 = (const T*)_src.data;
2720     size_t sstep = _src.step/sizeof(S0[0]);
2721     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2722         saturate_cast<T>(_borderValue[1]),
2723         saturate_cast<T>(_borderValue[2]),
2724         saturate_cast<T>(_borderValue[3]));
2725     int dx, dy;
2726     CastOp castOp;
2727     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
2728
2729     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
2730
2731     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
2732     {
2733         dsize.width *= dsize.height;
2734         dsize.height = 1;
2735     }
2736
2737     for( dy = 0; dy < dsize.height; dy++ )
2738     {
2739         T* D = (T*)(_dst.data + _dst.step*dy);
2740         const short* XY = (const short*)(_xy.data + _xy.step*dy);
2741         const ushort* FXY = (const ushort*)(_fxy.data + _fxy.step*dy);
2742
2743         for( dx = 0; dx < dsize.width; dx++, D += cn )
2744         {
2745             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
2746             const AT* w = wtab + FXY[dx]*64;
2747             const T* S = S0 + sy*sstep + sx*cn;
2748             int i, k;
2749             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
2750             {
2751                 for( k = 0; k < cn; k++ )
2752                 {
2753                     WT sum = 0;
2754                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
2755                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
2756                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
2757                     w -= 64;
2758                     S -= sstep*8 - 1;
2759                     D[k] = castOp(sum);
2760                 }
2761             }
2762             else
2763             {
2764                 int x[8], y[8];
2765                 if( borderType == BORDER_TRANSPARENT &&
2766                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
2767                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
2768                     continue;
2769
2770                 if( borderType1 == BORDER_CONSTANT &&
2771                     (sx >= ssize.width || sx+8 <= 0 ||
2772                     sy >= ssize.height || sy+8 <= 0))
2773                 {
2774                     for( k = 0; k < cn; k++ )
2775                         D[k] = cval[k];
2776                     continue;
2777                 }
2778
2779                 for( i = 0; i < 8; i++ )
2780                 {
2781                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
2782                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
2783                 }
2784
2785                 for( k = 0; k < cn; k++, S0++, w -= 64 )
2786                 {
2787                     WT cv = cval[k], sum = cv*ONE;
2788                     for( i = 0; i < 8; i++, w += 8 )
2789                     {
2790                         int yi = y[i];
2791                         const T* S1 = S0 + yi*sstep;
2792                         if( yi < 0 )
2793                             continue;
2794                         if( x[0] >= 0 )
2795                             sum += (S1[x[0]] - cv)*w[0];
2796                         if( x[1] >= 0 )
2797                             sum += (S1[x[1]] - cv)*w[1];
2798                         if( x[2] >= 0 )
2799                             sum += (S1[x[2]] - cv)*w[2];
2800                         if( x[3] >= 0 )
2801                             sum += (S1[x[3]] - cv)*w[3];
2802                         if( x[4] >= 0 )
2803                             sum += (S1[x[4]] - cv)*w[4];
2804                         if( x[5] >= 0 )
2805                             sum += (S1[x[5]] - cv)*w[5];
2806                         if( x[6] >= 0 )
2807                             sum += (S1[x[6]] - cv)*w[6];
2808                         if( x[7] >= 0 )
2809                             sum += (S1[x[7]] - cv)*w[7];
2810                     }
2811                     D[k] = castOp(sum);
2812                 }
2813                 S0 -= cn;
2814             }
2815         }
2816     }
2817 }
2818
2819
2820 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
2821                             int borderType, const Scalar& _borderValue );
2822
2823 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
2824                           const Mat& _fxy, const void* _wtab,
2825                           int borderType, const Scalar& _borderValue);
2826
2827 class RemapInvoker :
2828     public ParallelLoopBody
2829 {
2830 public:
2831     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
2832                  const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
2833                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
2834         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
2835         interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
2836         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
2837     {
2838     }
2839
2840     virtual void operator() (const Range& range) const
2841     {
2842         int x, y, x1, y1;
2843         const int buf_size = 1 << 14;
2844         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
2845         int bcols0 = std::min(buf_size/brows0, dst->cols);
2846         brows0 = std::min(buf_size/bcols0, dst->rows);
2847     #if CV_SSE2
2848         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
2849     #endif
2850
2851         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
2852         if( !nnfunc )
2853             _bufa.create(brows0, bcols0, CV_16UC1);
2854
2855         for( y = range.start; y < range.end; y += brows0 )
2856         {
2857             for( x = 0; x < dst->cols; x += bcols0 )
2858             {
2859                 int brows = std::min(brows0, range.end - y);
2860                 int bcols = std::min(bcols0, dst->cols - x);
2861                 Mat dpart(*dst, Rect(x, y, bcols, brows));
2862                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
2863
2864                 if( nnfunc )
2865                 {
2866                     if( m1->type() == CV_16SC2 && !m2->data ) // the data is already in the right format
2867                         bufxy = (*m1)(Rect(x, y, bcols, brows));
2868                     else if( map_depth != CV_32F )
2869                     {
2870                         for( y1 = 0; y1 < brows; y1++ )
2871                         {
2872                             short* XY = (short*)(bufxy.data + bufxy.step*y1);
2873                             const short* sXY = (const short*)(m1->data + m1->step*(y+y1)) + x*2;
2874                             const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
2875
2876                             for( x1 = 0; x1 < bcols; x1++ )
2877                             {
2878                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
2879                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
2880                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
2881                             }
2882                         }
2883                     }
2884                     else if( !planar_input )
2885                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
2886                     else
2887                     {
2888                         for( y1 = 0; y1 < brows; y1++ )
2889                         {
2890                             short* XY = (short*)(bufxy.data + bufxy.step*y1);
2891                             const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
2892                             const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
2893                             x1 = 0;
2894
2895                         #if CV_SSE2
2896                             if( useSIMD )
2897                             {
2898                                 for( ; x1 <= bcols - 8; x1 += 8 )
2899                                 {
2900                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
2901                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
2902                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
2903                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
2904                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
2905                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
2906                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
2907                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
2908                                     ix0 = _mm_packs_epi32(ix0, ix1);
2909                                     iy0 = _mm_packs_epi32(iy0, iy1);
2910                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
2911                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
2912                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
2913                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
2914                                 }
2915                             }
2916                         #endif
2917
2918                             for( ; x1 < bcols; x1++ )
2919                             {
2920                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
2921                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
2922                             }
2923                         }
2924                     }
2925                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
2926                     continue;
2927                 }
2928
2929                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
2930                 for( y1 = 0; y1 < brows; y1++ )
2931                 {
2932                     short* XY = (short*)(bufxy.data + bufxy.step*y1);
2933                     ushort* A = (ushort*)(bufa.data + bufa.step*y1);
2934
2935                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
2936                     {
2937                         bufxy = (*m1)(Rect(x, y, bcols, brows));
2938
2939                         const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
2940                         for( x1 = 0; x1 < bcols; x1++ )
2941                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
2942                     }
2943                     else if( planar_input )
2944                     {
2945                         const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
2946                         const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
2947
2948                         x1 = 0;
2949                     #if CV_SSE2
2950                         if( useSIMD )
2951                         {
2952                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
2953                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
2954                             for( ; x1 <= bcols - 8; x1 += 8 )
2955                             {
2956                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
2957                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
2958                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
2959                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
2960                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
2961                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
2962                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
2963                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
2964                                 __m128i mx0 = _mm_and_si128(ix0, mask);
2965                                 __m128i mx1 = _mm_and_si128(ix1, mask);
2966                                 __m128i my0 = _mm_and_si128(iy0, mask);
2967                                 __m128i my1 = _mm_and_si128(iy1, mask);
2968                                 mx0 = _mm_packs_epi32(mx0, mx1);
2969                                 my0 = _mm_packs_epi32(my0, my1);
2970                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
2971                                 mx0 = _mm_or_si128(mx0, my0);
2972                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
2973                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
2974                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
2975                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
2976                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
2977                                 ix0 = _mm_packs_epi32(ix0, ix1);
2978                                 iy0 = _mm_packs_epi32(iy0, iy1);
2979                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
2980                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
2981                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
2982                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
2983                             }
2984                         }
2985                     #endif
2986
2987                         for( ; x1 < bcols; x1++ )
2988                         {
2989                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
2990                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
2991                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
2992                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
2993                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
2994                             A[x1] = (ushort)v;
2995                         }
2996                     }
2997                     else
2998                     {
2999                         const float* sXY = (const float*)(m1->data + m1->step*(y+y1)) + x*2;
3000
3001                         for( x1 = 0; x1 < bcols; x1++ )
3002                         {
3003                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
3004                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
3005                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
3006                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
3007                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
3008                             A[x1] = (ushort)v;
3009                         }
3010                     }
3011                 }
3012                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
3013             }
3014         }
3015     }
3016
3017 private:
3018     const Mat* src;
3019     Mat* dst;
3020     const Mat *m1, *m2;
3021     int interpolation, borderType;
3022     Scalar borderValue;
3023     int planar_input;
3024     RemapNNFunc nnfunc;
3025     RemapFunc ifunc;
3026     const void *ctab;
3027 };
3028
3029 }
3030
3031 void cv::remap( InputArray _src, OutputArray _dst,
3032                 InputArray _map1, InputArray _map2,
3033                 int interpolation, int borderType, const Scalar& borderValue )
3034 {
3035     static RemapNNFunc nn_tab[] =
3036     {
3037         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
3038         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
3039     };
3040
3041     static RemapFunc linear_tab[] =
3042     {
3043         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
3044         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
3045         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
3046         remapBilinear<Cast<float, float>, RemapNoVec, float>,
3047         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
3048     };
3049
3050     static RemapFunc cubic_tab[] =
3051     {
3052         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
3053         remapBicubic<Cast<float, ushort>, float, 1>,
3054         remapBicubic<Cast<float, short>, float, 1>, 0,
3055         remapBicubic<Cast<float, float>, float, 1>,
3056         remapBicubic<Cast<double, double>, float, 1>, 0
3057     };
3058
3059     static RemapFunc lanczos4_tab[] =
3060     {
3061         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
3062         remapLanczos4<Cast<float, ushort>, float, 1>,
3063         remapLanczos4<Cast<float, short>, float, 1>, 0,
3064         remapLanczos4<Cast<float, float>, float, 1>,
3065         remapLanczos4<Cast<double, double>, float, 1>, 0
3066     };
3067
3068     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
3069
3070     CV_Assert( map1.size().area() > 0 );
3071     CV_Assert( !map2.data || (map2.size() == map1.size()));
3072
3073     _dst.create( map1.size(), src.type() );
3074     Mat dst = _dst.getMat();
3075     if( dst.data == src.data )
3076         src = src.clone();
3077
3078     int depth = src.depth();
3079     RemapNNFunc nnfunc = 0;
3080     RemapFunc ifunc = 0;
3081     const void* ctab = 0;
3082     bool fixpt = depth == CV_8U;
3083     bool planar_input = false;
3084
3085     if( interpolation == INTER_NEAREST )
3086     {
3087         nnfunc = nn_tab[depth];
3088         CV_Assert( nnfunc != 0 );
3089     }
3090     else
3091     {
3092         if( interpolation == INTER_AREA )
3093             interpolation = INTER_LINEAR;
3094
3095         if( interpolation == INTER_LINEAR )
3096             ifunc = linear_tab[depth];
3097         else if( interpolation == INTER_CUBIC )
3098             ifunc = cubic_tab[depth];
3099         else if( interpolation == INTER_LANCZOS4 )
3100             ifunc = lanczos4_tab[depth];
3101         else
3102             CV_Error( CV_StsBadArg, "Unknown interpolation method" );
3103         CV_Assert( ifunc != 0 );
3104         ctab = initInterTab2D( interpolation, fixpt );
3105     }
3106
3107     const Mat *m1 = &map1, *m2 = &map2;
3108
3109     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || !map2.data)) ||
3110         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || !map1.data)) )
3111     {
3112         if( map1.type() != CV_16SC2 )
3113             std::swap(m1, m2);
3114     }
3115     else
3116     {
3117         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && !map2.data) ||
3118             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
3119         planar_input = map1.channels() == 1;
3120     }
3121
3122     RemapInvoker invoker(src, dst, m1, m2, interpolation,
3123                          borderType, borderValue, planar_input, nnfunc, ifunc,
3124                          ctab);
3125     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
3126 }
3127
3128
3129 void cv::convertMaps( InputArray _map1, InputArray _map2,
3130                       OutputArray _dstmap1, OutputArray _dstmap2,
3131                       int dstm1type, bool nninterpolate )
3132 {
3133     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
3134     Size size = map1.size();
3135     const Mat *m1 = &map1, *m2 = &map2;
3136     int m1type = m1->type(), m2type = m2->type();
3137
3138     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
3139                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
3140                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
3141                (m1type == CV_32FC2 && !m2->data) );
3142
3143     if( m2type == CV_16SC2 )
3144     {
3145         std::swap( m1, m2 );
3146         std::swap( m1type, m2type );
3147     }
3148
3149     if( dstm1type <= 0 )
3150         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
3151     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
3152     _dstmap1.create( size, dstm1type );
3153     dstmap1 = _dstmap1.getMat();
3154
3155     if( !nninterpolate && dstm1type != CV_32FC2 )
3156     {
3157         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
3158         dstmap2 = _dstmap2.getMat();
3159     }
3160     else
3161         _dstmap2.release();
3162
3163     if( m1type == dstm1type || (nninterpolate &&
3164         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
3165         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
3166     {
3167         m1->convertTo( dstmap1, dstmap1.type() );
3168         if( dstmap2.data && dstmap2.type() == m2->type() )
3169             m2->copyTo( dstmap2 );
3170         return;
3171     }
3172
3173     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
3174     {
3175         Mat vdata[] = { *m1, *m2 };
3176         merge( vdata, 2, dstmap1 );
3177         return;
3178     }
3179
3180     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
3181     {
3182         Mat mv[] = { dstmap1, dstmap2 };
3183         split( *m1, mv );
3184         return;
3185     }
3186
3187     if( m1->isContinuous() && (!m2->data || m2->isContinuous()) &&
3188         dstmap1.isContinuous() && (!dstmap2.data || dstmap2.isContinuous()) )
3189     {
3190         size.width *= size.height;
3191         size.height = 1;
3192     }
3193
3194     const float scale = 1.f/INTER_TAB_SIZE;
3195     int x, y;
3196     for( y = 0; y < size.height; y++ )
3197     {
3198         const float* src1f = (const float*)(m1->data + m1->step*y);
3199         const float* src2f = (const float*)(m2->data + m2->step*y);
3200         const short* src1 = (const short*)src1f;
3201         const ushort* src2 = (const ushort*)src2f;
3202
3203         float* dst1f = (float*)(dstmap1.data + dstmap1.step*y);
3204         float* dst2f = (float*)(dstmap2.data + dstmap2.step*y);
3205         short* dst1 = (short*)dst1f;
3206         ushort* dst2 = (ushort*)dst2f;
3207
3208         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
3209         {
3210             if( nninterpolate )
3211                 for( x = 0; x < size.width; x++ )
3212                 {
3213                     dst1[x*2] = saturate_cast<short>(src1f[x]);
3214                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
3215                 }
3216             else
3217                 for( x = 0; x < size.width; x++ )
3218                 {
3219                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
3220                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
3221                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
3222                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
3223                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
3224                 }
3225         }
3226         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
3227         {
3228             if( nninterpolate )
3229                 for( x = 0; x < size.width; x++ )
3230                 {
3231                     dst1[x*2] = saturate_cast<short>(src1f[x*2]);
3232                     dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
3233                 }
3234             else
3235                 for( x = 0; x < size.width; x++ )
3236                 {
3237                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
3238                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
3239                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
3240                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
3241                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
3242                 }
3243         }
3244         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
3245         {
3246             for( x = 0; x < size.width; x++ )
3247             {
3248                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
3249                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
3250                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
3251             }
3252         }
3253         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
3254         {
3255             for( x = 0; x < size.width; x++ )
3256             {
3257                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
3258                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
3259                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
3260             }
3261         }
3262         else
3263             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
3264     }
3265 }
3266
3267
3268 namespace cv
3269 {
3270
3271 class warpAffineInvoker :
3272     public ParallelLoopBody
3273 {
3274 public:
3275     warpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
3276                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
3277         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
3278         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
3279         M(_M)
3280     {
3281     }
3282
3283     virtual void operator() (const Range& range) const
3284     {
3285         const int BLOCK_SZ = 64;
3286         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
3287         const int AB_BITS = MAX(10, (int)INTER_BITS);
3288         const int AB_SCALE = 1 << AB_BITS;
3289         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
3290     #if CV_SSE2
3291         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
3292     #endif
3293
3294         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
3295         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
3296         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
3297
3298         for( y = range.start; y < range.end; y += bh0 )
3299         {
3300             for( x = 0; x < dst.cols; x += bw0 )
3301             {
3302                 int bw = std::min( bw0, dst.cols - x);
3303                 int bh = std::min( bh0, range.end - y);
3304
3305                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
3306                 Mat dpart(dst, Rect(x, y, bw, bh));
3307
3308                 for( y1 = 0; y1 < bh; y1++ )
3309                 {
3310                     short* xy = XY + y1*bw*2;
3311                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
3312                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
3313
3314                     if( interpolation == INTER_NEAREST )
3315                         for( x1 = 0; x1 < bw; x1++ )
3316                         {
3317                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
3318                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
3319                             xy[x1*2] = saturate_cast<short>(X);
3320                             xy[x1*2+1] = saturate_cast<short>(Y);
3321                         }
3322                     else
3323                     {
3324                         short* alpha = A + y1*bw;
3325                         x1 = 0;
3326                     #if CV_SSE2
3327                         if( useSIMD )
3328                         {
3329                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
3330                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
3331                             for( ; x1 <= bw - 8; x1 += 8 )
3332                             {
3333                                 __m128i tx0, tx1, ty0, ty1;
3334                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
3335                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
3336                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
3337                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
3338
3339                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
3340                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
3341                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
3342                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
3343
3344                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
3345                                                             _mm_and_si128(tx1, fxy_mask));
3346                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
3347                                                             _mm_and_si128(ty1, fxy_mask));
3348                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
3349                                                             _mm_srai_epi32(tx1, INTER_BITS));
3350                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
3351                                                     _mm_srai_epi32(ty1, INTER_BITS));
3352                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
3353
3354                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
3355                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
3356                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
3357                             }
3358                         }
3359                     #endif
3360                         for( ; x1 < bw; x1++ )
3361                         {
3362                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
3363                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
3364                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
3365                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
3366                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
3367                                     (X & (INTER_TAB_SIZE-1)));
3368                         }
3369                     }
3370                 }
3371
3372                 if( interpolation == INTER_NEAREST )
3373                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
3374                 else
3375                 {
3376                     Mat _matA(bh, bw, CV_16U, A);
3377                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
3378                 }
3379             }
3380         }
3381     }
3382
3383 private:
3384     Mat src;
3385     Mat dst;
3386     int interpolation, borderType;
3387     Scalar borderValue;
3388     int *adelta, *bdelta;
3389     double *M;
3390 };
3391
3392 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3393 class IPPwarpAffineInvoker :
3394     public ParallelLoopBody
3395 {
3396 public:
3397     IPPwarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int &_borderType, const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
3398       ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
3399       {
3400           *ok = true;
3401       }
3402
3403       virtual void operator() (const Range& range) const
3404       {
3405           IppiSize srcsize = { src.cols, src.rows };
3406           IppiRect srcroi = { 0, 0, src.cols, src.rows };
3407           IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
3408           int cnn = src.channels();
3409           if( borderType == BORDER_CONSTANT )
3410           {
3411               IppiSize setSize = { dst.cols, range.end - range.start };
3412               void *dataPointer = dst.data + dst.step[0] * range.start;
3413               if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
3414               {
3415                   *ok = false;
3416                   return;
3417               }
3418           }
3419           if( func( src.data, srcsize, (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, coeffs, mode ) < 0) ////Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
3420               *ok = false;
3421       }
3422 private:
3423     Mat &src;
3424     Mat &dst;
3425     double (&coeffs)[2][3];
3426     int mode;
3427     int borderType;
3428     Scalar borderValue;
3429     ippiWarpAffineBackFunc func;
3430     bool *ok;
3431     const IPPwarpAffineInvoker& operator= (const IPPwarpAffineInvoker&);
3432 };
3433 #endif
3434
3435 }
3436
3437
3438 void cv::warpAffine( InputArray _src, OutputArray _dst,
3439                      InputArray _M0, Size dsize,
3440                      int flags, int borderType, const Scalar& borderValue )
3441 {
3442     Mat src = _src.getMat(), M0 = _M0.getMat();
3443     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
3444     Mat dst = _dst.getMat();
3445     CV_Assert( src.cols > 0 && src.rows > 0 );
3446     if( dst.data == src.data )
3447         src = src.clone();
3448
3449     double M[6];
3450     Mat matM(2, 3, CV_64F, M);
3451     int interpolation = flags & INTER_MAX;
3452     if( interpolation == INTER_AREA )
3453         interpolation = INTER_LINEAR;
3454
3455     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
3456     M0.convertTo(matM, matM.type());
3457
3458 #ifdef HAVE_TEGRA_OPTIMIZATION
3459     if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
3460         return;
3461 #endif
3462
3463     if( !(flags & WARP_INVERSE_MAP) )
3464     {
3465         double D = M[0]*M[4] - M[1]*M[3];
3466         D = D != 0 ? 1./D : 0;
3467         double A11 = M[4]*D, A22=M[0]*D;
3468         M[0] = A11; M[1] *= -D;
3469         M[3] *= -D; M[4] = A22;
3470         double b1 = -M[0]*M[2] - M[1]*M[5];
3471         double b2 = -M[3]*M[2] - M[4]*M[5];
3472         M[2] = b1; M[5] = b2;
3473     }
3474
3475     int x;
3476     AutoBuffer<int> _abdelta(dst.cols*2);
3477     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
3478     const int AB_BITS = MAX(10, (int)INTER_BITS);
3479     const int AB_SCALE = 1 << AB_BITS;
3480 /*
3481 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3482     int depth = src.depth();
3483     int channels = src.channels();
3484     if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
3485         ( channels == 1 || channels == 3 || channels == 4 ) &&
3486         ( borderType == cv::BORDER_TRANSPARENT || ( borderType == cv::BORDER_CONSTANT ) ) )
3487     {
3488         int type = src.type();
3489         ippiWarpAffineBackFunc ippFunc =
3490             type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
3491             type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
3492             type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
3493             type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
3494             type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
3495             type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
3496             type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
3497             type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
3498             type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
3499             0;
3500         int mode =
3501             flags == INTER_LINEAR ? IPPI_INTER_LINEAR :
3502             flags == INTER_NEAREST ? IPPI_INTER_NN :
3503             flags == INTER_CUBIC ? IPPI_INTER_CUBIC :
3504             0;
3505         if( mode && ippFunc )
3506         {
3507             double coeffs[2][3];
3508             for( int i = 0; i < 2; i++ )
3509             {
3510                 for( int j = 0; j < 3; j++ )
3511                 {
3512                     coeffs[i][j] = matM.at<double>(i, j);
3513                 }
3514             }
3515             bool ok;
3516             Range range(0, dst.rows);
3517             IPPwarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
3518             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3519             if( ok )
3520                 return;
3521         }
3522     }
3523 #endif
3524 */
3525     for( x = 0; x < dst.cols; x++ )
3526     {
3527         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
3528         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
3529     }
3530
3531     Range range(0, dst.rows);
3532     warpAffineInvoker invoker(src, dst, interpolation, borderType,
3533                               borderValue, adelta, bdelta, M);
3534     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3535 }
3536
3537
3538 namespace cv
3539 {
3540
3541 class warpPerspectiveInvoker :
3542     public ParallelLoopBody
3543 {
3544 public:
3545
3546     warpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
3547                            int _borderType, const Scalar &_borderValue) :
3548         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
3549         borderType(_borderType), borderValue(_borderValue)
3550     {
3551     }
3552
3553     virtual void operator() (const Range& range) const
3554     {
3555         const int BLOCK_SZ = 32;
3556         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
3557         int x, y, x1, y1, width = dst.cols, height = dst.rows;
3558
3559         int bh0 = std::min(BLOCK_SZ/2, height);
3560         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
3561         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
3562
3563         for( y = range.start; y < range.end; y += bh0 )
3564         {
3565             for( x = 0; x < width; x += bw0 )
3566             {
3567                 int bw = std::min( bw0, width - x);
3568                 int bh = std::min( bh0, range.end - y); // height
3569
3570                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
3571                 Mat dpart(dst, Rect(x, y, bw, bh));
3572
3573                 for( y1 = 0; y1 < bh; y1++ )
3574                 {
3575                     short* xy = XY + y1*bw*2;
3576                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
3577                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
3578                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
3579
3580                     if( interpolation == INTER_NEAREST )
3581                         for( x1 = 0; x1 < bw; x1++ )
3582                         {
3583                             double W = W0 + M[6]*x1;
3584                             W = W ? 1./W : 0;
3585                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
3586                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
3587                             int X = saturate_cast<int>(fX);
3588                             int Y = saturate_cast<int>(fY);
3589
3590                             xy[x1*2] = saturate_cast<short>(X);
3591                             xy[x1*2+1] = saturate_cast<short>(Y);
3592                         }
3593                     else
3594                     {
3595                         short* alpha = A + y1*bw;
3596                         for( x1 = 0; x1 < bw; x1++ )
3597                         {
3598                             double W = W0 + M[6]*x1;
3599                             W = W ? INTER_TAB_SIZE/W : 0;
3600                             double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
3601                             double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
3602                             int X = saturate_cast<int>(fX);
3603                             int Y = saturate_cast<int>(fY);
3604
3605                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
3606                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
3607                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
3608                                                 (X & (INTER_TAB_SIZE-1)));
3609                         }
3610                     }
3611                 }
3612
3613                 if( interpolation == INTER_NEAREST )
3614                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
3615                 else
3616                 {
3617                     Mat _matA(bh, bw, CV_16U, A);
3618                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
3619                 }
3620             }
3621         }
3622     }
3623
3624 private:
3625     Mat src;
3626     Mat dst;
3627     double* M;
3628     int interpolation, borderType;
3629     Scalar borderValue;
3630 };
3631
3632 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3633 class IPPwarpPerspectiveInvoker :
3634     public ParallelLoopBody
3635 {
3636 public:
3637     IPPwarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation, int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveBackFunc _func, bool *_ok) :
3638       ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
3639       {
3640           *ok = true;
3641       }
3642
3643       virtual void operator() (const Range& range) const
3644       {
3645           IppiSize srcsize = {src.cols, src.rows};
3646           IppiRect srcroi = {0, 0, src.cols, src.rows};
3647           IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
3648           int cnn = src.channels();
3649
3650           if( borderType == BORDER_CONSTANT )
3651           {
3652               IppiSize setSize = {dst.cols, range.end - range.start};
3653               void *dataPointer = dst.data + dst.step[0] * range.start;
3654               if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
3655               {
3656                   *ok = false;
3657                   return;
3658               }
3659           }
3660           if( func(src.data, srcsize, (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, coeffs, mode) < 0)
3661               *ok = false;
3662       }
3663 private:
3664     Mat &src;
3665     Mat &dst;
3666     double (&coeffs)[3][3];
3667     int mode;
3668     int borderType;
3669     const Scalar borderValue;
3670     ippiWarpPerspectiveBackFunc func;
3671     bool *ok;
3672     const IPPwarpPerspectiveInvoker& operator= (const IPPwarpPerspectiveInvoker&);
3673 };
3674 #endif
3675
3676 }
3677
3678 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
3679                           Size dsize, int flags, int borderType, const Scalar& borderValue )
3680 {
3681     Mat src = _src.getMat(), M0 = _M0.getMat();
3682     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
3683     Mat dst = _dst.getMat();
3684
3685     CV_Assert( src.cols > 0 && src.rows > 0 );
3686     if( dst.data == src.data )
3687         src = src.clone();
3688
3689     double M[9];
3690     Mat matM(3, 3, CV_64F, M);
3691     int interpolation = flags & INTER_MAX;
3692     if( interpolation == INTER_AREA )
3693         interpolation = INTER_LINEAR;
3694
3695     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
3696     M0.convertTo(matM, matM.type());
3697
3698 #ifdef HAVE_TEGRA_OPTIMIZATION
3699     if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
3700         return;
3701 #endif
3702
3703     if( !(flags & WARP_INVERSE_MAP) )
3704          invert(matM, matM);
3705 /*
3706 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3707     int depth = src.depth();
3708     int channels = src.channels();
3709     if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
3710         ( channels == 1 || channels == 3 || channels == 4 ) &&
3711         ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) )
3712     {
3713         int type = src.type();
3714         ippiWarpPerspectiveBackFunc ippFunc =
3715             type == CV_8UC1 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_8u_C1R :
3716             type == CV_8UC3 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_8u_C3R :
3717             type == CV_8UC4 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_8u_C4R :
3718             type == CV_16UC1 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_16u_C1R :
3719             type == CV_16UC3 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_16u_C3R :
3720             type == CV_16UC4 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_16u_C4R :
3721             type == CV_32FC1 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_32f_C1R :
3722             type == CV_32FC3 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_32f_C3R :
3723             type == CV_32FC4 ? (ippiWarpPerspectiveBackFunc)ippiWarpPerspectiveBack_32f_C4R :
3724             0;
3725         int mode =
3726             flags == INTER_LINEAR ? IPPI_INTER_LINEAR :
3727             flags == INTER_NEAREST ? IPPI_INTER_NN :
3728             flags == INTER_CUBIC ? IPPI_INTER_CUBIC :
3729             0;
3730         if( mode && ippFunc )
3731         {
3732             double coeffs[3][3];
3733             for( int i = 0; i < 3; i++ )
3734             {
3735                 for( int j = 0; j < 3; j++ )
3736                 {
3737                     coeffs[i][j] = matM.at<double>(i, j);
3738                 }
3739             }
3740             bool ok;
3741             Range range(0, dst.rows);
3742             IPPwarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
3743             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3744             if( ok )
3745                 return;
3746         }
3747     }
3748 #endif
3749 */
3750     Range range(0, dst.rows);
3751     warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
3752     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3753 }
3754
3755
3756 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
3757 {
3758     angle *= CV_PI/180;
3759     double alpha = cos(angle)*scale;
3760     double beta = sin(angle)*scale;
3761
3762     Mat M(2, 3, CV_64F);
3763     double* m = (double*)M.data;
3764
3765     m[0] = alpha;
3766     m[1] = beta;
3767     m[2] = (1-alpha)*center.x - beta*center.y;
3768     m[3] = -beta;
3769     m[4] = alpha;
3770     m[5] = beta*center.x + (1-alpha)*center.y;
3771
3772     return M;
3773 }
3774
3775 /* Calculates coefficients of perspective transformation
3776  * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
3777  *
3778  *      c00*xi + c01*yi + c02
3779  * ui = ---------------------
3780  *      c20*xi + c21*yi + c22
3781  *
3782  *      c10*xi + c11*yi + c12
3783  * vi = ---------------------
3784  *      c20*xi + c21*yi + c22
3785  *
3786  * Coefficients are calculated by solving linear system:
3787  * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
3788  * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
3789  * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
3790  * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
3791  * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
3792  * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
3793  * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
3794  * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
3795  *
3796  * where:
3797  *   cij - matrix coefficients, c22 = 1
3798  */
3799 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
3800 {
3801     Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.data);
3802     double a[8][8], b[8];
3803     Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
3804
3805     for( int i = 0; i < 4; ++i )
3806     {
3807         a[i][0] = a[i+4][3] = src[i].x;
3808         a[i][1] = a[i+4][4] = src[i].y;
3809         a[i][2] = a[i+4][5] = 1;
3810         a[i][3] = a[i][4] = a[i][5] =
3811         a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
3812         a[i][6] = -src[i].x*dst[i].x;
3813         a[i][7] = -src[i].y*dst[i].x;
3814         a[i+4][6] = -src[i].x*dst[i].y;
3815         a[i+4][7] = -src[i].y*dst[i].y;
3816         b[i] = dst[i].x;
3817         b[i+4] = dst[i].y;
3818     }
3819
3820     solve( A, B, X, DECOMP_SVD );
3821     ((double*)M.data)[8] = 1.;
3822
3823     return M;
3824 }
3825
3826 /* Calculates coefficients of affine transformation
3827  * which maps (xi,yi) to (ui,vi), (i=1,2,3):
3828  *
3829  * ui = c00*xi + c01*yi + c02
3830  *
3831  * vi = c10*xi + c11*yi + c12
3832  *
3833  * Coefficients are calculated by solving linear system:
3834  * / x0 y0  1  0  0  0 \ /c00\ /u0\
3835  * | x1 y1  1  0  0  0 | |c01| |u1|
3836  * | x2 y2  1  0  0  0 | |c02| |u2|
3837  * |  0  0  0 x0 y0  1 | |c10| |v0|
3838  * |  0  0  0 x1 y1  1 | |c11| |v1|
3839  * \  0  0  0 x2 y2  1 / |c12| |v2|
3840  *
3841  * where:
3842  *   cij - matrix coefficients
3843  */
3844
3845 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
3846 {
3847     Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.data);
3848     double a[6*6], b[6];
3849     Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
3850
3851     for( int i = 0; i < 3; i++ )
3852     {
3853         int j = i*12;
3854         int k = i*12+6;
3855         a[j] = a[k+3] = src[i].x;
3856         a[j+1] = a[k+4] = src[i].y;
3857         a[j+2] = a[k+5] = 1;
3858         a[j+3] = a[j+4] = a[j+5] = 0;
3859         a[k] = a[k+1] = a[k+2] = 0;
3860         b[i*2] = dst[i].x;
3861         b[i*2+1] = dst[i].y;
3862     }
3863
3864     solve( A, B, X );
3865     return M;
3866 }
3867
3868 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
3869 {
3870     Mat matM = _matM.getMat();
3871     CV_Assert(matM.rows == 2 && matM.cols == 3);
3872     __iM.create(2, 3, matM.type());
3873     Mat _iM = __iM.getMat();
3874
3875     if( matM.type() == CV_32F )
3876     {
3877         const float* M = (const float*)matM.data;
3878         float* iM = (float*)_iM.data;
3879         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
3880
3881         double D = M[0]*M[step+1] - M[1]*M[step];
3882         D = D != 0 ? 1./D : 0;
3883         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
3884         double b1 = -A11*M[2] - A12*M[step+2];
3885         double b2 = -A21*M[2] - A22*M[step+2];
3886
3887         iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
3888         iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
3889     }
3890     else if( matM.type() == CV_64F )
3891     {
3892         const double* M = (const double*)matM.data;
3893         double* iM = (double*)_iM.data;
3894         int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
3895
3896         double D = M[0]*M[step+1] - M[1]*M[step];
3897         D = D != 0 ? 1./D : 0;
3898         double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
3899         double b1 = -A11*M[2] - A12*M[step+2];
3900         double b2 = -A21*M[2] - A22*M[step+2];
3901
3902         iM[0] = A11; iM[1] = A12; iM[2] = b1;
3903         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
3904     }
3905     else
3906         CV_Error( CV_StsUnsupportedFormat, "" );
3907 }
3908
3909 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
3910 {
3911     Mat src = _src.getMat(), dst = _dst.getMat();
3912     CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
3913     return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
3914 }
3915
3916 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
3917 {
3918     Mat src = _src.getMat(), dst = _dst.getMat();
3919     CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
3920     return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
3921 }
3922
3923 CV_IMPL void
3924 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
3925 {
3926     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3927     CV_Assert( src.type() == dst.type() );
3928     cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
3929         (double)dst.rows/src.rows, method );
3930 }
3931
3932
3933 CV_IMPL void
3934 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
3935               int flags, CvScalar fillval )
3936 {
3937     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3938     cv::Mat matrix = cv::cvarrToMat(marr);
3939     CV_Assert( src.type() == dst.type() );
3940     cv::warpAffine( src, dst, matrix, dst.size(), flags,
3941         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3942         fillval );
3943 }
3944
3945 CV_IMPL void
3946 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
3947                    int flags, CvScalar fillval )
3948 {
3949     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3950     cv::Mat matrix = cv::cvarrToMat(marr);
3951     CV_Assert( src.type() == dst.type() );
3952     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
3953         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3954         fillval );
3955 }
3956
3957 CV_IMPL void
3958 cvRemap( const CvArr* srcarr, CvArr* dstarr,
3959          const CvArr* _mapx, const CvArr* _mapy,
3960          int flags, CvScalar fillval )
3961 {
3962     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
3963     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
3964     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
3965     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
3966         (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
3967         fillval );
3968     CV_Assert( dst0.data == dst.data );
3969 }
3970
3971
3972 CV_IMPL CvMat*
3973 cv2DRotationMatrix( CvPoint2D32f center, double angle,
3974                     double scale, CvMat* matrix )
3975 {
3976     cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
3977     CV_Assert( M.size() == M0.size() );
3978     M.convertTo(M0, M0.type());
3979     return matrix;
3980 }
3981
3982
3983 CV_IMPL CvMat*
3984 cvGetPerspectiveTransform( const CvPoint2D32f* src,
3985                           const CvPoint2D32f* dst,
3986                           CvMat* matrix )
3987 {
3988     cv::Mat M0 = cv::cvarrToMat(matrix),
3989         M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
3990     CV_Assert( M.size() == M0.size() );
3991     M.convertTo(M0, M0.type());
3992     return matrix;
3993 }
3994
3995
3996 CV_IMPL CvMat*
3997 cvGetAffineTransform( const CvPoint2D32f* src,
3998                           const CvPoint2D32f* dst,
3999                           CvMat* matrix )
4000 {
4001     cv::Mat M0 = cv::cvarrToMat(matrix),
4002         M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
4003     CV_Assert( M.size() == M0.size() );
4004     M.convertTo(M0, M0.type());
4005     return matrix;
4006 }
4007
4008
4009 CV_IMPL void
4010 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
4011 {
4012     cv::Mat map1 = cv::cvarrToMat(arr1), map2;
4013     cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
4014
4015     if( arr2 )
4016         map2 = cv::cvarrToMat(arr2);
4017     if( dstarr2 )
4018     {
4019         dstmap2 = cv::cvarrToMat(dstarr2);
4020         if( dstmap2.type() == CV_16SC1 )
4021             dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.data, dstmap2.step);
4022     }
4023
4024     cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
4025 }
4026
4027 /****************************************************************************************\
4028 *                                   Log-Polar Transform                                  *
4029 \****************************************************************************************/
4030
4031 /* now it is done via Remap; more correct implementation should use
4032    some super-sampling technique outside of the "fovea" circle */
4033 CV_IMPL void
4034 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
4035             CvPoint2D32f center, double M, int flags )
4036 {
4037     cv::Ptr<CvMat> mapx, mapy;
4038
4039     CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
4040     CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
4041     CvSize ssize, dsize;
4042
4043     if( !CV_ARE_TYPES_EQ( src, dst ))
4044         CV_Error( CV_StsUnmatchedFormats, "" );
4045
4046     if( M <= 0 )
4047         CV_Error( CV_StsOutOfRange, "M should be >0" );
4048
4049     ssize = cvGetMatSize(src);
4050     dsize = cvGetMatSize(dst);
4051
4052     mapx = cvCreateMat( dsize.height, dsize.width, CV_32F );
4053     mapy = cvCreateMat( dsize.height, dsize.width, CV_32F );
4054
4055     if( !(flags & CV_WARP_INVERSE_MAP) )
4056     {
4057         int phi, rho;
4058         cv::AutoBuffer<double> _exp_tab(dsize.width);
4059         double* exp_tab = _exp_tab;
4060
4061         for( rho = 0; rho < dst->width; rho++ )
4062             exp_tab[rho] = std::exp(rho/M);
4063
4064         for( phi = 0; phi < dsize.height; phi++ )
4065         {
4066             double cp = cos(phi*2*CV_PI/dsize.height);
4067             double sp = sin(phi*2*CV_PI/dsize.height);
4068             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
4069             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
4070
4071             for( rho = 0; rho < dsize.width; rho++ )
4072             {
4073                 double r = exp_tab[rho];
4074                 double x = r*cp + center.x;
4075                 double y = r*sp + center.y;
4076
4077                 mx[rho] = (float)x;
4078                 my[rho] = (float)y;
4079             }
4080         }
4081     }
4082     else
4083     {
4084         int x, y;
4085         CvMat bufx, bufy, bufp, bufa;
4086         double ascale = ssize.height/(2*CV_PI);
4087         cv::AutoBuffer<float> _buf(4*dsize.width);
4088         float* buf = _buf;
4089
4090         bufx = cvMat( 1, dsize.width, CV_32F, buf );
4091         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
4092         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
4093         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
4094
4095         for( x = 0; x < dsize.width; x++ )
4096             bufx.data.fl[x] = (float)x - center.x;
4097
4098         for( y = 0; y < dsize.height; y++ )
4099         {
4100             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
4101             float* my = (float*)(mapy->data.ptr + y*mapy->step);
4102
4103             for( x = 0; x < dsize.width; x++ )
4104                 bufy.data.fl[x] = (float)y - center.y;
4105
4106 #if 1
4107             cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
4108
4109             for( x = 0; x < dsize.width; x++ )
4110                 bufp.data.fl[x] += 1.f;
4111
4112             cvLog( &bufp, &bufp );
4113
4114             for( x = 0; x < dsize.width; x++ )
4115             {
4116                 double rho = bufp.data.fl[x]*M;
4117                 double phi = bufa.data.fl[x]*ascale;
4118
4119                 mx[x] = (float)rho;
4120                 my[x] = (float)phi;
4121             }
4122 #else
4123             for( x = 0; x < dsize.width; x++ )
4124             {
4125                 double xx = bufx.data.fl[x];
4126                 double yy = bufy.data.fl[x];
4127
4128                 double p = log(sqrt(xx*xx + yy*yy) + 1.)*M;
4129                 double a = atan2(yy,xx);
4130                 if( a < 0 )
4131                     a = 2*CV_PI + a;
4132                 a *= ascale;
4133
4134                 mx[x] = (float)p;
4135                 my[x] = (float)a;
4136             }
4137 #endif
4138         }
4139     }
4140
4141     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
4142 }
4143
4144
4145 /****************************************************************************************
4146                                    Linear-Polar Transform
4147   J.L. Blanco, Apr 2009
4148  ****************************************************************************************/
4149 CV_IMPL
4150 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
4151             CvPoint2D32f center, double maxRadius, int flags )
4152 {
4153     cv::Ptr<CvMat> mapx, mapy;
4154
4155     CvMat srcstub, *src = (CvMat*)srcarr;
4156     CvMat dststub, *dst = (CvMat*)dstarr;
4157     CvSize ssize, dsize;
4158
4159     src = cvGetMat( srcarr, &srcstub,0,0 );
4160     dst = cvGetMat( dstarr, &dststub,0,0 );
4161
4162     if( !CV_ARE_TYPES_EQ( src, dst ))
4163         CV_Error( CV_StsUnmatchedFormats, "" );
4164
4165     ssize.width = src->cols;
4166     ssize.height = src->rows;
4167     dsize.width = dst->cols;
4168     dsize.height = dst->rows;
4169
4170     mapx = cvCreateMat( dsize.height, dsize.width, CV_32F );
4171     mapy = cvCreateMat( dsize.height, dsize.width, CV_32F );
4172
4173     if( !(flags & CV_WARP_INVERSE_MAP) )
4174     {
4175         int phi, rho;
4176
4177         for( phi = 0; phi < dsize.height; phi++ )
4178         {
4179             double cp = cos(phi*2*CV_PI/dsize.height);
4180             double sp = sin(phi*2*CV_PI/dsize.height);
4181             float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
4182             float* my = (float*)(mapy->data.ptr + phi*mapy->step);
4183
4184             for( rho = 0; rho < dsize.width; rho++ )
4185             {
4186                 double r = maxRadius*(rho+1)/dsize.width;
4187                 double x = r*cp + center.x;
4188                 double y = r*sp + center.y;
4189
4190                 mx[rho] = (float)x;
4191                 my[rho] = (float)y;
4192             }
4193         }
4194     }
4195     else
4196     {
4197         int x, y;
4198         CvMat bufx, bufy, bufp, bufa;
4199         const double ascale = ssize.height/(2*CV_PI);
4200         const double pscale = ssize.width/maxRadius;
4201
4202         cv::AutoBuffer<float> _buf(4*dsize.width);
4203         float* buf = _buf;
4204
4205         bufx = cvMat( 1, dsize.width, CV_32F, buf );
4206         bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
4207         bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
4208         bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
4209
4210         for( x = 0; x < dsize.width; x++ )
4211             bufx.data.fl[x] = (float)x - center.x;
4212
4213         for( y = 0; y < dsize.height; y++ )
4214         {
4215             float* mx = (float*)(mapx->data.ptr + y*mapx->step);
4216             float* my = (float*)(mapy->data.ptr + y*mapy->step);
4217
4218             for( x = 0; x < dsize.width; x++ )
4219                 bufy.data.fl[x] = (float)y - center.y;
4220
4221             cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
4222
4223             for( x = 0; x < dsize.width; x++ )
4224                 bufp.data.fl[x] += 1.f;
4225
4226             for( x = 0; x < dsize.width; x++ )
4227             {
4228                 double rho = bufp.data.fl[x]*pscale;
4229                 double phi = bufa.data.fl[x]*ascale;
4230                 mx[x] = (float)rho;
4231                 my[x] = (float)phi;
4232             }
4233         }
4234     }
4235
4236     cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
4237 }
4238
4239
4240 /* End of file. */