1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 /* ////////////////////////////////////////////////////////////////////
45 // Geometrical transforms on images and matrices: rotation, zoom etc.
49 #include "precomp.hpp"
50 #include "opencl_kernels_imgproc.hpp"
// IPP support, compiled only when HAVE_IPP is defined and IPP_VERSION_MAJOR >= 7.
// ippInit() is invoked once during static initialisation; its status is kept in `sts`.
// NOTE(review): the #endif lines closing these conditionals are not visible in this view.
52 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
53 static IppStatus sts = ippInit();
// Function-pointer types (IPP_VERSION_X100 >= 701 only), named after the IPP resize,
// get-buffer-size and get-source-offset entry points they are meant to hold.
58 #if IPP_VERSION_X100 >= 701
59 typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
60 typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
61 typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
// Disabled region: the trailing `&& 0` makes this condition always false, so the
// typedefs and helpers that follow are never compiled.
// NOTE(review): the closing #endif is not visible in this view.
64 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
// Function-pointer types for the IPP constant-fill routine and the perspective /
// affine warp routines (the latter two take a 3x3 and a 2x3 double matrix).
65 typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
66 typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
67 typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
69 template <int channels, typename Type>
70 bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
72 Type values[channels];
73 for( int i = 0; i < channels; i++ )
74 values[i] = saturate_cast<Type>(value[i]);
75 return func(values, dataPointer, step, size) >= 0;
// Fills a destination with a constant through the IPP ippiSet_* family and returns
// true when the IPP call reports a non-negative status.
// Visible branches: direct C1R calls using value[0] for 8u / 16u / 32f, and
// IPPSetSimple<3, ...> / IPPSetSimple<4, ...> for the C3R and C4R variants.
// NOTE(review): most of the conditions that select on `channels` and `depth`
// (and the fall-through return) are not visible in this view.
78 static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
// Single-value variants: only value[0] is used.
85 return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
87 return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
89 return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
// Three-value variants, routed through IPPSetSimple.
99 return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
101 return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
103 return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
// Four-value variants.
106 else if( channels == 4 )
111 return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
113 return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
115 return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
123 /************** interpolation formulas and tables ***************/
// Fixed-point precision constants: each *_SCALE is 1 << *_BITS
// (2048 for the resize coefficients, 32768 for the remap coefficients).
125 const int INTER_RESIZE_COEF_BITS=11;
126 const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
128 const int INTER_REMAP_COEF_BITS=15;
129 const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
// Two flags per table entry, written by initInterTab2D (j < INTER_TAB_SIZE/2, i < INTER_TAB_SIZE/2).
131 static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
// Lookup tables with INTER_TAB_SIZE2 entries each, filled by initInterTab2D:
// *_f holds float weights, *_i the same weights as short fixed-point values.
// Kernel sizes are 2x2 (bilinear), 4x4 (bicubic) and 8x8 (Lanczos4).
133 static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
134 static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
// SSE2 / NEON builds only: a copy of the bilinear fixed-point weights with each
// weight pair repeated four times per row. The buffer has two spare entries so
// that alignPtr can return a 16-byte-aligned view of it.
// NOTE(review): the matching #endif is not visible in this view.
136 #if CV_SSE2 || CV_NEON
137 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
138 static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
141 static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
142 static short BicubicTab_i[INTER_TAB_SIZE2][4][4];
144 static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
145 static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
// Linear interpolation weights for fractional offset x, written to coeffs.
// NOTE(review): the body is not visible in this view; initInterTab1D advances its
// table by 2 per call, so this presumably writes two weights - confirm.
147 static inline void interpolateLinear( float x, float* coeffs )
// Computes the four cubic interpolation weights for fractional offset x and
// stores them in coeffs[0..3].
//   x      - fractional position between the two middle samples
//   coeffs - output array with room for 4 floats
// The kernel parameter A is fixed at -0.75. coeffs[0], coeffs[1] and coeffs[2]
// are evaluated at distances x + 1, x and 1 - x; coeffs[3] is derived from the
// other three so that the four weights always sum to 1.
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
// Computes 8 interpolation weights for fractional offset x into coeffs[0..7].
// NOTE(review): several short statements (the bodies of the loops below, the early
// return and the normalisation sum) are not visible in this view.
163 static inline void interpolateLanczos4( float x, float* coeffs )
// s45 is sqrt(1/2); cs[i] is a per-tap pair of factors applied to s0 = sin(y0)
// and c0 = cos(y0) below, so sin/cos are evaluated only once per call.
165 static const double s45 = 0.70710678118654752440084436210485;
166 static const double cs[][2]=
167 {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
// Special case for x at (or below) zero; it also avoids the division by y*y == 0
// that tap i == 3 would hit in the general formula.
169 if( x < FLT_EPSILON )
171 for( int i = 0; i < 8; i++ )
// General case: tap i is evaluated at y = -(x + 3 - i) * pi / 4.
178 double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
179 for(int i = 0; i < 8; i++ )
181 double y = -(x+3-i)*CV_PI*0.25;
182 coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
// Final pass over all 8 weights (presumably the normalisation - body not visible).
187 for(int i = 0; i < 8; i++ )
191 static void initInterTab1D(int method, float* tab, int tabsz)
193 float scale = 1.f/tabsz;
194 if( method == INTER_LINEAR )
196 for( int i = 0; i < tabsz; i++, tab += 2 )
197 interpolateLinear( i*scale, tab );
199 else if( method == INTER_CUBIC )
201 for( int i = 0; i < tabsz; i++, tab += 4 )
202 interpolateCubic( i*scale, tab );
204 else if( method == INTER_LANCZOS4 )
206 for( int i = 0; i < tabsz; i++, tab += 8 )
207 interpolateLanczos4( i*scale, tab );
210 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
// Returns the 2-D weight table for `method`: the short fixed-point table when
// fixpt is true, otherwise the float table. The first call for a method (tracked
// in inittab) builds both tables from the 1-D table produced by initInterTab1D.
// Methods other than INTER_LINEAR / INTER_CUBIC / INTER_LANCZOS4 raise CV_StsBadArg.
// NOTE(review): the local declarations (tab, itab, ksize, i, j, k1, k2, isum) and a
// few conditions are not visible in this view.
214 static const void* initInterTab2D( int method, bool fixpt )
216 static bool inittab[INTER_MAX+1] = {false};
// Select the float / fixed-point table pair and the kernel width for the method.
220 if( method == INTER_LINEAR )
221 tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
222 else if( method == INTER_CUBIC )
223 tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
224 else if( method == INTER_LANCZOS4 )
225 tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
227 CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
229 if( !inittab[method] )
// Scratch 1-D table: up to 8 weights for each of the INTER_TAB_SIZE offsets.
231 AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
233 initInterTab1D(method, _tab, INTER_TAB_SIZE);
// One ksize x ksize kernel per (i, j) pair of vertical / horizontal offsets.
234 for( i = 0; i < INTER_TAB_SIZE; i++ )
235 for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
238 NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
239 NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
// Each 2-D weight is the product of the 1-D weight for row k1 and column k2.
241 for( k1 = 0; k1 < ksize; k1++ )
243 float vy = _tab[i*ksize + k1];
244 for( k2 = 0; k2 < ksize; k2++ )
246 float v = vy*_tab[j*ksize + k2];
247 tab[k1*ksize + k2] = v;
// Fixed-point copy scaled by INTER_REMAP_COEF_SCALE; isum accumulates the kernel total.
248 isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
// If rounding left the fixed-point kernel not summing to INTER_REMAP_COEF_SCALE,
// the difference is subtracted from one weight: the smallest or largest one found
// in the 2x2 window that starts at (ksize/2, ksize/2).
// NOTE(review): for ksize == 2 this window covers k1, k2 in {1, 2}, which indexes
// past the 2x2 kernel - confirm the branch is never taken for INTER_LINEAR.
252 if( isum != INTER_REMAP_COEF_SCALE )
254 int diff = isum - INTER_REMAP_COEF_SCALE;
255 int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
256 for( k1 = ksize2; k1 < ksize2+2; k1++ )
257 for( k2 = ksize2; k2 < ksize2+2; k2++ )
259 if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
261 else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
265 itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
267 itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
// Rewind both pointers to the start of their tables.
270 tab -= INTER_TAB_SIZE2*ksize*ksize;
271 itab -= INTER_TAB_SIZE2*ksize*ksize;
// SSE2 / NEON builds: expand the bilinear fixed-point table into BilinearTab_iC4,
// repeating each row's weight pair four times.
272 #if CV_SSE2 || CV_NEON
273 if( method == INTER_LINEAR )
275 for( i = 0; i < INTER_TAB_SIZE2; i++ )
276 for( j = 0; j < 4; j++ )
278 BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
279 BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
280 BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
281 BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
285 inittab[method] = true;
287 return fixpt ? (const void*)itab : (const void*)tab;
291 static bool initAllInterTab2D()
293 return initInterTab2D( INTER_LINEAR, false ) &&
294 initInterTab2D( INTER_LINEAR, true ) &&
295 initInterTab2D( INTER_CUBIC, false ) &&
296 initInterTab2D( INTER_CUBIC, true ) &&
297 initInterTab2D( INTER_LANCZOS4, false ) &&
298 initInterTab2D( INTER_LANCZOS4, true );
// Runs initAllInterTab2D() during static initialisation of this translation unit
// and keeps its result.
301 static volatile bool doInitAllInterTab2D = initAllInterTab2D();
// Conversion functor: maps a value of type ST to DT through saturate_cast.
// NOTE(review): the member typedefs and braces of this struct are not visible in this view.
304 template<typename ST, typename DT> struct Cast
309 DT operator()(ST val) const { return saturate_cast<DT>(val); }
// Fixed-point conversion functor: drops `bits` fractional bits from val, then
// applies saturate_cast<DT>.
// NOTE(review): the member typedefs and braces of this struct are not visible in this view.
312 template<typename ST, typename DT, int bits> struct FixedPtCast
// DELTA is half of one output unit (1 << (bits-1)); adding it before the shift
// rounds to the nearest value instead of truncating.
316 enum { SHIFT = bits, DELTA = 1 << (bits-1) };
318 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
321 /****************************************************************************************\
323 \****************************************************************************************/
// Parallel loop body for nearest-neighbour resize: fills destination rows
// [range.start, range.end) by copying whole pixels from the source.
// x_ofs[x] is the byte offset, within a source row, of the pixel copied to
// destination column x; ify converts a destination row index to a source row.
// NOTE(review): the switch / case labels on the pixel size, several store
// statements and the remaining member declarations are not visible in this view.
325 class resizeNNInvoker :
326 public ParallelLoopBody
329 resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
330 ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
335 virtual void operator() (const Range& range) const
337 Size ssize = src.size(), dsize = dst.size();
338 int y, x, pix_size = (int)src.elemSize();
340 for( y = range.start; y < range.end; y++ )
342 uchar* D = dst.data + dst.step*y;
// Source row for destination row y, clamped to the last source row.
343 int sy = std::min(cvFloor(y*ify), ssize.height-1);
344 const uchar* S = src.ptr(sy);
// One-byte reads, two destination pixels per iteration, then the leftover column.
349 for( x = 0; x <= dsize.width - 2; x += 2 )
351 uchar t0 = S[x_ofs[x]];
352 uchar t1 = S[x_ofs[x+1]];
357 for( ; x < dsize.width; x++ )
// 2-byte pixels copied as one ushort.
361 for( x = 0; x < dsize.width; x++ )
362 *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
// 3-byte pixels copied byte by byte.
365 for( x = 0; x < dsize.width; x++, D += 3 )
367 const uchar* _tS = S + x_ofs[x];
368 D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
// 4-byte pixels copied as one int.
372 for( x = 0; x < dsize.width; x++ )
373 *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
// 6-byte pixels copied as three ushorts.
376 for( x = 0; x < dsize.width; x++, D += 6 )
378 const ushort* _tS = (const ushort*)(S + x_ofs[x]);
379 ushort* _tD = (ushort*)D;
380 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
// 8-byte pixels copied as two ints.
384 for( x = 0; x < dsize.width; x++, D += 8 )
386 const int* _tS = (const int*)(S + x_ofs[x]);
388 _tD[0] = _tS[0]; _tD[1] = _tS[1];
// 12-byte pixels copied as three ints.
392 for( x = 0; x < dsize.width; x++, D += 12 )
394 const int* _tS = (const int*)(S + x_ofs[x]);
396 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
// Any other pixel size: pix_size4 ints are copied per pixel in the k loop.
400 for( x = 0; x < dsize.width; x++, D += pix_size )
402 const int* _tS = (const int*)(S + x_ofs[x]);
404 for( int k = 0; k < pix_size4; k++ )
414 int* x_ofs, pix_size4;
// Copy constructor and assignment are declared without a visible definition.
417 resizeNNInvoker(const resizeNNInvoker&);
418 resizeNNInvoker& operator=(const resizeNNInvoker&);
// Nearest-neighbour resize of src into the already-sized dst.
//   fx, fy - scale factors; their inverses map destination coordinates to
//            source coordinates
// NOTE(review): the return-type line and the declarations of x and x_ofs are not
// visible in this view.
422 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
424 Size ssize = src.size(), dsize = dst.size();
425 AutoBuffer<int> _x_ofs(dsize.width);
427 int pix_size = (int)src.elemSize();
// Number of whole ints in one pixel.
428 int pix_size4 = (int)(pix_size / sizeof(int));
429 double ifx = 1./fx, ify = 1./fy;
// Precompute, per destination column, the byte offset of the source pixel to copy
// (source column clamped to the last column).
432 for( x = 0; x < dsize.width; x++ )
434 int sx = cvFloor(x*ifx);
435 x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
// Rows are processed through parallel_for_; the last argument is dst.total() / 65536.
438 Range range(0, dsize.height);
439 resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
440 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// Call operators of the no-op fallback functors: they ignore every argument and
// return 0. HResizeLinear uses the value returned by its VecOp as the first dx
// still to be handled by its scalar loop, so 0 leaves all of the work to that loop.
// NOTE(review): the enclosing struct headers (the VResizeNoVec / HResizeNoVec names
// used by the typedefs in this file) are not visible in this view.
446 int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
451 int operator()(const uchar**, uchar**, int, const int*,
452 const uchar*, int, int, int, int, int) const { return 0; }
// SSE2 functor: blends two rows of int values (src[0], src[1]) with the two short
// weights in _beta and writes `uchar` results to dst.
// Per element: the int is shifted right by 4 and packed to 16 bits with signed
// saturation, multiplied by its weight keeping the high 16 bits of the product,
// the two products are added with saturation, then (+2) >> 2 rounds the result
// before the final pack to unsigned 8 bits.
// NOTE(review): the declaration of x, the early return taken when SSE2 is not
// available, the `else` between the aligned and unaligned loops and the final
// return are not visible in this view.
457 struct VResizeLinearVec_32s8u
459 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
461 if( !checkHardwareSupport(CV_CPU_SSE2) )
464 const int** src = (const int**)_src;
465 const short* beta = (const short*)_beta;
466 const int *S0 = src[0], *S1 = src[1];
468 __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
469 __m128i delta = _mm_set1_epi16(2);
// Aligned-load path: taken only when both row pointers are 16-byte aligned.
// 16 outputs per iteration.
471 if( (((size_t)S0|(size_t)S1)&15) == 0 )
472 for( ; x <= width - 16; x += 16 )
474 __m128i x0, x1, x2, y0, y1, y2;
475 x0 = _mm_load_si128((const __m128i*)(S0 + x));
476 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
477 y0 = _mm_load_si128((const __m128i*)(S1 + x));
478 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
479 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
480 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
482 x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
483 x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
484 y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
485 y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
486 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
487 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
489 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
490 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
492 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
493 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
494 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
// Same computation with unaligned loads.
497 for( ; x <= width - 16; x += 16 )
499 __m128i x0, x1, x2, y0, y1, y2;
500 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
501 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
502 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
503 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
504 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
505 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
507 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
508 x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
509 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
510 y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
511 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
512 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
514 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
515 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
517 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
518 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
519 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
// Tail: 4 outputs per iteration, written as a single 32-bit store. The strict
// `<` leaves at least 4 columns unprocessed by this loop.
522 for( ; x < width - 4; x += 4 )
525 x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
526 y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
527 x0 = _mm_packs_epi32(x0, x0);
528 y0 = _mm_packs_epi32(y0, y0);
529 x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
530 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
531 x0 = _mm_packus_epi16(x0, x0);
532 *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
// SSE2 functor: blends two float rows (src[0], src[1]) with the float weights
// beta[0], beta[1] and writes 16-bit results to _dst.
// `shiftval` is added to the 32-bit integers before the signed saturating pack
// (preshift) and again, as a 16-bit value, after it (postshift). The typedefs that
// follow this struct instantiate it with SHRT_MIN for the 16u variant and 0 for 16s.
// NOTE(review): the declarations of x and of the __m128i temporaries t0 / t1 / t2,
// the early return, the `else` between the two 16-wide loops and the final return
// are not visible in this view.
540 template<int shiftval> struct VResizeLinearVec_32f16
542 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
544 if( !checkHardwareSupport(CV_CPU_SSE2) )
547 const float** src = (const float**)_src;
548 const float* beta = (const float*)_beta;
549 const float *S0 = src[0], *S1 = src[1];
550 ushort* dst = (ushort*)_dst;
553 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
554 __m128i preshift = _mm_set1_epi32(shiftval);
555 __m128i postshift = _mm_set1_epi16((short)shiftval);
// Aligned-load path (both rows 16-byte aligned), 16 outputs per iteration.
557 if( (((size_t)S0|(size_t)S1)&15) == 0 )
558 for( ; x <= width - 16; x += 16 )
560 __m128 x0, x1, y0, y1;
562 x0 = _mm_load_ps(S0 + x);
563 x1 = _mm_load_ps(S0 + x + 4);
564 y0 = _mm_load_ps(S1 + x);
565 y1 = _mm_load_ps(S1 + x + 4);
567 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
568 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
569 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
570 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
571 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
573 x0 = _mm_load_ps(S0 + x + 8);
574 x1 = _mm_load_ps(S0 + x + 12);
575 y0 = _mm_load_ps(S1 + x + 8);
576 y1 = _mm_load_ps(S1 + x + 12);
578 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
579 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
580 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
581 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
582 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
584 _mm_storeu_si128( (__m128i*)(dst + x), t0);
585 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
// Same computation with unaligned loads.
588 for( ; x <= width - 16; x += 16 )
590 __m128 x0, x1, y0, y1;
592 x0 = _mm_loadu_ps(S0 + x);
593 x1 = _mm_loadu_ps(S0 + x + 4);
594 y0 = _mm_loadu_ps(S1 + x);
595 y1 = _mm_loadu_ps(S1 + x + 4);
597 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
598 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
599 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
600 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
601 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
603 x0 = _mm_loadu_ps(S0 + x + 8);
604 x1 = _mm_loadu_ps(S0 + x + 12);
605 y0 = _mm_loadu_ps(S1 + x + 8);
606 y1 = _mm_loadu_ps(S1 + x + 12);
608 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
609 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
610 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
611 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
612 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
614 _mm_storeu_si128( (__m128i*)(dst + x), t0);
615 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
// Tail: 4 outputs per iteration, stored as the low 64 bits of the packed result.
618 for( ; x < width - 4; x += 4 )
622 x0 = _mm_loadu_ps(S0 + x);
623 y0 = _mm_loadu_ps(S1 + x);
625 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
626 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
627 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
628 _mm_storel_epi64( (__m128i*)(dst + x), t0);
// 16-bit output variants of the linear functor: a SHRT_MIN bias for the 16u
// name, no bias (0) for the 16s name.
635 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
636 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
// SSE functor: dst[x] = S0[x]*beta[0] + S1[x]*beta[1] on float rows, 8 outputs
// per iteration.
// NOTE(review): the declaration of x, the early return taken when SSE is not
// available, the `else` between the two loops and the final return are not
// visible in this view.
638 struct VResizeLinearVec_32f
640 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
642 if( !checkHardwareSupport(CV_CPU_SSE) )
645 const float** src = (const float**)_src;
646 const float* beta = (const float*)_beta;
647 const float *S0 = src[0], *S1 = src[1];
648 float* dst = (float*)_dst;
651 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
// Aligned-load path: taken only when both row pointers are 16-byte aligned.
653 if( (((size_t)S0|(size_t)S1)&15) == 0 )
654 for( ; x <= width - 8; x += 8 )
656 __m128 x0, x1, y0, y1;
657 x0 = _mm_load_ps(S0 + x);
658 x1 = _mm_load_ps(S0 + x + 4);
659 y0 = _mm_load_ps(S1 + x);
660 y1 = _mm_load_ps(S1 + x + 4);
662 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
663 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
665 _mm_storeu_ps( dst + x, x0);
666 _mm_storeu_ps( dst + x + 4, x1);
// Same computation with unaligned loads.
669 for( ; x <= width - 8; x += 8 )
671 __m128 x0, x1, y0, y1;
672 x0 = _mm_loadu_ps(S0 + x);
673 x1 = _mm_loadu_ps(S0 + x + 4);
674 y0 = _mm_loadu_ps(S1 + x);
675 y1 = _mm_loadu_ps(S1 + x + 4);
677 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
678 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
680 _mm_storeu_ps( dst + x, x0);
681 _mm_storeu_ps( dst + x + 4, x1);
// SSE2 functor: combines four rows of int values (src[0..3]) with the four short
// weights in _beta and writes `uchar` results to dst, 8 outputs per iteration.
// The weights are converted to float and pre-multiplied by
// 1 / (INTER_RESIZE_COEF_SCALE * INTER_RESIZE_COEF_SCALE); the rows are converted
// to float, accumulated, converted back to int32 and packed with saturation to
// 16 and then to unsigned 8 bits.
// NOTE(review): the declaration of x, the early return taken when SSE2 is not
// available, the `else` between the two loops and the final return are not
// visible in this view.
689 struct VResizeCubicVec_32s8u
691 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
693 if( !checkHardwareSupport(CV_CPU_SSE2) )
696 const int** src = (const int**)_src;
697 const short* beta = (const short*)_beta;
698 const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
700 float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
701 __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
702 b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
// Aligned-load path: taken only when all four row pointers are 16-byte aligned.
704 if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
705 for( ; x <= width - 8; x += 8 )
707 __m128i x0, x1, y0, y1;
708 __m128 s0, s1, f0, f1;
709 x0 = _mm_load_si128((const __m128i*)(S0 + x));
710 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
711 y0 = _mm_load_si128((const __m128i*)(S1 + x));
712 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
714 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
715 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
716 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
717 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
718 s0 = _mm_add_ps(s0, f0);
719 s1 = _mm_add_ps(s1, f1);
721 x0 = _mm_load_si128((const __m128i*)(S2 + x));
722 x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
723 y0 = _mm_load_si128((const __m128i*)(S3 + x));
724 y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
726 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
727 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
728 s0 = _mm_add_ps(s0, f0);
729 s1 = _mm_add_ps(s1, f1);
730 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
731 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
732 s0 = _mm_add_ps(s0, f0);
733 s1 = _mm_add_ps(s1, f1);
735 x0 = _mm_cvtps_epi32(s0);
736 x1 = _mm_cvtps_epi32(s1);
738 x0 = _mm_packs_epi32(x0, x1);
739 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
// Same computation with unaligned loads.
742 for( ; x <= width - 8; x += 8 )
744 __m128i x0, x1, y0, y1;
745 __m128 s0, s1, f0, f1;
746 x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
747 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
748 y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
749 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
751 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
752 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
753 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
754 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
755 s0 = _mm_add_ps(s0, f0);
756 s1 = _mm_add_ps(s1, f1);
758 x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
759 x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
760 y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
761 y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
763 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
764 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
765 s0 = _mm_add_ps(s0, f0);
766 s1 = _mm_add_ps(s1, f1);
767 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
768 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
769 s0 = _mm_add_ps(s0, f0);
770 s1 = _mm_add_ps(s1, f1);
772 x0 = _mm_cvtps_epi32(s0);
773 x1 = _mm_cvtps_epi32(s1);
775 x0 = _mm_packs_epi32(x0, x1);
776 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
// SSE2 functor: weighted sum of four float rows (src[0..3]) with the float
// weights beta[0..3], written as 16-bit values, 8 outputs per iteration using
// unaligned loads.
// `shiftval` is added to the 32-bit integers before the signed saturating pack
// (preshift) and again, as a 16-bit value, after it (postshift). The typedefs that
// follow this struct instantiate it with SHRT_MIN for the 16u variant and 0 for 16s.
// NOTE(review): the declarations of x and of the __m128i temporaries t0 / t1, the
// early return taken when SSE2 is not available and the final return are not
// visible in this view.
784 template<int shiftval> struct VResizeCubicVec_32f16
786 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
788 if( !checkHardwareSupport(CV_CPU_SSE2) )
791 const float** src = (const float**)_src;
792 const float* beta = (const float*)_beta;
793 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
794 ushort* dst = (ushort*)_dst;
796 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
797 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
798 __m128i preshift = _mm_set1_epi32(shiftval);
799 __m128i postshift = _mm_set1_epi16((short)shiftval);
801 for( ; x <= width - 8; x += 8 )
803 __m128 x0, x1, y0, y1, s0, s1;
// Rows 0 and 1.
805 x0 = _mm_loadu_ps(S0 + x);
806 x1 = _mm_loadu_ps(S0 + x + 4);
807 y0 = _mm_loadu_ps(S1 + x);
808 y1 = _mm_loadu_ps(S1 + x + 4);
810 s0 = _mm_mul_ps(x0, b0);
811 s1 = _mm_mul_ps(x1, b0);
812 y0 = _mm_mul_ps(y0, b1);
813 y1 = _mm_mul_ps(y1, b1);
814 s0 = _mm_add_ps(s0, y0);
815 s1 = _mm_add_ps(s1, y1);
// Rows 2 and 3.
817 x0 = _mm_loadu_ps(S2 + x);
818 x1 = _mm_loadu_ps(S2 + x + 4);
819 y0 = _mm_loadu_ps(S3 + x);
820 y1 = _mm_loadu_ps(S3 + x + 4);
822 x0 = _mm_mul_ps(x0, b2);
823 x1 = _mm_mul_ps(x1, b2);
824 y0 = _mm_mul_ps(y0, b3);
825 y1 = _mm_mul_ps(y1, b3);
826 s0 = _mm_add_ps(s0, x0);
827 s1 = _mm_add_ps(s1, x1);
828 s0 = _mm_add_ps(s0, y0);
829 s1 = _mm_add_ps(s1, y1);
// Convert to int32, bias, pack to 16 bits with signed saturation, bias again.
831 t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
832 t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
834 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
835 _mm_storeu_si128( (__m128i*)(dst + x), t0);
// 16-bit output variants of the cubic functor: a SHRT_MIN bias for the 16u name,
// no bias (0) for the 16s name.
842 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
843 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
// SSE functor: dst[x] = S0[x]*beta[0] + S1[x]*beta[1] + S2[x]*beta[2] + S3[x]*beta[3]
// on float rows, 8 outputs per iteration using unaligned loads and stores.
// NOTE(review): the declaration of x, the early return taken when SSE is not
// available and the final return are not visible in this view.
845 struct VResizeCubicVec_32f
847 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
849 if( !checkHardwareSupport(CV_CPU_SSE) )
852 const float** src = (const float**)_src;
853 const float* beta = (const float*)_beta;
854 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
855 float* dst = (float*)_dst;
857 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
858 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
860 for( ; x <= width - 8; x += 8 )
862 __m128 x0, x1, y0, y1, s0, s1;
// Rows 0 and 1.
863 x0 = _mm_loadu_ps(S0 + x);
864 x1 = _mm_loadu_ps(S0 + x + 4);
865 y0 = _mm_loadu_ps(S1 + x);
866 y1 = _mm_loadu_ps(S1 + x + 4);
868 s0 = _mm_mul_ps(x0, b0);
869 s1 = _mm_mul_ps(x1, b0);
870 y0 = _mm_mul_ps(y0, b1);
871 y1 = _mm_mul_ps(y1, b1);
872 s0 = _mm_add_ps(s0, y0);
873 s1 = _mm_add_ps(s1, y1);
// Rows 2 and 3.
875 x0 = _mm_loadu_ps(S2 + x);
876 x1 = _mm_loadu_ps(S2 + x + 4);
877 y0 = _mm_loadu_ps(S3 + x);
878 y1 = _mm_loadu_ps(S3 + x + 4);
880 x0 = _mm_mul_ps(x0, b2);
881 x1 = _mm_mul_ps(x1, b2);
882 y0 = _mm_mul_ps(y0, b3);
883 y1 = _mm_mul_ps(y1, b3);
884 s0 = _mm_add_ps(s0, x0);
885 s1 = _mm_add_ps(s1, x1);
886 s0 = _mm_add_ps(s0, y0);
887 s1 = _mm_add_ps(s1, y1);
889 _mm_storeu_ps( dst + x, s0);
890 _mm_storeu_ps( dst + x + 4, s1);
899 struct VResizeLinearVec_32s8u
901 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
903 const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
904 const short* beta = (const short*)_beta;
906 int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);
908 for( ; x <= width - 16; x += 16)
910 int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
911 int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);
913 int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
914 int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
916 int16x8_t v_dst0 = vmlaq_s16(vmulq_s16(v_src0, v_b0), v_src1, v_b1);
917 v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);
919 v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
920 v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);
922 v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
923 v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
925 int16x8_t v_dst1 = vmlaq_s16(vmulq_s16(v_src0, v_b0), v_src1, v_b1);
926 v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);
928 vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
// NEON functor: blends two float rows (src[0], src[1]) with the float weights
// beta[0], beta[1]; each result goes through cv_vrndq_u32_f32 and a saturating
// narrow to unsigned 16 bits. 8 outputs per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
935 struct VResizeLinearVec_32f16u
937 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
939 const float** src = (const float**)_src;
940 const float* beta = (const float*)_beta;
941 const float *S0 = src[0], *S1 = src[1];
942 ushort* dst = (ushort*)_dst;
945 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
947 for( ; x <= width - 8; x += 8 )
949 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
950 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
// S0*b0 + S1*b1 for each half of the 8-element group.
952 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
953 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
955 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
956 vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
// NEON functor: blends two float rows (src[0], src[1]) with the float weights
// beta[0], beta[1]; each result goes through cv_vrndq_s32_f32 and a saturating
// narrow to signed 16 bits. 8 outputs per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
963 struct VResizeLinearVec_32f16s
965 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
967 const float** src = (const float**)_src;
968 const float* beta = (const float*)_beta;
969 const float *S0 = src[0], *S1 = src[1];
970 short* dst = (short*)_dst;
973 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
975 for( ; x <= width - 8; x += 8 )
977 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
978 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
// S0*b0 + S1*b1 for each half of the 8-element group.
980 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
981 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
983 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
984 vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
// NEON functor: dst[x] = S0[x]*beta[0] + S1[x]*beta[1] on float rows, 8 outputs
// per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
991 struct VResizeLinearVec_32f
993 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
995 const float** src = (const float**)_src;
996 const float* beta = (const float*)_beta;
997 const float *S0 = src[0], *S1 = src[1];
998 float* dst = (float*)_dst;
1001 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
1003 for( ; x <= width - 8; x += 8 )
1005 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
1006 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
1008 vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
1009 vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
// The int-to-uchar cubic functor is aliased to the no-op VResizeNoVec here
// (this line sits among the NEON functors).
1016 typedef VResizeNoVec VResizeCubicVec_32s8u;
// NEON functor: weighted sum of four float rows (src[0..3]) with the float
// weights beta[0..3]; each result goes through cv_vrndq_u32_f32 and a saturating
// narrow to unsigned 16 bits. 8 outputs per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
1018 struct VResizeCubicVec_32f16u
1020 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1022 const float** src = (const float**)_src;
1023 const float* beta = (const float*)_beta;
1024 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1025 ushort* dst = (ushort*)_dst;
1027 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1028 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1030 for( ; x <= width - 8; x += 8 )
// b0*S0 + b1*S1 + b2*S2 + b3*S3, as a chain of multiply-accumulates.
1032 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1033 v_b1, vld1q_f32(S1 + x)),
1034 v_b2, vld1q_f32(S2 + x)),
1035 v_b3, vld1q_f32(S3 + x));
1036 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1037 v_b1, vld1q_f32(S1 + x + 4)),
1038 v_b2, vld1q_f32(S2 + x + 4)),
1039 v_b3, vld1q_f32(S3 + x + 4));
1041 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
1042 vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
// NEON functor: weighted sum of four float rows (src[0..3]) with the float
// weights beta[0..3]; each result goes through cv_vrndq_s32_f32 and a saturating
// narrow to signed 16 bits. 8 outputs per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
1049 struct VResizeCubicVec_32f16s
1051 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1053 const float** src = (const float**)_src;
1054 const float* beta = (const float*)_beta;
1055 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1056 short* dst = (short*)_dst;
1058 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1059 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1061 for( ; x <= width - 8; x += 8 )
// b0*S0 + b1*S1 + b2*S2 + b3*S3, as a chain of multiply-accumulates.
1063 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1064 v_b1, vld1q_f32(S1 + x)),
1065 v_b2, vld1q_f32(S2 + x)),
1066 v_b3, vld1q_f32(S3 + x));
1067 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1068 v_b1, vld1q_f32(S1 + x + 4)),
1069 v_b2, vld1q_f32(S2 + x + 4)),
1070 v_b3, vld1q_f32(S3 + x + 4));
1072 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
1073 vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
// NEON functor: dst[x] = b0*S0[x] + b1*S1[x] + b2*S2[x] + b3*S3[x] on float rows,
// 8 outputs per iteration.
// NOTE(review): the declaration of x and the final return are not visible in this view.
1080 struct VResizeCubicVec_32f
1082 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1084 const float** src = (const float**)_src;
1085 const float* beta = (const float*)_beta;
1086 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1087 float* dst = (float*)_dst;
1089 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1090 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1092 for( ; x <= width - 8; x += 8 )
// First and second group of four outputs.
1094 vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1095 v_b1, vld1q_f32(S1 + x)),
1096 v_b2, vld1q_f32(S2 + x)),
1097 v_b3, vld1q_f32(S3 + x)));
1098 vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1099 v_b1, vld1q_f32(S1 + x + 4)),
1100 v_b2, vld1q_f32(S2 + x + 4)),
1101 v_b3, vld1q_f32(S3 + x + 4)));
1110 typedef VResizeNoVec VResizeLinearVec_32s8u;
1111 typedef VResizeNoVec VResizeLinearVec_32f16u;
1112 typedef VResizeNoVec VResizeLinearVec_32f16s;
1113 typedef VResizeNoVec VResizeLinearVec_32f;
1115 typedef VResizeNoVec VResizeCubicVec_32s8u;
1116 typedef VResizeNoVec VResizeCubicVec_32f16u;
1117 typedef VResizeNoVec VResizeCubicVec_32f16s;
1118 typedef VResizeNoVec VResizeCubicVec_32f;
1122 typedef HResizeNoVec HResizeLinearVec_8u32s;
1123 typedef HResizeNoVec HResizeLinearVec_16u32f;
1124 typedef HResizeNoVec HResizeLinearVec_16s32f;
1125 typedef HResizeNoVec HResizeLinearVec_32f;
1126 typedef HResizeNoVec HResizeLinearVec_64f;
1129 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1130 struct HResizeLinear
1132 typedef T value_type;
1133 typedef WT buf_type;
1134 typedef AT alpha_type;
1136 void operator()(const T** src, WT** dst, int count,
1137 const int* xofs, const AT* alpha,
1138 int swidth, int dwidth, int cn, int xmin, int xmax ) const
1143 int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
1144 xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
1146 for( k = 0; k <= count - 2; k++ )
1148 const T *S0 = src[k], *S1 = src[k+1];
1149 WT *D0 = dst[k], *D1 = dst[k+1];
1150 for( dx = dx0; dx < xmax; dx++ )
1153 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1154 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1155 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1156 D0[dx] = t0; D1[dx] = t1;
1159 for( ; dx < dwidth; dx++ )
1162 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
1166 for( ; k < count; k++ )
1168 const T *S = src[k];
1170 for( dx = 0; dx < xmax; dx++ )
1173 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1176 for( ; dx < dwidth; dx++ )
1177 D[dx] = WT(S[xofs[dx]]*ONE);
1183 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1184 struct VResizeLinear
1186 typedef T value_type;
1187 typedef WT buf_type;
1188 typedef AT alpha_type;
1190 void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1192 WT b0 = beta[0], b1 = beta[1];
1193 const WT *S0 = src[0], *S1 = src[1];
1197 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1198 #if CV_ENABLE_UNROLLED
1199 for( ; x <= width - 4; x += 4 )
1202 t0 = S0[x]*b0 + S1[x]*b1;
1203 t1 = S0[x+1]*b0 + S1[x+1]*b1;
1204 dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1205 t0 = S0[x+2]*b0 + S1[x+2]*b1;
1206 t1 = S0[x+3]*b0 + S1[x+3]*b1;
1207 dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1210 for( ; x < width; x++ )
1211 dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
1216 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1218 typedef uchar value_type;
1219 typedef int buf_type;
1220 typedef short alpha_type;
1222 void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1224 alpha_type b0 = beta[0], b1 = beta[1];
1225 const buf_type *S0 = src[0], *S1 = src[1];
1226 VResizeLinearVec_32s8u vecOp;
1228 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1229 #if CV_ENABLE_UNROLLED
1230 for( ; x <= width - 4; x += 4 )
1232 dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1233 dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1234 dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1235 dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1238 for( ; x < width; x++ )
1239 dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
// Horizontal pass of the bicubic resize (4 taps per destination column).
//   T  - source element type
//   WT - intermediate (buffer) element type
//   AT - type of the interpolation weights
template<typename T, typename WT, typename AT>
struct HResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    // src/dst : `count` source rows and the matching intermediate rows
    // xofs    : source offset of the second tap for every destination column
    // alpha   : four weights per destination column
    // swidth  : number of elements in a source row
    // dwidth  : number of destination columns
    // cn      : distance (in elements) between horizontally adjacent samples
    // [xmin, xmax) : destination columns whose four taps all lie inside the
    //                source row, so no bounds handling is needed there
    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            // Two rounds: border columns [0, xmin), interior [xmin, xmax),
            // then border columns [xmax, dwidth).
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 4 )
                {
                    int j, sx = xofs[dx] - cn;
                    WT v = 0;
                    for( j = 0; j < 4; j++ )
                    {
                        int sxj = sx + j*cn;
                        // Out-of-range tap: step by cn back into the row so
                        // the nearest sample of the same channel is reused.
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 4 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
                        S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
                }
                limit = dwidth;
            }
            // Rewind the weights for the next row.
            alpha -= dwidth*4;
        }
    }
};
1296 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1299 typedef T value_type;
1300 typedef WT buf_type;
1301 typedef AT alpha_type;
1303 void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1305 WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1306 const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1310 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1311 for( ; x < width; x++ )
1312 dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
// Horizontal pass of the Lanczos resize (8 taps per destination column).
//   T  - source element type
//   WT - intermediate (buffer) element type
//   AT - type of the interpolation weights
template<typename T, typename WT, typename AT>
struct HResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    // src/dst : `count` source rows and the matching intermediate rows
    // xofs    : source offset of the fourth tap for every destination column
    // alpha   : eight weights per destination column
    // swidth  : number of elements in a source row
    // dwidth  : number of destination columns
    // cn      : distance (in elements) between horizontally adjacent samples
    // [xmin, xmax) : destination columns whose eight taps all lie inside the
    //                source row, so no bounds handling is needed there
    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            // Two rounds: border columns [0, xmin), interior [xmin, xmax),
            // then border columns [xmax, dwidth).
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 8 )
                {
                    int j, sx = xofs[dx] - cn*3;
                    WT v = 0;
                    for( j = 0; j < 8; j++ )
                    {
                        int sxj = sx + j*cn;
                        // Out-of-range tap: step by cn back into the row so
                        // the nearest sample of the same channel is reused.
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 8 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
                        S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
                        S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
                        S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
                }
                limit = dwidth;
            }
            // Rewind the weights for the next row.
            alpha -= dwidth*8;
        }
    }
};
1371 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1372 struct VResizeLanczos4
1374 typedef T value_type;
1375 typedef WT buf_type;
1376 typedef AT alpha_type;
1378 void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1382 int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1383 #if CV_ENABLE_UNROLLED
1384 for( ; x <= width - 4; x += 4 )
1387 const WT* S = src[0];
1388 WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1390 for( k = 1; k < 8; k++ )
1392 b = beta[k]; S = src[k];
1393 s0 += S[x]*b; s1 += S[x+1]*b;
1394 s2 += S[x+2]*b; s3 += S[x+3]*b;
1397 dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1398 dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1401 for( ; x < width; x++ )
1403 dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1404 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1405 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
// Clamps x to the half-open range [a, b): values below a become a, values at
// or above b become b-1.
static inline int clip(int x, int a, int b)
{
    return x >= a ? (x < b ? x : b-1) : a;
}
// Upper bound on the number of filter rows (ksize) the generic resize invoker
// can handle; it sizes that invoker's fixed row-pointer arrays.
static const int MAX_ESIZE=16;
1418 template <typename HResize, typename VResize>
1419 class resizeGeneric_Invoker :
1420 public ParallelLoopBody
1423 typedef typename HResize::value_type T;
1424 typedef typename HResize::buf_type WT;
1425 typedef typename HResize::alpha_type AT;
1427 resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1428 const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1429 int _ksize, int _xmin, int _xmax) :
1430 ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1431 alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1432 ksize(_ksize), xmin(_xmin), xmax(_xmax)
1434 CV_Assert(ksize <= MAX_ESIZE);
1437 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1438 # pragma GCC diagnostic push
1439 # pragma GCC diagnostic ignored "-Warray-bounds"
1441 virtual void operator() (const Range& range) const
1443 int dy, cn = src.channels();
1447 int bufstep = (int)alignSize(dsize.width, 16);
1448 AutoBuffer<WT> _buffer(bufstep*ksize);
1449 const T* srows[MAX_ESIZE]={0};
1450 WT* rows[MAX_ESIZE]={0};
1451 int prev_sy[MAX_ESIZE];
1453 for(int k = 0; k < ksize; k++ )
1456 rows[k] = (WT*)_buffer + bufstep*k;
1459 const AT* beta = _beta + ksize * range.start;
1461 for( dy = range.start; dy < range.end; dy++, beta += ksize )
1463 int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1465 for(int k = 0; k < ksize; k++ )
1467 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1468 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1470 if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1473 memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1478 k0 = std::min(k0, k); // remember the first row that needs to be computed
1479 srows[k] = src.template ptr<T>(sy);
1484 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1485 ssize.width, dsize.width, cn, xmin, xmax );
1486 vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1489 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1490 # pragma GCC diagnostic pop
1496 const int* xofs, *yofs;
1497 const AT* alpha, *_beta;
1499 const int ksize, xmin, xmax;
1501 resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
1504 template<class HResize, class VResize>
1505 static void resizeGeneric_( const Mat& src, Mat& dst,
1506 const int* xofs, const void* _alpha,
1507 const int* yofs, const void* _beta,
1508 int xmin, int xmax, int ksize )
1510 typedef typename HResize::alpha_type AT;
1512 const AT* beta = (const AT*)_beta;
1513 Size ssize = src.size(), dsize = dst.size();
1514 int cn = src.channels();
1519 // image resize is a separable operation. In case of not too strong
1521 Range range(0, dsize.height);
1522 resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1523 ssize, dsize, ksize, xmin, xmax);
1524 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// Placeholder used when no SIMD implementation of the fast area resize is
// available: it accepts the same constructor arguments as the real helpers
// and reports that it processed zero elements.
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    ResizeAreaFastNoVec(int, int) { }
    ResizeAreaFastNoVec(int, int, int, int) { }
    // Always returns 0, i.e. the caller must process the whole row itself.
    int operator() (const T*, T*, int) const
    { return 0; }
};
1538 class ResizeAreaFastVec_SIMD_8u
1541 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1542 cn(_cn), step(_step)
1546 int operator() (const uchar* S, uchar* D, int w) const
1549 const uchar* S0 = S, * S1 = S0 + step;
1551 uint16x8_t v_2 = vdupq_n_u16(2);
1555 for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
1557 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
1559 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
1560 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
1561 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
1563 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
1564 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
1565 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
1567 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
1572 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1574 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
1576 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
1577 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
1578 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
1579 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
1581 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
1582 vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
1583 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
1584 vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
1585 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
1587 vst1_u8(D, vmovn_u16(v_dst));
1598 class ResizeAreaFastVec_SIMD_16u
1601 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1602 cn(_cn), step(_step)
1606 int operator() (const ushort * S, ushort * D, int w) const
1609 const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
1611 uint32x4_t v_2 = vdupq_n_u32(2);
1615 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1617 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
1619 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
1620 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
1621 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
1623 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
1624 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
1625 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
1627 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
1632 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1634 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
1635 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
1636 vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
1637 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
1650 class ResizeAreaFastVec_SIMD_8u
1653 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1654 cn(_cn), step(_step)
1656 use_simd = checkHardwareSupport(CV_CPU_SSE2);
1659 int operator() (const uchar* S, uchar* D, int w) const
1665 const uchar* S0 = S;
1666 const uchar* S1 = S0 + step;
1667 __m128i zero = _mm_setzero_si128();
1668 __m128i delta2 = _mm_set1_epi16(2);
1672 __m128i masklow = _mm_set1_epi16(0x00ff);
1673 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1675 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1676 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1678 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
1679 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
1680 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
1681 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1683 _mm_storel_epi64((__m128i*)D, s0);
1687 for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
1689 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1690 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1692 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1693 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
1694 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1695 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
1697 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
1698 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
1699 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1700 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1701 _mm_storel_epi64((__m128i*)D, s0);
1703 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
1704 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
1705 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1706 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1707 _mm_storel_epi64((__m128i*)(D+3), s0);
1712 int v[] = { 0, 0, -1, -1 };
1713 __m128i mask = _mm_loadu_si128((const __m128i*)v);
1715 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1717 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1718 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1720 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1721 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
1722 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1723 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
1725 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
1726 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
1727 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1728 __m128i res0 = _mm_srli_epi16(s0, 2);
1730 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
1731 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
1732 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1733 __m128i res1 = _mm_srli_epi16(s0, 2);
1734 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
1735 _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
1736 _mm_storel_epi64((__m128i*)(D), s0);
1749 class ResizeAreaFastVec_SIMD_16u
1752 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1753 cn(_cn), step(_step)
1755 use_simd = checkHardwareSupport(CV_CPU_SSE2);
1758 int operator() (const ushort* S, ushort* D, int w) const
1764 const ushort* S0 = (const ushort*)S;
1765 const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
1766 __m128i masklow = _mm_set1_epi32(0x0000ffff);
1767 __m128i zero = _mm_setzero_si128();
1768 __m128i delta2 = _mm_set1_epi32(2);
1770 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
1774 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1776 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1777 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1779 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
1780 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
1781 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
1782 s0 = _mm_srli_epi32(s0, 2);
1783 s0 = _mm_packus_epi32(s0, zero);
1785 _mm_storel_epi64((__m128i*)D, s0);
1789 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
1791 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1792 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1794 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
1795 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
1796 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
1797 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
1799 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
1800 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
1801 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
1802 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
1803 _mm_storel_epi64((__m128i*)D, s0);
1808 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1810 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1811 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1813 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
1814 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
1815 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
1816 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
1818 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
1819 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
1820 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
1821 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
1822 _mm_storel_epi64((__m128i*)D, s0);
1826 #undef _mm_packus_epi32
1838 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
1839 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
1842 template<typename T, typename SIMDVecOp>
1843 struct ResizeAreaFastVec
1845 ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
1846 scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
1848 fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
1851 int operator() (const T* S, T* D, int w) const
1856 const T* nextS = (const T*)((const uchar*)S + step);
1857 int dx = vecOp(S, D, w);
1860 for( ; dx < w; ++dx )
1863 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
1866 for( ; dx < w; dx += 3 )
1869 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
1870 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
1871 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
1876 for( ; dx < w; dx += 4 )
1879 D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
1880 D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
1881 D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
1882 D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
1890 int scale_x, scale_y;
1897 template <typename T, typename WT, typename VecOp>
1898 class resizeAreaFast_Invoker :
1899 public ParallelLoopBody
1902 resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
1903 int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
1904 ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
1905 scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
1909 virtual void operator() (const Range& range) const
1911 Size ssize = src.size(), dsize = dst.size();
1912 int cn = src.channels();
1913 int area = scale_x*scale_y;
1914 float scale = 1.f/(area);
1915 int dwidth1 = (ssize.width/scale_x)*cn;
1920 VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
1922 for( dy = range.start; dy < range.end; dy++ )
1924 T* D = (T*)(dst.data + dst.step*dy);
1925 int sy0 = dy*scale_y;
1926 int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
1928 if( sy0 >= ssize.height )
1930 for( dx = 0; dx < dsize.width; dx++ )
1935 dx = vop(src.template ptr<T>(sy0), D, w);
1936 for( ; dx < w; dx++ )
1938 const T* S = src.template ptr<T>(sy0) + xofs[dx];
1941 #if CV_ENABLE_UNROLLED
1942 for( ; k <= area - 4; k += 4 )
1943 sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
1945 for( ; k < area; k++ )
1948 D[dx] = saturate_cast<T>(sum * scale);
1951 for( ; dx < dsize.width; dx++ )
1954 int count = 0, sx0 = xofs[dx];
1955 if( sx0 >= ssize.width )
1958 for( int sy = 0; sy < scale_y; sy++ )
1960 if( sy0 + sy >= ssize.height )
1962 const T* S = src.template ptr<T>(sy0 + sy) + sx0;
1963 for( int sx = 0; sx < scale_x*cn; sx += cn )
1965 if( sx0 + sx >= ssize.width )
1972 D[dx] = saturate_cast<T>((float)sum/count);
1980 int scale_x, scale_y;
1981 const int *ofs, *xofs;
1984 template<typename T, typename WT, typename VecOp>
1985 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
1986 int scale_x, int scale_y )
1988 Range range(0, dst.rows);
1989 resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
1990 scale_y, ofs, xofs);
1991 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// One entry of an area-resize weight table: source index `si` contributes to
// destination index `di` with weight `alpha`.
struct DecimateAlpha
{
    int si, di;
    float alpha;
};
2001 template<typename T, typename WT> class ResizeArea_Invoker :
2002 public ParallelLoopBody
2005 ResizeArea_Invoker( const Mat& _src, Mat& _dst,
2006 const DecimateAlpha* _xtab, int _xtab_size,
2007 const DecimateAlpha* _ytab, int _ytab_size,
2008 const int* _tabofs )
2013 xtab_size0 = _xtab_size;
2015 ytab_size = _ytab_size;
2019 virtual void operator() (const Range& range) const
2021 Size dsize = dst->size();
2022 int cn = dst->channels();
2024 AutoBuffer<WT> _buffer(dsize.width*2);
2025 const DecimateAlpha* xtab = xtab0;
2026 int xtab_size = xtab_size0;
2027 WT *buf = _buffer, *sum = buf + dsize.width;
2028 int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
2030 for( dx = 0; dx < dsize.width; dx++ )
2033 for( j = j_start; j < j_end; j++ )
2035 WT beta = ytab[j].alpha;
2036 int dy = ytab[j].di;
2037 int sy = ytab[j].si;
2040 const T* S = src->template ptr<T>(sy);
2041 for( dx = 0; dx < dsize.width; dx++ )
2045 for( k = 0; k < xtab_size; k++ )
2047 int dxn = xtab[k].di;
2048 WT alpha = xtab[k].alpha;
2049 buf[dxn] += S[xtab[k].si]*alpha;
2052 for( k = 0; k < xtab_size; k++ )
2054 int sxn = xtab[k].si;
2055 int dxn = xtab[k].di;
2056 WT alpha = xtab[k].alpha;
2057 WT t0 = buf[dxn] + S[sxn]*alpha;
2058 WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2059 buf[dxn] = t0; buf[dxn+1] = t1;
2062 for( k = 0; k < xtab_size; k++ )
2064 int sxn = xtab[k].si;
2065 int dxn = xtab[k].di;
2066 WT alpha = xtab[k].alpha;
2067 WT t0 = buf[dxn] + S[sxn]*alpha;
2068 WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2069 WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
2070 buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
2074 for( k = 0; k < xtab_size; k++ )
2076 int sxn = xtab[k].si;
2077 int dxn = xtab[k].di;
2078 WT alpha = xtab[k].alpha;
2079 WT t0 = buf[dxn] + S[sxn]*alpha;
2080 WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2081 buf[dxn] = t0; buf[dxn+1] = t1;
2082 t0 = buf[dxn+2] + S[sxn+2]*alpha;
2083 t1 = buf[dxn+3] + S[sxn+3]*alpha;
2084 buf[dxn+2] = t0; buf[dxn+3] = t1;
2089 for( k = 0; k < xtab_size; k++ )
2091 int sxn = xtab[k].si;
2092 int dxn = xtab[k].di;
2093 WT alpha = xtab[k].alpha;
2094 for( int c = 0; c < cn; c++ )
2095 buf[dxn + c] += S[sxn + c]*alpha;
2102 T* D = dst->template ptr<T>(prev_dy);
2104 for( dx = 0; dx < dsize.width; dx++ )
2106 D[dx] = saturate_cast<T>(sum[dx]);
2107 sum[dx] = beta*buf[dx];
2113 for( dx = 0; dx < dsize.width; dx++ )
2114 sum[dx] += beta*buf[dx];
2119 T* D = dst->template ptr<T>(prev_dy);
2120 for( dx = 0; dx < dsize.width; dx++ )
2121 D[dx] = saturate_cast<T>(sum[dx]);
2128 const DecimateAlpha* xtab0;
2129 const DecimateAlpha* ytab;
2130 int xtab_size0, ytab_size;
2135 template <typename T, typename WT>
2136 static void resizeArea_( const Mat& src, Mat& dst,
2137 const DecimateAlpha* xtab, int xtab_size,
2138 const DecimateAlpha* ytab, int ytab_size,
2141 parallel_for_(Range(0, dst.rows),
2142 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
2143 dst.total()/((double)(1 << 16)));
2147 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
2148 const int* xofs, const void* alpha,
2149 const int* yofs, const void* beta,
2150 int xmin, int xmax, int ksize );
2152 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
2153 const int* ofs, const int *xofs,
2154 int scale_x, int scale_y );
2156 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
2157 const DecimateAlpha* xtab, int xtab_size,
2158 const DecimateAlpha* ytab, int ytab_size,
2162 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
2165 for(int dx = 0; dx < dsize; dx++ )
2167 double fsx1 = dx * scale;
2168 double fsx2 = fsx1 + scale;
2169 double cellWidth = std::min(scale, ssize - fsx1);
2171 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2173 sx2 = std::min(sx2, ssize - 1);
2174 sx1 = std::min(sx1, sx2);
2176 if( sx1 - fsx1 > 1e-3 )
2178 assert( k < ssize*2 );
2179 tab[k].di = dx * cn;
2180 tab[k].si = (sx1 - 1) * cn;
2181 tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
2184 for(int sx = sx1; sx < sx2; sx++ )
2186 assert( k < ssize*2 );
2187 tab[k].di = dx * cn;
2188 tab[k].si = sx * cn;
2189 tab[k++].alpha = float(1.0 / cellWidth);
2192 if( fsx2 - sx2 > 1e-3 )
2194 assert( k < ssize*2 );
2195 tab[k].di = dx * cn;
2196 tab[k].si = sx2 * cn;
2197 tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
// Helper macros for the IPP resize invoker. They expand inside a void
// function and rely on names from the surrounding scope: ok, mode, func,
// srcSize, dstSize, specSize, initSize, specBuf, pSpec, getBufferSizeFunc and
// getSrcOffsetFunc.

// Flags failure through *ok and leaves the enclosing function when an IPP call
// reports a negative status.
#define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }

// Selects the linear IPP kernel for TYPE/CN, sizes and allocates the spec
// buffer and initialises it.
#define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
    func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));

// Variant used for the 64f cases: cubic mode is rejected, the spec is a 64f
// spec, and the helper function pointers are set here as well.
#define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
    if (mode == (int)ippCubic) { *ok = false; return; } \
    func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
    getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
    getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;

// Selects the cubic IPP kernel for TYPE/CN; initialisation needs a temporary
// buffer of initSize bytes and uses the parameters (0.f, 0.75f).
#define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
    func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    AutoBuffer<uchar> buf(initSize);\
    uchar* pInit = (uchar*)buf;\
    CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));

// Dispatches on the interpolation mode (linear or cubic; anything else fails)
// and sets the helper function pointers for TYPE.
#define SET_IPP_RESIZE_PTR(TYPE, CN) \
    if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
    else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
    else { *ok = false; return; } \
    getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
    getSrcOffsetFunc = (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
#if IPP_VERSION_X100 >= 701
// Parallel body that resizes a band of rows with Intel IPP. The constructor
// picks the IPP kernel for the source type and interpolation mode and prepares
// the spec buffer; any failure is reported through *ok so that the caller can
// fall back to another implementation.
class IPPresizeInvoker :
    public ParallelLoopBody
{
public:
    // _mode : IPP interpolation mode (compared against ippLinear / ippCubic)
    // _ok   : set to true on entry and to false on any failure
    IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
        ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
        inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
        func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
    {
        *ok = true;
        IppiSize srcSize, dstSize;
        int type = src.type(), specSize = 0, initSize = 0;
        srcSize.width  = src.cols;
        srcSize.height = src.rows;
        dstSize.width  = dst.cols;
        dstSize.height = dst.rows;

        // The SET_IPP_* macros return from the constructor on failure.
        switch (type)
        {
#if 0 // disabled since it breaks tests for CascadeClassifier
            case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
            case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
            case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
#endif
            case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
            case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
            case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
            case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
            case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
            case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
            case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
            case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
            case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
            case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
            case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
            case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
            default: { *ok = false; return; } break;
        }
    }

    ~IPPresizeInvoker()
    {
    }

    // range is given in source rows; it is mapped to destination rows with
    // inv_scale_y.
    virtual void operator() (const Range& range) const
    {
        if (*ok == false)
            return;

        int cn = src.channels();
        int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
        int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
        int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);

        IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
        IppiSize  dstSize   = { dstwidth, dstheight - dsty };
        int bufsize = 0, itemSize = (int)src.elemSize1();

        CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
        CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));

        const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
        Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;

        // Over-allocate so that the work buffer can be aligned to 32 bytes.
        AutoBuffer<uchar> buf(bufsize + 64);
        uchar* bufptr = alignPtr((uchar*)buf, 32);

        if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
            *ok = false;
        else
        {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
        }
    }
private:
    const Mat & src;
    Mat & dst;
    double inv_scale_x;
    double inv_scale_y;
    void *pSpec;
    AutoBuffer<uchar> specBuf;
    int mode;
    ippiResizeFunc func;
    ippiResizeGetBufferSize getBufferSizeFunc;
    ippiResizeGetSrcOffset getSrcOffsetFunc;
    bool *ok;
    // Declared but not defined: the invoker is not assignable.
    const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
};

#endif
// Builds the per-axis lookup tables used by the OpenCL INTER_AREA kernel.
// For every destination index dx the source interval [dx*scale, dx*scale+scale)
// is decomposed into a partial leading cell, whole cells and a partial trailing
// cell; each piece contributes one weight to alpha_tab (covered length divided
// by the interval width) at running index k.
//   ssize     - source extent along this axis
//   dsize     - destination extent along this axis
//   scale     - source cells per destination cell (callers pass inv_fx / inv_fy)
//   map_tab   - output: source index for each weight
//   alpha_tab - output: weight for each entry
//   ofs_tab   - output table; its writes are not visible in this listing
// NOTE(review): the declarations of k and dx and some map_tab / ofs_tab
// assignments are on lines missing from this listing - confirm against the
// complete file.
2332 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
2333 float * const alpha_tab, int * const ofs_tab)
2336 for ( ; dx < dsize; dx++)
// Source interval covered by destination cell dx; its width is clipped so it
// does not extend past the end of the source.
2340 double fsx1 = dx * scale;
2341 double fsx2 = fsx1 + scale;
2342 double cellWidth = std::min(scale, ssize - fsx1);
// First and one-past-last whole source cells inside the interval, clamped to
// the valid index range.
2344 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2346 sx2 = std::min(sx2, ssize - 1);
2347 sx1 = std::min(sx1, sx2);
// Partial cell to the left of sx1 (ignored when the overhang is <= 1e-3).
2349 if (sx1 - fsx1 > 1e-3)
2351 map_tab[k] = sx1 - 1;
2352 alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
// Fully covered cells each weigh 1 / cellWidth.
2355 for (int sx = sx1; sx < sx2; sx++)
2358 alpha_tab[k++] = float(1.0 / cellWidth);
// Partial cell starting at sx2; the covered length is capped at one cell and
// at the interval width.
2361 if (fsx2 - sx2 > 1e-3)
2364 alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
// OpenCL implementation of resize. Depending on `interpolation` it configures
// one kernel from ocl::imgproc::resize_oclsrc (resizeSampler, resizeLN,
// resizeNN, resizeAREA_FAST or resizeAREA) and runs it over a
// dst.cols x dst.rows global range.
//   _src, _dst    - input / output arrays; _dst is created as dsize with the
//                   source type
//   dsize         - destination size
//   fx, fy        - destination/source scale factors (cv::resize passes
//                   dsize.width/ssize.width and dsize.height/ssize.height)
//   interpolation - INTER_NEAREST, INTER_LINEAR or INTER_AREA
// Returns the result of k.run(...) on the visible return paths.
// NOTE(review): this listing is missing some original lines (braces, several
// declarations such as the kernel object k and the conversion string buffers,
// and early-return statements) - confirm against the complete file.
2370 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
2371 double fx, double fy, int interpolation)
2373 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// The kernels work with source-per-destination ratios, i.e. the reciprocals.
2375 double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
2376 float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
// Each axis gets its own integer scale (iscale_y from inv_fy, not inv_fx) so
// that is_area_fast can also hold for unequal integer factors such as 2 x 4;
// this mirrors the per-axis computation in cv::resize.
2377 int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
2378 bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
2379 std::abs(inv_fy - iscale_y) < DBL_EPSILON;
2381 // when scale_x and scale_y are both equal to 2,
2382 // INTER_AREA (fast) gives the same result as INTER_LINEAR
2383 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2384 /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
// Tail of the supported-configuration check (its first line is not visible):
// INTER_AREA is only accepted for down-scaling along both axes.
2387 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
2388 (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
2391 UMat src = _src.getUMat();
2392 _dst.create(dsize, type);
2393 UMat dst = _dst.getUMat();
2395 Size ssize = src.size();
// One work item per destination pixel. The explicit casts avoid an
// int -> size_t narrowing conversion inside the braced initializer.
2397 size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
2399 ocl::Image2D srcImage;
2401 // See if this could be done with a sampler. We stick with integer
2402 // datatypes because the observed error is low.
2403 bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
2404 ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
2405 ocl::Image2D::isFormatSupported(depth, cn, true) &&
2409 int wdepth = std::max(depth, CV_32S);
2411 cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
2412 "-D convertToDT=%s -D cn=%d",
2413 depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2414 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2416 k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
2422 // Convert the input into an OpenCL image type, using normalized channel data types
2423 // and aliasing the UMat.
2424 srcImage = ocl::Image2D(src, true, true);
2425 k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
2426 (float)inv_fx, (float)inv_fy);
// Bilinear interpolation without the image sampler.
2430 if (interpolation == INTER_LINEAR && !useSampler)
2434 // integer path is slower because of CPU part, so it's disabled
2435 if (depth == CV_8U && ((void)0, 0))
// Single buffer holding xofs, yofs (int) followed by ialpha, ibeta (two shorts
// per destination column / row).
2437 AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
2438 int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
2439 short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
2443 for (int dx = 0; dx < dsize.width; dx++)
2445 fxx = (float)((dx+0.5)*inv_fx - 0.5);
2452 if (sx >= ssize.width-1)
2453 fxx = 0, sx = ssize.width-1;
2456 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
2457 ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
2460 for (int dy = 0; dy < dsize.height; dy++)
2462 fyy = (float)((dy+0.5)*inv_fy - 0.5);
2467 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
2468 ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
2471 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
// Upload the packed coefficient buffer as a 1-row byte matrix.
2473 Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
2475 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2476 format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
2477 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2478 "-D INTER_RESIZE_COEF_BITS=%d",
2479 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2480 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2481 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2482 cn, INTER_RESIZE_COEF_BITS));
2486 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2487 ocl::KernelArg::PtrReadOnly(coeffs));
// Bilinear variant that receives the inverse scales as kernel arguments
// instead of a precomputed coefficient buffer.
2491 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2492 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2493 format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
2494 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2495 "-D INTER_RESIZE_COEF_BITS=%d",
2496 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2497 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2498 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2499 cn, INTER_RESIZE_COEF_BITS));
2503 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2504 (float)inv_fx, (float)inv_fy);
2507 else if (interpolation == INTER_NEAREST)
2509 k.create("resizeNN", ocl::imgproc::resize_oclsrc,
2510 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
2511 ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
2515 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2516 (float)inv_fx, (float)inv_fy);
2518 else if (interpolation == INTER_AREA)
// Accumulate in at least 32-bit integers for the fast (integer-factor) case
// and in at least 32-bit floats otherwise.
2520 int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
2521 int wtype = CV_MAKE_TYPE(wdepth, cn);
2524 String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
2525 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2526 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
2528 UMat alphaOcl, tabofsOcl, mapOcl;
// Fast path: the integer factors and the averaging coefficient
// 1 / (iscale_x * iscale_y) are baked into the kernel build options.
2533 int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
2534 buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
2535 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
2536 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
2537 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
2538 iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
2540 k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
// General path: precompute map / alpha / offset tables on the host for both
// axes and upload them.
2546 buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
2547 k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
2551 int xytab_size = (ssize.width + ssize.height) << 1;
2552 int tabofs_size = dsize.height + dsize.width + 2;
2554 AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
2555 AutoBuffer<float> _xyalpha_tab(xytab_size);
// The x tables occupy the first 2*ssize.width (map/alpha) and dsize.width+1
// (offset) entries; the y tables follow in the same buffers.
2556 int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
2557 float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
2558 int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
2560 ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
2561 ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
2563 // loading precomputed arrays to GPU
2564 Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
2565 Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
2566 Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
2569 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
// Two argument lists: source/destination only, or additionally the inverse
// scales and the three tables. The condition choosing between them is not
// visible in this listing.
2572 k.args(srcarg, dstarg);
2574 k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
2575 ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
2577 return k.run(2, globalsize, NULL, false);
2580 return k.run(2, globalsize, 0, false);
2587 //////////////////////////////////////////////////////////////////////////////////////////
// Resizes _src into _dst.
//   dsize                    - destination size; when its area is 0 it is
//                              computed from the scale factors below
//   inv_scale_x, inv_scale_y - destination/source scale factors; recomputed
//                              from dsize further down
//   interpolation            - INTER_NEAREST, INTER_LINEAR, INTER_CUBIC,
//                              INTER_AREA or INTER_LANCZOS4; any other value
//                              reaches CV_Error(CV_StsBadArg, ...)
// The function tries, in order, the OpenCL path, the Tegra and IPP paths (when
// compiled in) and finally the generic CPU implementations.
// NOTE(review): this listing is missing some original lines (braces, else
// keywords, several declarations and early returns; see the gaps in the
// embedded line numbers) - confirm control flow against the complete file.
2589 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
2590 double inv_scale_x, double inv_scale_y, int interpolation )
// Dispatch tables below are indexed by the matrix depth (see the
// xxx_tab[depth] lookups later in this function).
2592 static ResizeFunc linear_tab[] =
2595 HResizeLinear<uchar, int, short,
2596 INTER_RESIZE_COEF_SCALE,
2597 HResizeLinearVec_8u32s>,
2598 VResizeLinear<uchar, int, short,
2599 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2600 VResizeLinearVec_32s8u> >,
2603 HResizeLinear<ushort, float, float, 1,
2604 HResizeLinearVec_16u32f>,
2605 VResizeLinear<ushort, float, float, Cast<float, ushort>,
2606 VResizeLinearVec_32f16u> >,
2608 HResizeLinear<short, float, float, 1,
2609 HResizeLinearVec_16s32f>,
2610 VResizeLinear<short, float, float, Cast<float, short>,
2611 VResizeLinearVec_32f16s> >,
2614 HResizeLinear<float, float, float, 1,
2615 HResizeLinearVec_32f>,
2616 VResizeLinear<float, float, float, Cast<float, float>,
2617 VResizeLinearVec_32f> >,
2619 HResizeLinear<double, double, float, 1,
2621 VResizeLinear<double, double, float, Cast<double, double>,
2626 static ResizeFunc cubic_tab[] =
2629 HResizeCubic<uchar, int, short>,
2630 VResizeCubic<uchar, int, short,
2631 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2632 VResizeCubicVec_32s8u> >,
2635 HResizeCubic<ushort, float, float>,
2636 VResizeCubic<ushort, float, float, Cast<float, ushort>,
2637 VResizeCubicVec_32f16u> >,
2639 HResizeCubic<short, float, float>,
2640 VResizeCubic<short, float, float, Cast<float, short>,
2641 VResizeCubicVec_32f16s> >,
2644 HResizeCubic<float, float, float>,
2645 VResizeCubic<float, float, float, Cast<float, float>,
2646 VResizeCubicVec_32f> >,
2648 HResizeCubic<double, double, float>,
2649 VResizeCubic<double, double, float, Cast<double, double>,
2654 static ResizeFunc lanczos4_tab[] =
2656 resizeGeneric_<HResizeLanczos4<uchar, int, short>,
2657 VResizeLanczos4<uchar, int, short,
2658 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2661 resizeGeneric_<HResizeLanczos4<ushort, float, float>,
2662 VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
2664 resizeGeneric_<HResizeLanczos4<short, float, float>,
2665 VResizeLanczos4<short, float, float, Cast<float, short>,
2668 resizeGeneric_<HResizeLanczos4<float, float, float>,
2669 VResizeLanczos4<float, float, float, Cast<float, float>,
2671 resizeGeneric_<HResizeLanczos4<double, double, float>,
2672 VResizeLanczos4<double, double, float, Cast<double, double>,
2677 static ResizeAreaFastFunc areafast_tab[] =
2679 resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
2681 resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
2682 resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
2684 resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
2685 resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
2689 static ResizeAreaFunc area_tab[] =
2691 resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
2692 resizeArea_<short, float>, 0, resizeArea_<float, float>,
2693 resizeArea_<double, double>, 0
2696 Size ssize = _src.size();
// The source must be non-empty, and either dsize or both scale factors must be
// usable.
2698 CV_Assert( ssize.area() > 0 );
2699 CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
// With an empty dsize, the destination size is derived from the scale factors.
2700 if( dsize.area() == 0 )
2702 dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
2703 saturate_cast<int>(ssize.height*inv_scale_y));
2704 CV_Assert( dsize.area() > 0 );
// Scale factors derived from the size ratio.
// NOTE(review): presumably the else branch of the test above - the else
// keyword is not visible in this listing.
2708 inv_scale_x = (double)dsize.width/ssize.width;
2709 inv_scale_y = (double)dsize.height/ssize.height;
// OpenCL path: only attempted for 2D inputs larger than 10x10 whose
// destination is a UMat.
2712 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
2713 ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
2715 Mat src = _src.getMat();
2716 _dst.create(dsize, src.type());
2717 Mat dst = _dst.getMat();
2719 #ifdef HAVE_TEGRA_OPTIMIZATION
2720 if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
2724 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// scale_x / scale_y are source/destination ratios, i.e. the reciprocals of the
// inv_scale_* arguments.
2725 double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
2726 int k, sx, sy, dx, dy;
2728 int iscale_x = saturate_cast<int>(scale_x);
2729 int iscale_y = saturate_cast<int>(scale_y);
// True when both ratios are integers (within DBL_EPSILON).
2731 bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
2732 std::abs(scale_y - iscale_y) < DBL_EPSILON;
2734 #if IPP_VERSION_X100 >= 701
2737 #define IPP_RESIZE_EPS 1e-10
// Relative difference between the requested scale and the actual size ratio.
2739 double ex = fabs((double)dsize.width / src.cols - inv_scale_x) / inv_scale_x;
2740 double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y;
// IPP is considered only for linear / cubic interpolation with a (nearly)
// exact scale - exactly 0 error is required for CV_64F - and not for the 8-bit
// exact 2x linear case.
2742 if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
2743 (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2744 !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U))
// Minimum source sizes per mode: 2x2 for linear, 4x4 for cubic. The
// assignments to `mode` are on lines not visible here.
2747 if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
2749 else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
2752 if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) &&
2753 (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
2754 (depth == CV_64F && mode == ippLinear)))
// Run the IPP invoker over all source rows in parallel.
2757 Range range(0, src.rows);
2758 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
2759 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2762 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2765 setIppErrorStatus();
2768 #undef IPP_RESIZE_EPS
2772 if( interpolation == INTER_NEAREST )
2774 resizeNN( src, dst, inv_scale_x, inv_scale_y );
2779 // in case of scale_x && scale_y is equal to 2
2780 // INTER_AREA (fast) also is equal to INTER_LINEAR
2781 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2782 interpolation = INTER_AREA;
2784 // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
2785 // In other cases it is emulated using some variant of bilinear interpolation
2786 if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
// Integer-factor area resize: ofs holds the offsets of the
// iscale_x x iscale_y source cells of one block, xofs the starting element of
// each destination column/channel.
2790 int area = iscale_x*iscale_y;
2791 size_t srcstep = src.step / src.elemSize1();
2792 AutoBuffer<int> _ofs(area + dsize.width*cn);
2794 int* xofs = ofs + area;
2795 ResizeAreaFastFunc func = areafast_tab[depth];
2796 CV_Assert( func != 0 );
2798 for( sy = 0, k = 0; sy < iscale_y; sy++ )
2799 for( sx = 0; sx < iscale_x; sx++ )
2800 ofs[k++] = (int)(sy*srcstep + sx*cn);
2802 for( dx = 0; dx < dsize.width; dx++ )
2806 for( k = 0; k < cn; k++ )
2807 xofs[j + k] = sx + k;
2810 func( src, dst, ofs, xofs, iscale_x, iscale_y );
// General area resize: build horizontal and vertical DecimateAlpha tables,
// then tabofs, which gets an entry wherever the destination row index (di)
// changes in ytab.
2814 ResizeAreaFunc func = area_tab[depth];
2815 CV_Assert( func != 0 && cn <= 4 );
2817 AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
2818 DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
2820 int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
2821 int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
2823 AutoBuffer<int> _tabofs(dsize.height + 1);
2824 int* tabofs = _tabofs;
2825 for( k = 0, dy = 0; k < ytab_size; k++ )
2827 if( k == 0 || ytab[k].di != ytab[k-1].di )
2829 assert( ytab[k].di == dy );
// Terminating entry: one past the last ytab index.
2833 tabofs[dy] = ytab_size;
2835 func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
// Generic separable path (linear, cubic, Lanczos4, and INTER_AREA cases that
// did not take the area branch above).
2840 int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
2841 bool area_mode = interpolation == INTER_AREA;
// 8-bit data uses fixed-point (short) coefficients, everything else float.
2842 bool fixpt = depth == CV_8U;
// ksize is the number of filter taps per axis: 4 cubic, 8 Lanczos4, 2 linear.
2845 int ksize=0, ksize2;
2846 if( interpolation == INTER_CUBIC )
2847 ksize = 4, func = cubic_tab[depth];
2848 else if( interpolation == INTER_LANCZOS4 )
2849 ksize = 8, func = lanczos4_tab[depth];
2850 else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
2851 ksize = 2, func = linear_tab[depth];
2853 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
2856 CV_Assert( func != 0 );
// One allocation holds xofs, yofs and the horizontal / vertical coefficient
// arrays; ialpha / ibeta alias the same memory as alpha / beta viewed as shorts.
2858 AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
2859 int* xofs = (int*)(uchar*)_buffer;
2860 int* yofs = xofs + width;
2861 float* alpha = (float*)(yofs + dsize.height);
2862 short* ialpha = (short*)alpha;
2863 float* beta = alpha + width*ksize;
2864 short* ibeta = ialpha + width*ksize;
2865 float cbuf[MAX_ESIZE];
// Horizontal pass setup: source offset and coefficients per destination column.
2867 for( dx = 0; dx < dsize.width; dx++ )
// Pixel-centre mapping for the regular modes ...
2871 fx = (float)((dx+0.5)*scale_x - 0.5);
// ... and the alternative mapping (NOTE(review): presumably selected by
// area_mode - the condition is not visible in this listing).
2877 sx = cvFloor(dx*scale_x);
2878 fx = (float)((dx+1) - (sx+1)*inv_scale_x);
2879 fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
2885 if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
// Track the last column whose taps all lie inside the source (xmax), and pin
// linear/area sampling to the last source column.
2889 if( sx + ksize2 >= ssize.width )
2891 xmax = std::min( xmax, dx );
2892 if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
2893 fx = 0, sx = ssize.width-1;
2896 for( k = 0, sx *= cn; k < cn; k++ )
2897 xofs[dx*cn + k] = sx + k;
2899 if( interpolation == INTER_CUBIC )
2900 interpolateCubic( fx, cbuf );
2901 else if( interpolation == INTER_LANCZOS4 )
2902 interpolateLanczos4( fx, cbuf );
// Store the taps for the first channel, then replicate them for the remaining
// channels (fixed-point variant first, float variant below).
2910 for( k = 0; k < ksize; k++ )
2911 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2912 for( ; k < cn*ksize; k++ )
2913 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
2917 for( k = 0; k < ksize; k++ )
2918 alpha[dx*cn*ksize + k] = cbuf[k];
2919 for( ; k < cn*ksize; k++ )
2920 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
// Vertical pass setup, analogous to the horizontal one but without channel
// replication.
2924 for( dy = 0; dy < dsize.height; dy++ )
2928 fy = (float)((dy+0.5)*scale_y - 0.5);
2934 sy = cvFloor(dy*scale_y);
2935 fy = (float)((dy+1) - (sy+1)*inv_scale_y);
2936 fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
2940 if( interpolation == INTER_CUBIC )
2941 interpolateCubic( fy, cbuf );
2942 else if( interpolation == INTER_LANCZOS4 )
2943 interpolateLanczos4( fy, cbuf );
2952 for( k = 0; k < ksize; k++ )
2953 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
2957 for( k = 0; k < ksize; k++ )
2958 beta[dy*ksize + k] = cbuf[k];
// Run the selected implementation with the coefficient tables that match its
// arithmetic (fixed-point or float).
2962 func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
2963 fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
2967 /****************************************************************************************\
2968 * General warping (affine, perspective, remap) *
2969 \****************************************************************************************/
// Nearest-neighbour remap: every destination pixel is copied from the source
// pixel whose integer coordinates are stored in _xy as interleaved (x, y)
// shorts. Coordinates outside the source are resolved according to borderType
// (BORDER_REPLICATE, BORDER_CONSTANT, BORDER_TRANSPARENT, or anything
// borderInterpolate handles).
//   T            - element type of one channel
//   _borderValue - colour used for BORDER_CONSTANT, saturated to T (cval)
// NOTE(review): this listing is missing some original lines (braces, channel
// count tests, the BORDER_CONSTANT bodies and the innermost copy statements) -
// confirm against the complete file.
2974 template<typename T>
2975 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
2976 int borderType, const Scalar& _borderValue )
2978 Size ssize = _src.size(), dsize = _dst.size();
2979 int cn = _src.channels();
2980 const T* S0 = _src.ptr<T>();
// Row stride of the source measured in elements of T.
2981 size_t sstep = _src.step/sizeof(S0[0]);
2982 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
2983 saturate_cast<T>(_borderValue[1]),
2984 saturate_cast<T>(_borderValue[2]),
2985 saturate_cast<T>(_borderValue[3]));
// Unsigned copies allow one comparison to reject both negative and too-large
// coordinates: (unsigned)sx < width1.
2988 unsigned width1 = ssize.width, height1 = ssize.height;
// Continuous destination and map: fold all rows into the width.
// NOTE(review): presumably dsize.height is set to 1 on a line not visible here.
2990 if( _dst.isContinuous() && _xy.isContinuous() )
2992 dsize.width *= dsize.height;
2996 for( dy = 0; dy < dsize.height; dy++ )
2998 T* D = _dst.ptr<T>(dy);
2999 const short* XY = _xy.ptr<short>(dy);
// Loop that copies exactly one element per pixel (D[dx]).
3003 for( dx = 0; dx < dsize.width; dx++ )
3005 int sx = XY[dx*2], sy = XY[dx*2+1];
3006 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3007 D[dx] = S0[sy*sstep + sx];
3010 if( borderType == BORDER_REPLICATE )
3012 sx = clip(sx, 0, ssize.width);
3013 sy = clip(sy, 0, ssize.height);
3014 D[dx] = S0[sy*sstep + sx];
3016 else if( borderType == BORDER_CONSTANT )
// BORDER_TRANSPARENT leaves the destination element untouched.
3018 else if( borderType != BORDER_TRANSPARENT )
3020 sx = borderInterpolate(sx, ssize.width, borderType);
3021 sy = borderInterpolate(sy, ssize.height, borderType);
3022 D[dx] = S0[sy*sstep + sx];
// Loop that advances D by cn elements per pixel, with unrolled copies for 3
// and 4 elements and a generic k-loop otherwise.
3029 for( dx = 0; dx < dsize.width; dx++, D += cn )
3031 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3033 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3037 S = S0 + sy*sstep + sx*3;
3038 D[0] = S[0], D[1] = S[1], D[2] = S[2];
3042 S = S0 + sy*sstep + sx*4;
3043 D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
3047 S = S0 + sy*sstep + sx*cn;
3048 for( k = 0; k < cn; k++ )
3052 else if( borderType != BORDER_TRANSPARENT )
3054 if( borderType == BORDER_REPLICATE )
3056 sx = clip(sx, 0, ssize.width);
3057 sy = clip(sy, 0, ssize.height);
3058 S = S0 + sy*sstep + sx*cn;
3060 else if( borderType == BORDER_CONSTANT )
3064 sx = borderInterpolate(sx, ssize.width, borderType);
3065 sy = borderInterpolate(sy, ssize.height, borderType);
3066 S = S0 + sy*sstep + sx*cn;
3068 for( k = 0; k < cn; k++ )
// Fallback vector operation: ignores all arguments and returns 0.
// NOTE(review): remapBilinear stores the result of vecOp(...) in `len`, so 0
// presumably means that no pixels were handled by the vector path - confirm.
3079 int operator()( const Mat&, void*, const short*, const ushort*,
3080 const void*, int ) const { return 0; }
// SSE2 bilinear remap for 8-bit data.
//   _src  - source image
//   _dst  - destination row (written as bytes through D)
//   XY    - interleaved (x, y) short coordinates, one pair per pixel
//   FXY   - per-pixel index into the weight table
//   _wtab - weight table used on the single-channel path; otherwise
//           BilinearTab_iC4 is used
//   width - number of pixels to process
// NOTE(review): this listing is missing some original lines (braces, the rest
// of the early-exit condition, the tests that pick one of the three loops and
// the return statement) - confirm against the complete file. The function
// presumably returns the number of pixels handled (x).
3087 int operator()( const Mat& _src, void* _dst, const short* XY,
3088 const ushort* FXY, const void* _wtab, int width ) const
3090 int cn = _src.channels(), x = 0, sstep = (int)_src.step;
// Only 1, 3 and 4 channels with SSE2 available are handled; the condition
// continues on a line that is not visible here.
3092 if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
// S0 / S1 are the first two source rows: adding the same byte offset to both
// yields the upper and lower neighbours of a pixel.
3096 const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
3097 const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
3098 uchar* D = (uchar*)_dst;
// Rounding term added before the final shift by INTER_REMAP_COEF_BITS.
3099 __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
// Packs (cn, sstep) into 16-bit halves so that _mm_madd_epi16 on an (x, y)
// pair produces the byte offset x*cn + y*sstep.
3100 __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
3101 __m128i z = _mm_setzero_si128();
3102 int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
// Loop 1: 8 pixels per iteration, one output byte per pixel, 4 weights per
// table entry. NOTE(review): presumably the cn == 1 path.
3106 for( ; x <= width - 8; x += 8 )
3108 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3109 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
3110 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
// Convert the 8 coordinate pairs to byte offsets and spill them to memory.
3113 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3114 xy1 = _mm_madd_epi16( xy1, xy2ofs );
3115 _mm_store_si128( (__m128i*)iofs0, xy0 );
3116 _mm_store_si128( (__m128i*)iofs1, xy1 );
// Each 16-bit load fetches two horizontally adjacent bytes; v0 collects the
// upper-row pairs and v1 the lower-row pairs of pixels 0..3.
3118 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
3119 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
3120 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3121 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
3122 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
3123 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3124 v0 = _mm_unpacklo_epi8(v0, z);
3125 v1 = _mm_unpacklo_epi8(v1, z);
// Gather the four weights of each pixel and split them into the halves that
// pair with v0 (b0) and v1 (b1).
3127 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
3128 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
3129 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
3130 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
3131 b0 = _mm_unpacklo_epi64(a0, a1);
3132 b1 = _mm_unpackhi_epi64(a0, a1);
3133 v0 = _mm_madd_epi16(v0, b0);
3134 v1 = _mm_madd_epi16(v1, b1);
3135 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
// Same computation for pixels 4..7.
3137 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
3138 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
3139 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3140 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
3141 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
3142 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3143 v2 = _mm_unpacklo_epi8(v2, z);
3144 v3 = _mm_unpacklo_epi8(v3, z);
3146 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
3147 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
3148 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
3149 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
3150 b0 = _mm_unpacklo_epi64(a0, a1);
3151 b1 = _mm_unpackhi_epi64(a0, a1);
3152 v2 = _mm_madd_epi16(v2, b0);
3153 v3 = _mm_madd_epi16(v3, b1);
3154 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
// Scale back, saturate to 8 bits and store the 8 results.
3156 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
3157 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
3158 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
3159 _mm_storel_epi64( (__m128i*)(D + x), v0 );
// Loop 2: 4 pixels per iteration, 12 output bytes, neighbouring pixel at +3
// bytes, 16 weights per table entry. NOTE(review): presumably the cn == 3
// path; the bound is width - 5, presumably because every 8-byte store writes
// beyond the 6 bytes of the two pixels it produces - confirm.
3164 for( ; x <= width - 5; x += 4, D += 12 )
3166 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3167 __m128i u0, v0, u1, v1;
3169 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3170 _mm_store_si128( (__m128i*)iofs0, xy0 );
3171 const __m128i *w0, *w1;
3172 w0 = (const __m128i*)(wtab + FXY[x]*16);
3173 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
// Interleave each pixel with its right neighbour (offset +3) for the upper
// (u*) and lower (v*) rows of pixels 0 and 1.
3175 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3176 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
3177 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3178 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
3179 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3180 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
3181 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3182 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
3183 u0 = _mm_unpacklo_epi8(u0, z);
3184 v0 = _mm_unpacklo_epi8(v0, z);
3185 u1 = _mm_unpacklo_epi8(u1, z);
3186 v1 = _mm_unpacklo_epi8(v1, z);
3187 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3188 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3189 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3190 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
// Shift / pack so that the two 3-byte results end up adjacent before the store.
3191 u0 = _mm_slli_si128(u0, 4);
3192 u0 = _mm_packs_epi32(u0, u1);
3193 u0 = _mm_packus_epi16(u0, u0);
3194 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
// Pixels 2 and 3, written 6 bytes further into the destination.
3196 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3197 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3199 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3200 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
3201 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3202 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
3203 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3204 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
3205 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3206 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
3207 u0 = _mm_unpacklo_epi8(u0, z);
3208 v0 = _mm_unpacklo_epi8(v0, z);
3209 u1 = _mm_unpacklo_epi8(u1, z);
3210 v1 = _mm_unpacklo_epi8(v1, z);
3211 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3212 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3213 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3214 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3215 u0 = _mm_slli_si128(u0, 4);
3216 u0 = _mm_packs_epi32(u0, u1);
3217 u0 = _mm_packus_epi16(u0, u0);
3218 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
// Loop 3: 4 pixels per iteration, 16 output bytes, neighbouring pixel at +4
// bytes. NOTE(review): presumably the cn == 4 path.
3223 for( ; x <= width - 4; x += 4, D += 16 )
3225 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3226 __m128i u0, v0, u1, v1;
3228 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3229 _mm_store_si128( (__m128i*)iofs0, xy0 );
3230 const __m128i *w0, *w1;
3231 w0 = (const __m128i*)(wtab + FXY[x]*16);
3232 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3234 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3235 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
3236 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3237 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
3238 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3239 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
3240 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3241 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
3242 u0 = _mm_unpacklo_epi8(u0, z);
3243 v0 = _mm_unpacklo_epi8(v0, z);
3244 u1 = _mm_unpacklo_epi8(u1, z);
3245 v1 = _mm_unpacklo_epi8(v1, z);
3246 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3247 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3248 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3249 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
// Two 4-byte pixels fill the 8-byte store exactly, so no shifting is needed.
3250 u0 = _mm_packs_epi32(u0, u1);
3251 u0 = _mm_packus_epi16(u0, u0);
3252 _mm_storel_epi64((__m128i*)D, u0);
// Pixels 2 and 3, written 8 bytes further into the destination.
3254 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3255 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3257 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3258 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
3259 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3260 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
3261 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3262 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
3263 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3264 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
3265 u0 = _mm_unpacklo_epi8(u0, z);
3266 v0 = _mm_unpacklo_epi8(v0, z);
3267 u1 = _mm_unpacklo_epi8(u1, z);
3268 v1 = _mm_unpacklo_epi8(v1, z);
3269 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3270 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3271 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3272 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3273 u0 = _mm_packs_epi32(u0, u1);
3274 u0 = _mm_packus_epi16(u0, u0);
3275 _mm_storel_epi64((__m128i*)(D + 8), u0);
// Makes RemapVec_8u an alias of the no-op RemapNoVec.
// NOTE(review): presumably the fallback used when the SSE2 implementation is
// not compiled in; the preprocessor lines selecting it are not visible in this
// listing - confirm.
3285 typedef RemapNoVec RemapVec_8u;
3290 template<class CastOp, class VecOp, typename AT>
3291 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3292 const Mat& _fxy, const void* _wtab,
3293 int borderType, const Scalar& _borderValue )
3295 typedef typename CastOp::rtype T;
3296 typedef typename CastOp::type1 WT;
3297 Size ssize = _src.size(), dsize = _dst.size();
3298 int cn = _src.channels();
3299 const AT* wtab = (const AT*)_wtab;
3300 const T* S0 = _src.ptr<T>();
3301 size_t sstep = _src.step/sizeof(S0[0]);
3302 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3303 saturate_cast<T>(_borderValue[1]),
3304 saturate_cast<T>(_borderValue[2]),
3305 saturate_cast<T>(_borderValue[3]));
3310 unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3311 CV_Assert( cn <= 4 && ssize.area() > 0 );
3313 if( _src.type() == CV_8UC3 )
3314 width1 = std::max(ssize.width-2, 0);
3317 for( dy = 0; dy < dsize.height; dy++ )
3319 T* D = _dst.ptr<T>(dy);
3320 const short* XY = _xy.ptr<short>(dy);
3321 const ushort* FXY = _fxy.ptr<ushort>(dy);
3323 bool prevInlier = false;
3325 for( dx = 0; dx <= dsize.width; dx++ )
3327 bool curInlier = dx < dsize.width ?
3328 (unsigned)XY[dx*2] < width1 &&
3329 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3330 if( curInlier == prevInlier )
3336 prevInlier = curInlier;
3340 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
3346 for( ; dx < X1; dx++, D++ )
3348 int sx = XY[dx*2], sy = XY[dx*2+1];
3349 const AT* w = wtab + FXY[dx]*4;
3350 const T* S = S0 + sy*sstep + sx;
3351 *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
3355 for( ; dx < X1; dx++, D += 2 )
3357 int sx = XY[dx*2], sy = XY[dx*2+1];
3358 const AT* w = wtab + FXY[dx]*4;
3359 const T* S = S0 + sy*sstep + sx*2;
3360 WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3361 WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3362 D[0] = castOp(t0); D[1] = castOp(t1);
3365 for( ; dx < X1; dx++, D += 3 )
3367 int sx = XY[dx*2], sy = XY[dx*2+1];
3368 const AT* w = wtab + FXY[dx]*4;
3369 const T* S = S0 + sy*sstep + sx*3;
3370 WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3371 WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3372 WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3373 D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
3376 for( ; dx < X1; dx++, D += 4 )
3378 int sx = XY[dx*2], sy = XY[dx*2+1];
3379 const AT* w = wtab + FXY[dx]*4;
3380 const T* S = S0 + sy*sstep + sx*4;
3381 WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3382 WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3383 D[0] = castOp(t0); D[1] = castOp(t1);
3384 t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3385 t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3386 D[2] = castOp(t0); D[3] = castOp(t1);
3391 if( borderType == BORDER_TRANSPARENT && cn != 3 )
3399 for( ; dx < X1; dx++, D++ )
3401 int sx = XY[dx*2], sy = XY[dx*2+1];
3402 if( borderType == BORDER_CONSTANT &&
3403 (sx >= ssize.width || sx+1 < 0 ||
3404 sy >= ssize.height || sy+1 < 0) )
3410 int sx0, sx1, sy0, sy1;
3412 const AT* w = wtab + FXY[dx]*4;
3413 if( borderType == BORDER_REPLICATE )
3415 sx0 = clip(sx, 0, ssize.width);
3416 sx1 = clip(sx+1, 0, ssize.width);
3417 sy0 = clip(sy, 0, ssize.height);
3418 sy1 = clip(sy+1, 0, ssize.height);
3419 v0 = S0[sy0*sstep + sx0];
3420 v1 = S0[sy0*sstep + sx1];
3421 v2 = S0[sy1*sstep + sx0];
3422 v3 = S0[sy1*sstep + sx1];
3426 sx0 = borderInterpolate(sx, ssize.width, borderType);
3427 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3428 sy0 = borderInterpolate(sy, ssize.height, borderType);
3429 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3430 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3431 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3432 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
3433 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
3435 D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
3439 for( ; dx < X1; dx++, D += cn )
3441 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3442 if( borderType == BORDER_CONSTANT &&
3443 (sx >= ssize.width || sx+1 < 0 ||
3444 sy >= ssize.height || sy+1 < 0) )
3446 for( k = 0; k < cn; k++ )
3451 int sx0, sx1, sy0, sy1;
3452 const T *v0, *v1, *v2, *v3;
3453 const AT* w = wtab + FXY[dx]*4;
3454 if( borderType == BORDER_REPLICATE )
3456 sx0 = clip(sx, 0, ssize.width);
3457 sx1 = clip(sx+1, 0, ssize.width);
3458 sy0 = clip(sy, 0, ssize.height);
3459 sy1 = clip(sy+1, 0, ssize.height);
3460 v0 = S0 + sy0*sstep + sx0*cn;
3461 v1 = S0 + sy0*sstep + sx1*cn;
3462 v2 = S0 + sy1*sstep + sx0*cn;
3463 v3 = S0 + sy1*sstep + sx1*cn;
3465 else if( borderType == BORDER_TRANSPARENT &&
3466 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
3467 (unsigned)sy >= (unsigned)(ssize.height-1)))
3471 sx0 = borderInterpolate(sx, ssize.width, borderType);
3472 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3473 sy0 = borderInterpolate(sy, ssize.height, borderType);
3474 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3475 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
3476 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
3477 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
3478 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
3480 for( k = 0; k < cn; k++ )
3481 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
// Remaps _src into _dst using a 4x4 weighted source neighbourhood per destination
// pixel (16 weights per table entry, see `wtab + FXY[dx]*16`).
//
// Template parameters:
//   CastOp - supplies rtype (T, the element type used to read _src and write _dst)
//            and type1 (WT, the accumulator type).
//   AT     - element type of the weight table passed through _wtab.
//   ONE    - factor applied to the border value when the accumulator is seeded in
//            the border path (`sum = cv*ONE`).
// Parameters:
//   _xy          - read as interleaved short pairs (XY[dx*2], XY[dx*2+1]) per destination pixel.
//   _fxy         - read as one ushort per destination pixel; selects the weight set in _wtab.
//   borderType   - border mode; BORDER_TRANSPARENT gets special handling below.
//   _borderValue - first four components are saturated to T and kept in cval.
//
// NOTE(review): the embedded line numbers skip (e.g. 3493 -> 3495), so brace-only and
// other short lines appear to be absent from this listing; the comments below describe
// only the statements that are visible.
3490 template<class CastOp, typename AT, int ONE>
3491 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
3492 const Mat& _fxy, const void* _wtab,
3493 int borderType, const Scalar& _borderValue )
3495 typedef typename CastOp::rtype T;
3496 typedef typename CastOp::type1 WT;
3497 Size ssize = _src.size(), dsize = _dst.size();
3498 int cn = _src.channels();
3499 const AT* wtab = (const AT*)_wtab;
3500 const T* S0 = _src.ptr<T>();
// Source row stride expressed in elements of T rather than bytes.
3501 size_t sstep = _src.step/sizeof(S0[0]);
3502 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3503 saturate_cast<T>(_borderValue[1]),
3504 saturate_cast<T>(_borderValue[2]),
3505 saturate_cast<T>(_borderValue[3]));
// Index computation never uses BORDER_TRANSPARENT directly; BORDER_REFLECT_101 stands in for it.
3508 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
// Largest top-left corner (exclusive) for which the whole 4x4 window lies inside the source;
// clamped at 0 so sources narrower/shorter than 4 never take the fast path.
3510 unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
// When all three written/read maps are continuous the rows are folded into one long row.
// NOTE(review): the matching height adjustment is not visible in this listing.
3512 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3514 dsize.width *= dsize.height;
3518 for( dy = 0; dy < dsize.height; dy++ )
3520 T* D = _dst.ptr<T>(dy);
3521 const short* XY = _xy.ptr<short>(dy);
3522 const ushort* FXY = _fxy.ptr<ushort>(dy);
3524 for( dx = 0; dx < dsize.width; dx++, D += cn )
// Shift by -1 so (sx, sy) is the top-left corner of the 4x4 window.
3526 int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
3527 const AT* w = wtab + FXY[dx]*16;
// Fast path: the unsigned comparison rejects negative coordinates and
// coordinates whose window would cross the right/bottom edge in one test.
3529 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3531 const T* S = S0 + sy*sstep + sx*cn;
3532 for( k = 0; k < cn; k++ )
// Four groups of four weights (w[0..3], w[4..7], w[8..11], w[12..15]), each applied to
// taps spaced cn elements apart.
// NOTE(review): the statements that move S between these sums are not visible here.
3534 WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
3536 sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
3538 sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
3540 sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
// Border path. For BORDER_TRANSPARENT the test is on (sx+1, sy+1), i.e. the
// un-shifted map coordinate, being outside the source.
// NOTE(review): the action taken when this condition holds is not visible here.
3548 if( borderType == BORDER_TRANSPARENT &&
3549 ((unsigned)(sx+1) >= (unsigned)ssize.width ||
3550 (unsigned)(sy+1) >= (unsigned)ssize.height) )
// Constant border with the whole 4x4 window outside the source.
3553 if( borderType1 == BORDER_CONSTANT &&
3554 (sx >= ssize.width || sx+4 <= 0 ||
3555 sy >= ssize.height || sy+4 <= 0))
3557 for( k = 0; k < cn; k++ )
// Resolve the four column offsets (pre-multiplied by cn) and four row indices
// through borderInterpolate once per destination pixel.
3562 for( i = 0; i < 4; i++ )
3564 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3565 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
// Per channel: S0 is advanced by one element so S[x[i]] addresses channel k, and w is
// rewound by 16 because the inner loop advances it by 4 on each of its 4 iterations.
// NOTE(review): S0 is modified here; the statement restoring it is not visible in this listing.
3568 for( k = 0; k < cn; k++, S0++, w -= 16 )
// The accumulator starts at cv*ONE and each tap adds (sample - cv)*weight.
// NOTE(review): presumably guards on yi / x[i] (not visible here) skip taps that
// fall outside the source so they contribute the border value — confirm.
3570 WT cv = cval[k], sum = cv*ONE;
3571 for( i = 0; i < 4; i++, w += 4 )
3574 const T* S = S0 + yi*sstep;
3578 sum += (S[x[0]] - cv)*w[0];
3580 sum += (S[x[1]] - cv)*w[1];
3582 sum += (S[x[2]] - cv)*w[2];
3584 sum += (S[x[3]] - cv)*w[3];
// Remaps _src into _dst using an 8x8 weighted source neighbourhood per destination
// pixel (64 weights per table entry, see `wtab + FXY[dx]*64`).
//
// Template parameters:
//   CastOp - supplies rtype (T, the element type used to read _src and write _dst)
//            and type1 (WT, the accumulator type).
//   AT     - element type of the weight table passed through _wtab.
//   ONE    - factor applied to the border value when the accumulator is seeded in
//            the border path (`sum = cv*ONE`).
// Parameters:
//   _xy          - read as interleaved short pairs (XY[dx*2], XY[dx*2+1]) per destination pixel.
//   _fxy         - read as one ushort per destination pixel; selects the weight set in _wtab.
//   borderType   - border mode; BORDER_TRANSPARENT gets special handling below.
//   _borderValue - first four components are saturated to T and kept in cval.
//
// NOTE(review): the embedded line numbers skip (e.g. 3598 -> 3600), so brace-only and
// other short lines appear to be absent from this listing; the comments below describe
// only the statements that are visible.
3595 template<class CastOp, typename AT, int ONE>
3596 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
3597 const Mat& _fxy, const void* _wtab,
3598 int borderType, const Scalar& _borderValue )
3600 typedef typename CastOp::rtype T;
3601 typedef typename CastOp::type1 WT;
3602 Size ssize = _src.size(), dsize = _dst.size();
3603 int cn = _src.channels();
3604 const AT* wtab = (const AT*)_wtab;
3605 const T* S0 = _src.ptr<T>();
// Source row stride expressed in elements of T rather than bytes.
3606 size_t sstep = _src.step/sizeof(S0[0]);
3607 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3608 saturate_cast<T>(_borderValue[1]),
3609 saturate_cast<T>(_borderValue[2]),
3610 saturate_cast<T>(_borderValue[3]));
// Index computation never uses BORDER_TRANSPARENT directly; BORDER_REFLECT_101 stands in for it.
3613 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
// Largest top-left corner (exclusive) for which the whole 8x8 window lies inside the source;
// clamped at 0 so sources narrower/shorter than 8 never take the fast path.
3615 unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
// When all three maps are continuous the rows are folded into one long row.
// NOTE(review): the matching height adjustment is not visible in this listing.
3617 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3619 dsize.width *= dsize.height;
3623 for( dy = 0; dy < dsize.height; dy++ )
3625 T* D = _dst.ptr<T>(dy);
3626 const short* XY = _xy.ptr<short>(dy);
3627 const ushort* FXY = _fxy.ptr<ushort>(dy);
3629 for( dx = 0; dx < dsize.width; dx++, D += cn )
// Shift by -3 so (sx, sy) is the top-left corner of the 8x8 window.
3631 int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
3632 const AT* w = wtab + FXY[dx]*64;
// S is formed before the bounds test; among the visible statements it is only
// dereferenced by the 8-row loop that follows the in-bounds check.
3633 const T* S = S0 + sy*sstep + sx*cn;
// Fast path: the unsigned comparison rejects negative coordinates and
// coordinates whose window would cross the right/bottom edge in one test.
3635 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3637 for( k = 0; k < cn; k++ )
// Eight rows of eight taps: S moves down one source row and w moves to the
// next eight weights on every iteration.
3640 for( int r = 0; r < 8; r++, S += sstep, w += 8 )
3641 sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
3642 S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
// Border path. For BORDER_TRANSPARENT the test is on (sx+3, sy+3), i.e. the
// un-shifted map coordinate, being outside the source.
// NOTE(review): the action taken when this condition holds is not visible here.
3651 if( borderType == BORDER_TRANSPARENT &&
3652 ((unsigned)(sx+3) >= (unsigned)ssize.width ||
3653 (unsigned)(sy+3) >= (unsigned)ssize.height) )
// Constant border with the whole 8x8 window outside the source.
3656 if( borderType1 == BORDER_CONSTANT &&
3657 (sx >= ssize.width || sx+8 <= 0 ||
3658 sy >= ssize.height || sy+8 <= 0))
3660 for( k = 0; k < cn; k++ )
// Resolve the eight column offsets (pre-multiplied by cn) and eight row indices
// through borderInterpolate once per destination pixel.
3665 for( i = 0; i < 8; i++ )
3667 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3668 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
// Per channel: S0 is advanced by one element so S1[x[i]] addresses channel k, and w is
// rewound by 64 because the inner loop advances it by 8 on each of its 8 iterations.
// NOTE(review): S0 is modified here; the statement restoring it is not visible in this listing.
3671 for( k = 0; k < cn; k++, S0++, w -= 64 )
// The accumulator starts at cv*ONE and each tap adds (sample - cv)*weight.
// NOTE(review): presumably guards on yi / x[i] (not visible here) skip taps that
// fall outside the source so they contribute the border value — confirm.
3673 WT cv = cval[k], sum = cv*ONE;
3674 for( i = 0; i < 8; i++, w += 8 )
3677 const T* S1 = S0 + yi*sstep;
3681 sum += (S1[x[0]] - cv)*w[0];
3683 sum += (S1[x[1]] - cv)*w[1];
3685 sum += (S1[x[2]] - cv)*w[2];
3687 sum += (S1[x[3]] - cv)*w[3];
3689 sum += (S1[x[4]] - cv)*w[4];
3691 sum += (S1[x[5]] - cv)*w[5];
3693 sum += (S1[x[6]] - cv)*w[6];
3695 sum += (S1[x[7]] - cv)*w[7];
// Signature of the remap kernels that take only the coordinate map _xy
// (no fractional-weight map and no weight table).
3706 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3707 int borderType, const Scalar& _borderValue );
// Signature of the interpolating remap kernels: in addition to the coordinate map _xy
// they receive _fxy (per-pixel index into the weight table) and _wtab (the table itself,
// passed untyped and cast by each kernel).
3709 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3710 const Mat& _fxy, const void* _wtab,
3711 int borderType, const Scalar& _borderValue);
// Parallel loop body that remaps a range of destination rows tile by tile.
// For every tile it converts the caller's maps (*m1 / *m2) into the layout the kernels
// expect - a CV_16SC2 integer-coordinate buffer and, for interpolating kernels, a
// CV_16UC1 table-index buffer - and then calls nnfunc or ifunc on that tile.
//
// NOTE(review): the embedded line numbers skip (e.g. 3714 -> 3717), so brace-only lines,
// `else`, preprocessor guards and other short lines appear to be absent from this listing;
// the comments below describe only the statements that are visible.
3713 class RemapInvoker :
3714 public ParallelLoopBody
// Stores pointers to the source/destination and to the two maps; nothing is copied here.
3717 RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
3718 const Mat *_m2, int _borderType, const Scalar &_borderValue,
3719 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
3720 ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
3721 borderType(_borderType), borderValue(_borderValue),
3722 planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
// Processes destination rows [range.start, range.end).
3726 virtual void operator() (const Range& range) const
// Tile sizing: at most buf_size (1 << 14) elements per tile. Start from up to 128 rows,
// derive the column count, then recompute the row count so narrow images get taller tiles.
3729 const int buf_size = 1 << 14;
3730 int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
3731 int bcols0 = std::min(buf_size/brows0, dst->cols);
3732 brows0 = std::min(buf_size/bcols0, dst->rows);
3734 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// Scratch buffers reused for every tile: integer coordinates and table indices.
3737 Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
3739 _bufa.create(brows0, bcols0, CV_16UC1);
3741 for( y = range.start; y < range.end; y += brows0 )
3743 for( x = 0; x < dst->cols; x += bcols0 )
// Clip the tile against the end of the row range and the right image edge.
3745 int brows = std::min(brows0, range.end - y);
3746 int bcols = std::min(bcols0, dst->cols - x);
3747 Mat dpart(*dst, Rect(x, y, bcols, brows));
3748 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
// --- Map preparation for the nnfunc call further down ---
3752 if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
3753 bufxy = (*m1)(Rect(x, y, bcols, brows));
// Non-float maps: integer coordinates from *m1 plus a per-pixel offset looked up in
// NNDeltaTab_i using the masked index from *m2.
3754 else if( map_depth != CV_32F )
3756 for( y1 = 0; y1 < brows; y1++ )
3758 short* XY = bufxy.ptr<short>(y1);
3759 const short* sXY = m1->ptr<short>(y+y1) + x*2;
3760 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
3762 for( x1 = 0; x1 < bcols; x1++ )
3764 int a = sA[x1] & (INTER_TAB_SIZE2-1);
3765 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
3766 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
// Interleaved float map: let convertTo produce the 16-bit coordinates.
3770 else if( !planar_input )
3771 (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
// Planar float maps (x in *m1, y in *m2): convert and interleave by hand.
3774 for( y1 = 0; y1 < brows; y1++ )
3776 short* XY = bufxy.ptr<short>(y1);
3777 const float* sX = m1->ptr<float>(y+y1) + x;
3778 const float* sY = m2->ptr<float>(y+y1) + x;
// SSE2 loop, 8 pixels per iteration: convert to int32, pack to int16 with saturation,
// then interleave x and y.
3784 for( ; x1 <= bcols - 8; x1 += 8 )
3786 __m128 fx0 = _mm_loadu_ps(sX + x1);
3787 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
3788 __m128 fy0 = _mm_loadu_ps(sY + x1);
3789 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
3790 __m128i ix0 = _mm_cvtps_epi32(fx0);
3791 __m128i ix1 = _mm_cvtps_epi32(fx1);
3792 __m128i iy0 = _mm_cvtps_epi32(fy0);
3793 __m128i iy1 = _mm_cvtps_epi32(fy1);
3794 ix0 = _mm_packs_epi32(ix0, ix1);
3795 iy0 = _mm_packs_epi32(iy0, iy1);
3796 ix1 = _mm_unpacklo_epi16(ix0, iy0);
3797 iy1 = _mm_unpackhi_epi16(ix0, iy0);
3798 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
3799 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
// Scalar loop for whatever the vector loop left over.
3804 for( ; x1 < bcols; x1++ )
3806 XY[x1*2] = saturate_cast<short>(sX[x1]);
3807 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
3811 nnfunc( *src, dpart, bufxy, borderType, borderValue );
// --- Map preparation for the ifunc call at the end: fills bufxy and bufa ---
3815 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
3816 for( y1 = 0; y1 < brows; y1++ )
3818 short* XY = bufxy.ptr<short>(y1);
3819 ushort* A = bufa.ptr<ushort>(y1);
// Fixed-point maps: coordinates are used in place (bufxy becomes a view into *m1);
// only the table index needs masking to INTER_TAB_SIZE2-1.
3821 if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
3823 bufxy = (*m1)(Rect(x, y, bcols, brows));
3825 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
// NEON loop, 8 indices per iteration.
3829 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
3830 for ( ; x1 <= bcols - 8; x1 += 8)
3831 vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
3834 for( ; x1 < bcols; x1++ )
3835 A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
// Planar float maps: scale by INTER_TAB_SIZE, keep the low bits as the fractional
// part (table index) and the arithmetic-shifted high bits as the integer coordinate.
3837 else if( planar_input )
3839 const float* sX = m1->ptr<float>(y+y1) + x;
3840 const float* sY = m2->ptr<float>(y+y1) + x;
// SSE2 loop, 8 pixels per iteration.
3846 __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
3847 __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
3848 for( ; x1 <= bcols - 8; x1 += 8 )
3850 __m128 fx0 = _mm_loadu_ps(sX + x1);
3851 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
3852 __m128 fy0 = _mm_loadu_ps(sY + x1);
3853 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
3854 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
3855 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
3856 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
3857 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
// Fractional parts -> combined table index (y fraction shifted left by INTER_BITS, OR-ed with x fraction).
3858 __m128i mx0 = _mm_and_si128(ix0, mask);
3859 __m128i mx1 = _mm_and_si128(ix1, mask);
3860 __m128i my0 = _mm_and_si128(iy0, mask);
3861 __m128i my1 = _mm_and_si128(iy1, mask);
3862 mx0 = _mm_packs_epi32(mx0, mx1);
3863 my0 = _mm_packs_epi32(my0, my1);
3864 my0 = _mm_slli_epi16(my0, INTER_BITS);
3865 mx0 = _mm_or_si128(mx0, my0);
3866 _mm_storeu_si128((__m128i*)(A + x1), mx0);
// Integer parts -> saturated int16, interleaved as (x, y) pairs.
3867 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
3868 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
3869 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
3870 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
3871 ix0 = _mm_packs_epi32(ix0, ix1);
3872 iy0 = _mm_packs_epi32(iy0, iy1);
3873 ix1 = _mm_unpacklo_epi16(ix0, iy0);
3874 iy1 = _mm_unpackhi_epi16(ix0, iy0);
3875 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
3876 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
// NEON loop, 4 pixels per iteration; index = (sx & mask) + INTER_TAB_SIZE*(sy & mask).
3880 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
3881 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
3883 for( ; x1 <= bcols - 4; x1 += 4 )
3885 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
3886 v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
3887 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
3888 vandq_s32(v_sy, v_scale2));
3889 vst1_u16(A + x1, vqmovun_s32(v_v));
3891 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
3892 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
3893 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
// Scalar loop continuing from wherever the vector loops stopped.
3897 for( ; x1 < bcols; x1++ )
3899 int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
3900 int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
3901 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
3902 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
3903 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
// Interleaved float map (x, y pairs in *m1): same split into integer coordinate and table index.
3909 const float* sXY = m1->ptr<float>(y+y1) + x*2;
// NEON loop, 4 pixels per iteration; vld2q_f32 de-interleaves x and y.
3913 float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
3914 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
3916 for( ; x1 <= bcols - 4; x1 += 4 )
3918 float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
3919 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
3920 int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
3921 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
3922 vandq_s32(v_sy, v_scale2));
3923 vst1_u16(A + x1, vqmovun_s32(v_v));
3925 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
3926 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
3927 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
// NOTE(review): unlike the planar branch above (`for( ; x1 < bcols; x1++ )`), this loop
// restarts at x1 = 0, so it recomputes every element the NEON loop above already wrote.
// It looks like it was meant to continue from the current x1 - confirm before changing,
// since the two paths use different rounding helpers (cv_vrndq_s32_f32 vs cvRound).
3931 for( x1 = 0; x1 < bcols; x1++ )
3933 int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
3934 int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
3935 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
3936 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
3937 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
// Interpolating kernel on this tile, using the prepared coordinates, indices and weight table.
3942 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
// OpenCL implementation of remap. Builds a kernel whose name encodes the map layout
// ("remap_32FC2", "remap_16SC2[_16UC1]" or "remap_2_32FC1"), configures it through -D
// build options and runs it over the destination. Returns the result of k.run().
//
// NOTE(review): the embedded line numbers skip (e.g. 3962 -> 3964), so brace-only lines
// and short statements (including the early `return` lines and some `if`/`else` lines)
// appear to be absent from this listing; comments describe only what is visible.
3961 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
3962 int interpolation, int borderType, const Scalar& borderValue)
3964 const ocl::Device & dev = ocl::Device::getDefault();
// On Intel devices each work-item handles 4 rows, otherwise 1.
3965 int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
3966 rowsPerWI = dev.isIntel() ? 4 : 1;
// Combinations this path does not handle: transparent border, anything other than
// linear/nearest interpolation, and CV_16SC1 maps.
3968 if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
3969 || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
3972 UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
// Fixed-point maps may be passed in either order; normalise so map1 is the CV_16SC2 one.
3974 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
3975 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
3977 if (map1.type() != CV_16SC2)
3978 std::swap(map1, map2);
// Otherwise the maps must be floating point: one CV_32FC2 or two CV_32FC1.
3981 CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
3983 _dst.create(map1.size(), type);
3984 UMat dst = _dst.getUMat();
// Kernel name is "remap" plus a suffix describing the map layout.
3986 String kernelName = "remap";
3987 if (map1.type() == CV_32FC2 && map2.empty())
3988 kernelName += "_32FC2";
3989 else if (map1.type() == CV_16SC2)
3991 kernelName += "_16SC2";
3993 kernelName += "_16UC1";
3995 else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
3996 kernelName += "_2_32FC1";
3998 CV_Error(Error::StsBadArg, "Unsupported map types");
// Lookup tables turning the interpolation / border enum values into kernel macros.
// Both are indexed by the raw enum value with no range check visible here; the early
// rejection above limits interpolation to INTER_LINEAR / INTER_NEAREST.
4000 static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4001 static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4002 "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4003 String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4004 interMap[interpolation], borderMap[borderType],
4005 ocl::typeToStr(type), rowsPerWI);
// Interpolating kernels additionally need a working type at least as wide as CV_32F
// plus the conversion macros between it and the pixel type.
4007 if (interpolation != INTER_NEAREST)
4010 int wdepth = std::max(CV_32F, depth);
4011 buildOptions = buildOptions
4012 + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4013 " -D convertToWT2=%s -D WT2=%s",
4014 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4015 ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4016 ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4017 ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4018 ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
// The border scalar is passed with 4 channels when the image has 3.
4020 int scalarcn = cn == 3 ? 4 : cn;
4021 int sctype = CV_MAKETYPE(depth, scalarcn);
// NOTE(review): T was already defined with the same value in the first option string
// above, so this second -D T=%s looks redundant - confirm before removing.
4022 buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4023 ocl::typeToStr(type), ocl::typeToStr(depth),
4024 cn, ocl::typeToStr(sctype), depth);
4026 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
// Border value converted to a single element of type sctype and passed as a constant argument.
4028 Mat scalar(1, 1, sctype, borderValue);
4029 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4030 map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4031 scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
// One-map and two-map argument lists.
// NOTE(review): the condition choosing between them is not visible in this listing.
4034 k.args(srcarg, dstarg, map1arg, scalararg);
4036 k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
// Global size: one work-item per column, and ceil(rows / rowsPerWI) along y.
4038 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4039 return k.run(2, globalThreads, NULL, false);
// The trailing `&& 0` makes this condition always false, so the IPP remap code
// guarded by it is compiled out.
4044 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
// Common function-pointer type used to call the ippiRemap_* variants: source image and
// ROI, separate x and y float maps with their steps, destination, and interpolation mode.
4046 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4047 const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4048 void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
// Parallel loop body that remaps a range of destination rows through an IPP ippiRemap_*
// function. Holds references to the images and maps and a pointer to an `ok` flag
// supplied by the caller.
// NOTE(review): short lines (braces, member declarations, the statements executed on
// failure) appear to be absent from this listing; how `ok` is updated is not visible here.
4050 class IPPRemapInvoker :
4051 public ParallelLoopBody
4054 IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4055 int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4056 ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4057 ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
// Processes the destination rows selected by `range`.
4062 virtual void operator() (const Range & range) const
// The source ROI is always the whole source image; only the destination is restricted to the row range.
4064 IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4065 Mat dstRoi = dst.rowRange(range);
4066 IppiSize dstRoiSize = ippiSize(dstRoi.size());
4067 int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// For a constant border the destination rows are first filled with borderValue via IPPSet;
// a false return from IPPSet is treated as a failure of this branch.
4069 if (borderType == BORDER_CONSTANT &&
4070 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
// A negative IPP status is treated as failure.
// NOTE(review): the map pointers passed here are map1.ptr/map2.ptr (row 0) while the
// destination is the row-range view - confirm that the maps are meant to start at row 0.
4076 if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4077 map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4078 dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
4082 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Members are references, so the invoker must not outlive the objects passed to its constructor.
4087 Mat & src, & dst, & map1, & map2;
4089 int ippInterpolation, borderType;
// Public remap entry point. Validates the maps, tries the OpenCL path, then picks a
// per-depth kernel (nearest / linear / cubic / Lanczos4) from the tables below and runs
// it over the destination rows in parallel through RemapInvoker.
//
// Parameters:
//   _src, _dst    - source image and destination (created with map1's size and src's type).
//   _map1, _map2  - coordinate maps; accepted layouts are a CV_16SC2 map with an optional
//                   CV_16UC1/CV_16SC1 companion (in either order), one CV_32FC2 map, or
//                   two CV_32FC1 maps.
//   interpolation - INTER_NEAREST, INTER_LINEAR, INTER_CUBIC or INTER_LANCZOS4;
//                   INTER_AREA is treated as INTER_LINEAR.
//   borderType, borderValue - forwarded to the kernels.
//
// NOTE(review): the embedded line numbers skip (e.g. 4100 -> 4102), so brace-only and
// other short lines appear to be absent from this listing; comments describe only what is visible.
4098 void cv::remap( InputArray _src, OutputArray _dst,
4099 InputArray _map1, InputArray _map2,
4100 int interpolation, int borderType, const Scalar& borderValue )
// Kernel tables indexed by the source depth code; a 0 entry marks a depth with no kernel
// and is caught by the CV_Assert after the lookup.
4102 static RemapNNFunc nn_tab[] =
4104 remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4105 remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4108 static RemapFunc linear_tab[] =
4110 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4111 remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4112 remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4113 remapBilinear<Cast<float, float>, RemapNoVec, float>,
4114 remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4117 static RemapFunc cubic_tab[] =
4119 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4120 remapBicubic<Cast<float, ushort>, float, 1>,
4121 remapBicubic<Cast<float, short>, float, 1>, 0,
4122 remapBicubic<Cast<float, float>, float, 1>,
4123 remapBicubic<Cast<double, double>, float, 1>, 0
4126 static RemapFunc lanczos4_tab[] =
4128 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4129 remapLanczos4<Cast<float, ushort>, float, 1>,
4130 remapLanczos4<Cast<float, short>, float, 1>, 0,
4131 remapLanczos4<Cast<float, float>, float, 1>,
4132 remapLanczos4<Cast<double, double>, float, 1>, 0
// map1 must be non-empty, and map2 - when given - must have the same size.
4135 CV_Assert( _map1.size().area() > 0 );
4136 CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
// OpenCL path is attempted only for at most 2-D sources with a UMat destination.
4138 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4139 ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4141 Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
4142 _dst.create( map1.size(), src.type() );
4143 Mat dst = _dst.getMat();
// In-place call detected (destination shares the source buffer).
// NOTE(review): the statement executed in this case is not visible in this listing.
4144 if( dst.data == src.data )
4147 if( interpolation == INTER_AREA )
4148 interpolation = INTER_LINEAR;
4150 int type = src.type(), depth = CV_MAT_DEPTH(type);
// The trailing `&& 0` makes this condition always false, so the IPP path below is compiled out.
4152 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4155 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4156 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4157 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4159 int ippInterpolation =
4160 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4161 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
// IPP function chosen by exact image type; 0 when the type has no IPP variant listed.
4164 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4165 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4166 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4167 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4168 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4169 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4170 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4171 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4172 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4177 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4178 borderType, borderValue, &ok);
4179 Range range(0, dst.rows);
4180 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4184 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4187 setIppErrorStatus();
// Generic path: exactly one of nnfunc / ifunc is selected below.
4193 RemapNNFunc nnfunc = 0;
4194 RemapFunc ifunc = 0;
4195 const void* ctab = 0;
// Only 8-bit unsigned images request the fixed-point weight table from initInterTab2D.
4196 bool fixpt = depth == CV_8U;
4197 bool planar_input = false;
4199 if( interpolation == INTER_NEAREST )
4201 nnfunc = nn_tab[depth];
4202 CV_Assert( nnfunc != 0 );
4206 if( interpolation == INTER_LINEAR )
4207 ifunc = linear_tab[depth];
4208 else if( interpolation == INTER_CUBIC )
4209 ifunc = cubic_tab[depth];
4210 else if( interpolation == INTER_LANCZOS4 )
4211 ifunc = lanczos4_tab[depth];
4213 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4214 CV_Assert( ifunc != 0 );
4215 ctab = initInterTab2D( interpolation, fixpt );
4218 const Mat *m1 = &map1, *m2 = &map2;
// Fixed-point maps may arrive in either order.
// NOTE(review): the body of the inner `if` (presumably swapping m1 and m2 so m1 is the
// CV_16SC2 map) is not visible in this listing.
4220 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4221 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4223 if( map1.type() != CV_16SC2 )
// Remaining accepted layouts: a single two-channel map, or two CV_32FC1 planes.
4228 CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4229 (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4230 planar_input = map1.channels() == 1;
4233 RemapInvoker invoker(src, dst, m1, m2,
4234 borderType, borderValue, planar_input, nnfunc, ifunc,
// Stripe hint for parallel_for_: destination element count divided by 1 << 16.
4236 parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
// Converts remap coordinate maps between the floating-point layouts (two CV_32FC1 planes
// or one CV_32FC2 map) and the fixed-point layout (CV_16SC2 integer coordinates plus a
// CV_16UC1 table index holding the fractional part).
//
// Parameters:
//   _map1, _map2         - input maps; see the CV_Assert below for accepted type pairs.
//   _dstmap1, _dstmap2   - outputs; _dstmap2 is created only when !nninterpolate and
//                          dstm1type != CV_32FC2.
//   dstm1type            - requested type of _dstmap1 (CV_16SC2, CV_32FC1 or CV_32FC2);
//                          a value <= 0 selects CV_32FC2 for fixed-point input and
//                          CV_16SC2 otherwise.
//   nninterpolate        - when true, float -> fixed conversion writes only saturated
//                          coordinates and no fractional index.
//
// NOTE(review): the embedded line numbers skip (e.g. 4242 -> 4244), so brace-only and
// other short lines (including `return`, `else` and some `if` lines) appear to be absent
// from this listing; comments describe only what is visible.
4240 void cv::convertMaps( InputArray _map1, InputArray _map2,
4241 OutputArray _dstmap1, OutputArray _dstmap2,
4242 int dstm1type, bool nninterpolate )
4244 Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4245 Size size = map1.size();
4246 const Mat *m1 = &map1, *m2 = &map2;
4247 int m1type = m1->type(), m2type = m2->type();
// Accepted inputs: a CV_16SC2 map in either slot (its companion index map is optional
// only when nninterpolate is set), two CV_32FC1 planes, or a lone CV_32FC2 map.
4249 CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4250 (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4251 (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4252 (m1type == CV_32FC2 && m2->empty()) );
// Normalise the order so that a CV_16SC2 map is always m1.
4254 if( m2type == CV_16SC2 )
4256 std::swap( m1, m2 );
4257 std::swap( m1type, m2type );
// Default destination type: convert to the "other" representation.
4260 if( dstm1type <= 0 )
4261 dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4262 CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4263 _dstmap1.create( size, dstm1type );
4264 dstmap1 = _dstmap1.getMat();
// Second output: CV_16UC1 index map for fixed-point output, CV_32FC1 y-plane for planar float output.
4266 if( !nninterpolate && dstm1type != CV_32FC2 )
4268 _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4269 dstmap2 = _dstmap2.getMat();
// Shortcut: same type, or a two-channel 16S <-> 32F conversion without fractional part,
// is a plain convertTo (plus a copy of the second map when its type already matches).
4274 if( m1type == dstm1type || (nninterpolate &&
4275 ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4276 (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4278 m1->convertTo( dstmap1, dstmap1.type() );
4279 if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4280 m2->copyTo( dstmap2 );
// Planar float -> interleaved float: merge the two planes.
4284 if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
4286 Mat vdata[] = { *m1, *m2 };
4287 merge( vdata, 2, dstmap1 );
// Interleaved float -> planar float.
// NOTE(review): the call that consumes mv is not visible in this listing.
4291 if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
4293 Mat mv[] = { dstmap1, dstmap2 };
// When every participating matrix is continuous, fold all rows into one long row.
// NOTE(review): the matching height adjustment is not visible in this listing.
4298 if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
4299 dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
4301 size.width *= size.height;
// Weight of one fractional step when converting a table index back to float.
4305 const float scale = 1.f/INTER_TAB_SIZE;
4307 for( y = 0; y < size.height; y++ )
// The same row pointers are viewed both as float and as 16-bit data; which view is
// valid depends on the type branch taken below.
4309 const float* src1f = m1->ptr<float>(y);
4310 const float* src2f = m2->ptr<float>(y);
4311 const short* src1 = (const short*)src1f;
4312 const ushort* src2 = (const ushort*)src2f;
4314 float* dst1f = dstmap1.ptr<float>(y);
4315 float* dst2f = dstmap2.ptr<float>(y);
4316 short* dst1 = (short*)dst1f;
4317 ushort* dst2 = (ushort*)dst2f;
// Two float planes -> fixed point.
4319 if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
// Coordinates only: saturate each float to short.
4322 for( x = 0; x < size.width; x++ )
4324 dst1[x*2] = saturate_cast<short>(src1f[x]);
4325 dst1[x*2+1] = saturate_cast<short>(src2f[x]);
// With fractional part: scale by INTER_TAB_SIZE, high bits -> coordinate,
// low INTER_BITS of y and x -> combined table index.
4328 for( x = 0; x < size.width; x++ )
4330 int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
4331 int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
4332 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4333 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4334 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
// Interleaved float -> fixed point; same two variants as above, reading (x, y) pairs.
4337 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
4340 for( x = 0; x < size.width; x++ )
4342 dst1[x*2] = saturate_cast<short>(src1f[x*2]);
4343 dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
4346 for( x = 0; x < size.width; x++ )
4348 int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
4349 int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
4350 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4351 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4352 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
// Fixed point -> two float planes: coordinate plus fraction*scale. A null src2
// (no index map) is treated as a zero fraction.
4355 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
4357 for( x = 0; x < size.width; x++ )
4359 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
4360 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4361 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
// Fixed point -> interleaved float.
4364 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
4366 for( x = 0; x < size.width; x++ )
4368 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
4369 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4370 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4374 CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
// Parallel loop body for affine warping. For each tile of destination rows it evaluates
// the source coordinates in fixed point from M and the per-column tables adelta/bdelta,
// stores them as a CV_16SC2 coordinate map (plus a 16-bit table-index map when
// interpolating) in stack buffers, and calls remap() on the tile.
//
// NOTE(review): the embedded line numbers skip (e.g. 4383 -> 4386), so brace-only and
// other short lines appear to be absent from this listing; comments describe only what is visible.
4382 class WarpAffineInvoker :
4383 public ParallelLoopBody
4386 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
4387 const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
4388 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
4389 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
// Processes destination rows [range.start, range.end).
4394 virtual void operator() (const Range& range) const
// Stack buffers sized for at most BLOCK_SZ*BLOCK_SZ pixels per tile:
// XY holds (x, y) pairs, A holds one table index per pixel.
4396 const int BLOCK_SZ = 64;
4397 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
// Fixed-point precision of the coordinate arithmetic: at least 10 fractional bits.
4398 const int AB_BITS = MAX(10, (int)INTER_BITS);
4399 const int AB_SCALE = 1 << AB_BITS;
// Rounding offset: half a pixel for nearest, half a table step otherwise.
4400 int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
4402 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// Tile sizing: start from up to BLOCK_SZ/2 rows, derive the width so that the tile has
// at most BLOCK_SZ*BLOCK_SZ pixels, then recompute the height for narrow images.
4405 int bh0 = std::min(BLOCK_SZ/2, dst.rows);
4406 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
4407 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
4409 for( y = range.start; y < range.end; y += bh0 )
4411 for( x = 0; x < dst.cols; x += bw0 )
// Clip the tile against the right image edge and the end of the row range.
4413 int bw = std::min( bw0, dst.cols - x);
4414 int bh = std::min( bh0, range.end - y);
// Mat header over the stack buffer (no allocation) and the destination tile view.
4416 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4417 Mat dpart(dst, Rect(x, y, bw, bh));
4419 for( y1 = 0; y1 < bh; y1++ )
4421 short* xy = XY + y1*bw*2;
// Row-constant part of the transform in fixed point; the column-dependent part comes
// from adelta[] / bdelta[] (presumably precomputed per column by the caller - confirm).
4422 int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
4423 int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
// Nearest neighbour: drop all AB_BITS fractional bits.
4425 if( interpolation == INTER_NEAREST )
4426 for( x1 = 0; x1 < bw; x1++ )
4428 int X = (X0 + adelta[x+x1]) >> AB_BITS;
4429 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
4430 xy[x1*2] = saturate_cast<short>(X);
4431 xy[x1*2+1] = saturate_cast<short>(Y);
// Interpolating modes: keep INTER_BITS fractional bits and store them as a table index.
4435 short* alpha = A + y1*bw;
// SSE2 loop, 8 pixels per iteration.
4440 __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
4441 __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
4442 for( ; x1 <= bw - 8; x1 += 8 )
4444 __m128i tx0, tx1, ty0, ty1;
4445 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
4446 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
4447 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
4448 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
// Reduce precision from AB_BITS to INTER_BITS fractional bits.
4450 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
4451 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
4452 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
4453 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
// Fractional parts (masked) and integer parts (shifted), each packed to 16 bits.
4455 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
4456 _mm_and_si128(tx1, fxy_mask));
4457 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
4458 _mm_and_si128(ty1, fxy_mask));
4459 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
4460 _mm_srai_epi32(tx1, INTER_BITS));
4461 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
4462 _mm_srai_epi32(ty1, INTER_BITS));
// Combined index: x fraction plus y fraction shifted left by INTER_BITS.
4463 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
4465 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
4466 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
4467 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
// Scalar loop for the remaining pixels; same computation as the vector loop.
4471 for( ; x1 < bw; x1++ )
4473 int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
4474 int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
4475 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4476 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4477 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4478 (X & (INTER_TAB_SIZE-1)));
// Hand the prepared maps to remap(): coordinates only for nearest, coordinates plus
// the index map (wrapped as CV_16U over buffer A) otherwise.
4483 if( interpolation == INTER_NEAREST )
4484 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
4487 Mat _matA(bh, bw, CV_16U, A);
4488 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
4497 int interpolation, borderType;
// Per-column tables supplied by the caller; not owned by this object.
4499 int *adelta, *bdelta;
// The trailing `&& 0` makes this condition always false, so the IPP warp-affine invoker
// guarded by it is compiled out.
4504 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
// Parallel loop body that warps a range of destination rows through an IPP
// ippiWarpAffineBackFunc, using the 2x3 coefficient matrix passed to the constructor.
// NOTE(review): short lines (braces, member declarations, the statements executed on
// failure) appear to be absent from this listing; how `ok` is updated is not visible here.
4505 class IPPWarpAffineInvoker :
4506 public ParallelLoopBody
4509 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
4510 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
4511 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4512 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
// Processes destination rows [range.start, range.end).
4517 virtual void operator() (const Range& range) const
// Source size/ROI cover the whole source; the destination ROI is the row range.
4519 IppiSize srcsize = { src.cols, src.rows };
4520 IppiRect srcroi = { 0, 0, src.cols, src.rows };
4521 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
4522 int cnn = src.channels();
// For a constant border the destination rows are first filled with borderValue via IPPSet;
// a false return from IPPSet is treated as a failure of this branch.
4523 if( borderType == BORDER_CONSTANT )
4525 IppiSize setSize = { dst.cols, range.end - range.start };
4526 void *dataPointer = dst.ptr(range.start);
4527 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
4534 // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
4535 IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
4536 (int)dst.step[0], dstroi, coeffs, mode );
4541 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Held by reference: the coefficient array must outlive this invoker.
4548 double (&coeffs)[2][3];
4551 ippiWarpAffineBackFunc func;
// Copy assignment is declared but no definition is visible here.
4553 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
// Operation selectors compared against op_type in ocl_warpTransform.
4559 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
4561 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
4562 Size dsize, int flags, int borderType, const Scalar& borderValue,
4565 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
4566 const ocl::Device & dev = ocl::Device::getDefault();
4568 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4569 double doubleSupport = dev.doubleFPConfig() > 0;
4571 int interpolation = flags & INTER_MAX;
4572 if( interpolation == INTER_AREA )
4573 interpolation = INTER_LINEAR;
4574 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
4576 if ( !(borderType == cv::BORDER_CONSTANT &&
4577 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
4578 (!doubleSupport && depth == CV_64F) || cn > 4)
4581 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
4582 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
4583 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
4584 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
4586 int scalarcn = cn == 3 ? 4 : cn;
4587 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
4588 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
4589 int sctype = CV_MAKETYPE(wdepth, scalarcn);
4593 if (interpolation == INTER_NEAREST)
4595 opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
4596 ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
4597 ocl::typeToStr(CV_MAT_DEPTH(type)),
4598 ocl::typeToStr(sctype), cn, rowsPerWI);
4603 opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
4604 " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
4605 interpolationMap[interpolation], ocl::typeToStr(type),
4606 ocl::typeToStr(CV_MAT_DEPTH(type)),
4607 ocl::typeToStr(sctype),
4608 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
4609 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
4610 ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
4611 doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
4614 k.create(kernelName, program, opts);
4618 double borderBuf[] = { 0, 0, 0, 0 };
4619 scalarToRawData(borderValue, borderBuf, sctype);
4621 UMat src = _src.getUMat(), M0;
4622 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4623 UMat dst = _dst.getUMat();
4626 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
4627 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
4628 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
4629 M1.rows == matRows && M1.cols == 3 );
4630 M1.convertTo(matM, matM.type());
4632 if( !(flags & WARP_INVERSE_MAP) )
4634 if (op_type == OCL_OP_PERSPECTIVE)
4638 double D = M[0]*M[4] - M[1]*M[3];
4639 D = D != 0 ? 1./D : 0;
4640 double A11 = M[4]*D, A22=M[0]*D;
4641 M[0] = A11; M[1] *= -D;
4642 M[3] *= -D; M[4] = A22;
4643 double b1 = -M[0]*M[2] - M[1]*M[5];
4644 double b2 = -M[3]*M[2] - M[4]*M[5];
4645 M[2] = b1; M[5] = b2;
4648 matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
4650 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
4651 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
4653 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4654 return k.run(2, globalThreads, NULL, false);
// Applies a 2x3 affine transform (_M0, CV_32F or CV_64F) to _src and stores
// the result in _dst. The output has dsize, or the source size when
// dsize.area() == 0, and the source type. flags carries the interpolation
// method in its INTER_MAX bits and optionally WARP_INVERSE_MAP;
// borderType/borderValue are forwarded to the backend that does the work.
4662 void cv::warpAffine( InputArray _src, OutputArray _dst,
4663 InputArray _M0, Size dsize,
4664 int flags, int borderType, const Scalar& borderValue )
// OpenCL path is attempted first for at-most-2D input with a UMat destination.
4666 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4667 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
4668 borderValue, OCL_OP_AFFINE))
4670 Mat src = _src.getMat(), M0 = _M0.getMat();
4671 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4672 Mat dst = _dst.getMat();
// NOTE(review): the non-empty check runs after the OpenCL attempt and after
// _dst.create(), whereas warpPerspective asserts _src.total() > 0 up front.
4673 CV_Assert( src.cols > 0 && src.rows > 0 );
4674 if( dst.data == src.data )
// matM is a header over the local array M, so converting into matM fills M.
4678 Mat matM(2, 3, CV_64F, M);
// INTER_AREA has no dedicated handling here; it is mapped to INTER_LINEAR.
4679 int interpolation = flags & INTER_MAX;
4680 if( interpolation == INTER_AREA )
4681 interpolation = INTER_LINEAR;
4683 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
4684 M0.convertTo(matM, matM.type());
4686 #ifdef HAVE_TEGRA_OPTIMIZATION
4687 if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
// Without WARP_INVERSE_MAP the matrix is inverted in place. A zero
// determinant gives D == 0 rather than a division by zero.
4691 if( !(flags & WARP_INVERSE_MAP) )
4693 double D = M[0]*M[4] - M[1]*M[3];
4694 D = D != 0 ? 1./D : 0;
4695 double A11 = M[4]*D, A22=M[0]*D;
4696 M[0] = A11; M[1] *= -D;
4697 M[3] *= -D; M[4] = A22;
4698 double b1 = -M[0]*M[2] - M[1]*M[5];
4699 double b2 = -M[3]*M[2] - M[4]*M[5];
4700 M[2] = b1; M[5] = b2;
// One buffer holds both per-column tables: adelta in the first dst.cols
// entries, bdelta in the second.
4704 AutoBuffer<int> _abdelta(dst.cols*2);
4705 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
// Fixed-point precision of the tables: at least 10 fractional bits.
4706 const int AB_BITS = MAX(10, (int)INTER_BITS);
4707 const int AB_SCALE = 1 << AB_BITS;
// NOTE(review): the trailing "&& 0" makes this condition always false, so the
// IPP path below is compiled out.
4709 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
// IPP path: 8U/16U/32F depth, 1/3/4 channels, nearest/linear/cubic
// interpolation, transparent or constant border.
4712 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4713 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
4714 ( cn == 1 || cn == 3 || cn == 4 ) &&
4715 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
4716 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
4718 ippiWarpAffineBackFunc ippFunc = 0;
// WARP_INVERSE_MAP selects the ippiWarpAffineBack_* family, otherwise the
// ippiWarpAffine_* family, chosen by element type.
4719 if ((flags & WARP_INVERSE_MAP) != 0)
4722 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
4723 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
4724 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
4725 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
4726 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
4727 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
4728 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
4729 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
4730 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
4736 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
4737 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
4738 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
4739 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
4740 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
4741 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
4742 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
4743 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
4744 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
4748 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
4749 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4750 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
4752 CV_Assert(mode && ippFunc);
// IPP receives the coefficients as a plain 2x3 array copied from matM.
4754 double coeffs[2][3];
4755 for( int i = 0; i < 2; i++ )
4756 for( int j = 0; j < 3; j++ )
4757 coeffs[i][j] = matM.at<double>(i, j);
4760 Range range(0, dst.rows);
4761 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
4762 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
4765 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4768 setIppErrorStatus();
// Generic path: precompute the per-column terms M[0]*x and M[3]*x in fixed
// point (scaled by AB_SCALE).
4773 for( x = 0; x < dst.cols; x++ )
4775 adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
4776 bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
// Process all destination rows with the block-wise invoker.
4779 Range range(0, dst.rows);
4780 WarpAffineInvoker invoker(src, dst, interpolation, borderType,
4781 borderValue, adelta, bdelta, M);
4782 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// Parallel loop body for the perspective warp: for each block of the
// destination it computes a source-coordinate map and delegates the sampling
// to remap().
4789 class WarpPerspectiveInvoker :
4790 public ParallelLoopBody
// _M points to the 9 matrix coefficients read as M[0]..M[8] below; only the
// pointer is stored.
4793 WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
4794 int _borderType, const Scalar &_borderValue) :
4795 ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
4796 borderType(_borderType), borderValue(_borderValue)
// Handles destination rows [range.start, range.end).
4800 virtual void operator() (const Range& range) const
// XY holds interleaved integer (x, y) source coordinates and A the
// interpolation-table indices for one block of at most BLOCK_SZ*BLOCK_SZ pixels.
4802 const int BLOCK_SZ = 32;
4803 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
4804 int x, y, x1, y1, width = dst.cols, height = dst.rows;
// Block shape: start from BLOCK_SZ/2 rows, take as many columns as fit in the
// buffers, then grow the row count back if the image is narrow.
4806 int bh0 = std::min(BLOCK_SZ/2, height);
4807 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
4808 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
4810 for( y = range.start; y < range.end; y += bh0 )
4812 for( x = 0; x < width; x += bw0 )
// Clip the block to the image width and to the end of the assigned range.
4814 int bw = std::min( bw0, width - x);
4815 int bh = std::min( bh0, range.end - y); // height
// NOTE(review): matA is declared here but not used in the visible code.
4817 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4818 Mat dpart(dst, Rect(x, y, bw, bh));
4820 for( y1 = 0; y1 < bh; y1++ )
// Numerators and denominator of the projective mapping at the first pixel of
// this block row; the x1-dependent part is added per pixel below.
4822 short* xy = XY + y1*bw*2;
4823 double X0 = M[0]*x + M[1]*(y + y1) + M[2];
4824 double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
4825 double W0 = M[6]*x + M[7]*(y + y1) + M[8];
// Nearest neighbour: only integer coordinates are stored. Values are clamped
// to the int range before conversion and then saturated to short.
4827 if( interpolation == INTER_NEAREST )
4828 for( x1 = 0; x1 < bw; x1++ )
4830 double W = W0 + M[6]*x1;
4832 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
4833 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
4834 int X = saturate_cast<int>(fX);
4835 int Y = saturate_cast<int>(fY);
4837 xy[x1*2] = saturate_cast<short>(X);
4838 xy[x1*2+1] = saturate_cast<short>(Y);
// Interpolating modes: coordinates are scaled by INTER_TAB_SIZE/W, so X and Y
// carry INTER_BITS fractional bits (W == 0 maps to 0).
4842 short* alpha = A + y1*bw;
4843 for( x1 = 0; x1 < bw; x1++ )
4845 double W = W0 + M[6]*x1;
4846 W = W ? INTER_TAB_SIZE/W : 0;
4847 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
4848 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
4849 int X = saturate_cast<int>(fX);
4850 int Y = saturate_cast<int>(fY);
// Integer part goes to xy; the two fractional parts are packed into one table
// index: fracY*INTER_TAB_SIZE + fracX.
4852 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4853 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4854 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4855 (X & (INTER_TAB_SIZE-1)));
// Sample the source into this destination block; the nearest-neighbour case
// needs no second map.
4860 if( interpolation == INTER_NEAREST )
4861 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
// The short buffer A is wrapped as a CV_16U map for remap().
4864 Mat _matA(bh, bw, CV_16U, A);
4865 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
4875 int interpolation, borderType;
// NOTE(review): the trailing "&& 0" makes this condition always false, so the
// IPP-based invoker below is never compiled.
4880 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
// Parallel loop body that applies an IPP perspective-warp function to a range
// of destination rows.
4881 class IPPWarpPerspectiveInvoker :
4882 public ParallelLoopBody
// Captures the images, the 3x3 coefficient array, the IPP interpolation mode,
// the border settings, the IPP function to call and the caller's status flag
// pointer.
4885 IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
4886 int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
4887 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4888 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
// Handles destination rows [range.start, range.end).
4893 virtual void operator() (const Range& range) const
// The whole source image is the source ROI; the destination ROI is the band
// of rows assigned to this call.
4895 IppiSize srcsize = {src.cols, src.rows};
4896 IppiRect srcroi = {0, 0, src.cols, src.rows};
4897 IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
4898 int cnn = src.channels();
// For a constant border the destination band is pre-filled with borderValue
// before the warp is run.
4900 if( borderType == BORDER_CONSTANT )
4902 IppiSize setSize = {dst.cols, range.end - range.start};
4903 void *dataPointer = dst.ptr(range.start);
4904 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
4911 IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
4912 if (status != ippStsNoErr)
4916 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// coeffs is a reference member, so the caller's array must outlive this object.
4923 double (&coeffs)[3][3];
4925 const Scalar borderValue;
4926 ippiWarpPerspectiveFunc func;
// Assignment operator is declared but no definition is visible here
// (presumably to forbid assignment, given the reference member).
4929 const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
// Applies a 3x3 perspective transform (_M0, CV_32F or CV_64F) to _src and
// stores the result in _dst. The output has dsize, or the source size when
// dsize.area() == 0, and the source type. flags carries the interpolation
// method in its INTER_MAX bits and optionally WARP_INVERSE_MAP;
// borderType/borderValue are forwarded to the backend that does the work.
4934 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
4935 Size dsize, int flags, int borderType, const Scalar& borderValue )
// An empty source is rejected before any backend is tried.
4937 CV_Assert( _src.total() > 0 );
// OpenCL path is attempted first for at-most-2D input with a UMat destination.
4939 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4940 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
4941 OCL_OP_PERSPECTIVE))
4943 Mat src = _src.getMat(), M0 = _M0.getMat();
4944 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4945 Mat dst = _dst.getMat();
4947 if( dst.data == src.data )
// matM is a header over the local array M, so converting into matM fills M.
4951 Mat matM(3, 3, CV_64F, M);
// INTER_AREA has no dedicated handling here; it is mapped to INTER_LINEAR.
4952 int interpolation = flags & INTER_MAX;
4953 if( interpolation == INTER_AREA )
4954 interpolation = INTER_LINEAR;
4956 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
4957 M0.convertTo(matM, matM.type());
4959 #ifdef HAVE_TEGRA_OPTIMIZATION
4960 if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
// NOTE(review): the trailing "&& 0" makes this condition always false, so the
// IPP path below is compiled out.
4965 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
// IPP path: 8U/16U/32F depth, 1/3/4 channels, transparent or constant border,
// nearest/linear/cubic interpolation.
4968 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4969 if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
4970 (cn == 1 || cn == 3 || cn == 4) &&
4971 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
4972 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
4974 ippiWarpPerspectiveFunc ippFunc = 0;
// WARP_INVERSE_MAP selects the ippiWarpPerspectiveBack_* family, otherwise the
// ippiWarpPerspective_* family; 0 is left for unsupported element types.
4975 if ((flags & WARP_INVERSE_MAP) != 0)
4977 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
4978 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
4979 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
4980 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
4981 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
4982 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
4983 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
4984 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
4985 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
4989 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
4990 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
4991 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
4992 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
4993 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
4994 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
4995 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
4996 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
4997 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
5000 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5001 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5002 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
5003 CV_Assert(mode && ippFunc);
// IPP receives the coefficients as a plain 3x3 array copied from matM.
5005 double coeffs[3][3];
5006 for( int i = 0; i < 3; i++ )
5007 for( int j = 0; j < 3; j++ )
5008 coeffs[i][j] = matM.at<double>(i, j);
5011 Range range(0, dst.rows);
5012 IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5013 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5016 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5019 setIppErrorStatus();
// Forward transforms are handled here before the generic invoker runs
// (presumably by inverting matM; the statement itself is not visible).
5024 if( !(flags & WARP_INVERSE_MAP) )
// Process all destination rows with the block-wise invoker.
5027 Range range(0, dst.rows);
5028 WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
5029 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// Builds a 2x3 CV_64F matrix from a rotation centre, an angle and a scale.
// NOTE(review): only the assignments to m[2] and m[5] are visible in this
// excerpt; the remaining entries and any unit conversion of `angle` are not
// shown, so confirm them against the full source.
5033 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
// Scaled cosine and sine of the angle.
5036 double alpha = cos(angle)*scale;
5037 double beta = sin(angle)*scale;
5039 Mat M(2, 3, CV_64F);
5040 double* m = M.ptr<double>();
// Third-column entries of the two rows, derived from alpha, beta and center.
5044 m[2] = (1-alpha)*center.x - beta*center.y;
5047 m[5] = beta*center.x + (1-alpha)*center.y;
5052 /* Calculates coefficients of perspective transformation
5053 * which maps (xi,yi) to (ui,vi), (i=0,1,2,3):
5055 * c00*xi + c01*yi + c02
5056 * ui = ---------------------
5057 * c20*xi + c21*yi + c22
5059 * c10*xi + c11*yi + c12
5060 * vi = ---------------------
5061 * c20*xi + c21*yi + c22
5063 * Coefficients are calculated by solving linear system:
5064 * / x0 y0 1 0 0 0 -x0*u0 -y0*u0 \ /c00\ /u0\
5065 * | x1 y1 1 0 0 0 -x1*u1 -y1*u1 | |c01| |u1|
5066 * | x2 y2 1 0 0 0 -x2*u2 -y2*u2 | |c02| |u2|
5067 * | x3 y3 1 0 0 0 -x3*u3 -y3*u3 |.|c10|=|u3|,
5068 * | 0 0 0 x0 y0 1 -x0*v0 -y0*v0 | |c11| |v0|
5069 * | 0 0 0 x1 y1 1 -x1*v1 -y1*v1 | |c12| |v1|
5070 * | 0 0 0 x2 y2 1 -x2*v2 -y2*v2 | |c20| |v2|
5071 * \ 0 0 0 x3 y3 1 -x3*v3 -y3*v3 / \c21/ \v3/
5074 * cij - matrix coefficients, c22 = 1
// Computes the 3x3 CV_64F perspective matrix from 4 source points and 4
// destination points by solving an 8x8 linear system.
5076 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
// X is an 8x1 header over M's own storage, so the solution is written
// directly into the first 8 coefficients of M.
5078 Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
5079 double a[8][8], b[8];
5080 Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
// Row i carries the equation for dst[i].x, row i+4 the one for dst[i].y.
5082 for( int i = 0; i < 4; ++i )
5084 a[i][0] = a[i+4][3] = src[i].x;
5085 a[i][1] = a[i+4][4] = src[i].y;
5086 a[i][2] = a[i+4][5] = 1;
5087 a[i][3] = a[i][4] = a[i][5] =
5088 a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
5089 a[i][6] = -src[i].x*dst[i].x;
5090 a[i][7] = -src[i].y*dst[i].x;
5091 a[i+4][6] = -src[i].x*dst[i].y;
5092 a[i+4][7] = -src[i].y*dst[i].y;
// Solve with SVD, then set the ninth coefficient (c22) to 1.
5097 solve( A, B, X, DECOMP_SVD );
5098 M.ptr<double>()[8] = 1.;
5103 /* Calculates coefficients of affine transformation
5104 * which maps (xi,yi) to (ui,vi), (i=0,1,2):
5106 * ui = c00*xi + c01*yi + c02
5108 * vi = c10*xi + c11*yi + c12
5110 * Coefficients are calculated by solving linear system:
5111 * / x0 y0 1 0 0 0 \ /c00\ /u0\
5112 * | x1 y1 1 0 0 0 | |c01| |u1|
5113 * | x2 y2 1 0 0 0 | |c02| |u2|
5114 * | 0 0 0 x0 y0 1 | |c10| |v0|
5115 * | 0 0 0 x1 y1 1 | |c11| |v1|
5116 * \ 0 0 0 x2 y2 1 / \c12/ \v2/
5119 * cij - matrix coefficients
// Computes the 2x3 CV_64F affine matrix from 3 source points and 3
// destination points by setting up a 6x6 linear system.
5122 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
// X is a 6x1 header over M's own storage, so a solution written to X lands
// directly in M.
5124 Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
// a is the 6x6 system matrix stored as a flat array of 36 doubles.
5125 double a[6*6], b[6];
5126 Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
5128 for( int i = 0; i < 3; i++ )
// j and k are offsets into a whose definitions are not visible in this
// excerpt (presumably the starts of the two rows belonging to point i).
// The row at j gets (x, y, 1, 0, 0, 0) and the row at k gets (0, 0, 0, x, y, 1).
5132 a[j] = a[k+3] = src[i].x;
5133 a[j+1] = a[k+4] = src[i].y;
5134 a[j+2] = a[k+5] = 1;
5135 a[j+3] = a[j+4] = a[j+5] = 0;
5136 a[k] = a[k+1] = a[k+2] = 0;
// Right-hand side entry for the y equation of point i.
5138 b[i*2+1] = dst[i].y;
5145 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
5147 Mat matM = _matM.getMat();
5148 CV_Assert(matM.rows == 2 && matM.cols == 3);
5149 __iM.create(2, 3, matM.type());
5150 Mat _iM = __iM.getMat();
5152 if( matM.type() == CV_32F )
5154 const float* M = matM.ptr<float>();
5155 float* iM = _iM.ptr<float>();
5156 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5158 double D = M[0]*M[step+1] - M[1]*M[step];
5159 D = D != 0 ? 1./D : 0;
5160 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5161 double b1 = -A11*M[2] - A12*M[step+2];
5162 double b2 = -A21*M[2] - A22*M[step+2];
5164 iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
5165 iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
5167 else if( matM.type() == CV_64F )
5169 const double* M = matM.ptr<double>();
5170 double* iM = _iM.ptr<double>();
5171 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5173 double D = M[0]*M[step+1] - M[1]*M[step];
5174 D = D != 0 ? 1./D : 0;
5175 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5176 double b1 = -A11*M[2] - A12*M[step+2];
5177 double b2 = -A21*M[2] - A22*M[step+2];
5179 iM[0] = A11; iM[1] = A12; iM[2] = b1;
5180 iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
5183 CV_Error( CV_StsUnsupportedFormat, "" );
5186 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
5188 Mat src = _src.getMat(), dst = _dst.getMat();
5189 CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
5190 return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5193 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
5195 Mat src = _src.getMat(), dst = _dst.getMat();
5196 CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
5197 return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
// C-API wrapper around cv::resize: resizes srcarr into the already allocated
// dstarr. Source and destination must have the same type.
5201 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
5203 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5204 CV_Assert( src.type() == dst.type() );
// The destination size is passed explicitly along with the matching
// per-axis scale factors.
5205 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
5206 (double)dst.rows/src.rows, method );
// C-API wrapper around cv::warpAffine. Source and destination must have the
// same type; the output size is the size of dstarr.
5211 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5212 int flags, CvScalar fillval )
5214 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5215 cv::Mat matrix = cv::cvarrToMat(marr);
5216 CV_Assert( src.type() == dst.type() );
// CV_WARP_FILL_OUTLIERS selects a constant border; otherwise the border is
// transparent.
5217 cv::warpAffine( src, dst, matrix, dst.size(), flags,
5218 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
// C-API wrapper around cv::warpPerspective. Source and destination must have
// the same type; the output size is the size of dstarr.
5223 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5224 int flags, CvScalar fillval )
5226 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5227 cv::Mat matrix = cv::cvarrToMat(marr);
5228 CV_Assert( src.type() == dst.type() );
// CV_WARP_FILL_OUTLIERS selects a constant border; otherwise the border is
// transparent.
5229 cv::warpPerspective( src, dst, matrix, dst.size(), flags,
5230 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
// C-API wrapper around cv::remap. Source and destination must have the same
// type and the destination must have the same size as mapx.
5235 cvRemap( const CvArr* srcarr, CvArr* dstarr,
5236 const CvArr* _mapx, const CvArr* _mapy,
5237 int flags, CvScalar fillval )
// dst0 keeps a second header to the destination so a reallocation can be
// detected after the call.
5239 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
5240 cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
5241 CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
// Only the INTER_MAX bits of flags are passed as the interpolation method;
// CV_WARP_FILL_OUTLIERS selects a constant border, otherwise transparent.
5242 cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
5243 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
// The result must have been written into the caller's buffer.
5245 CV_Assert( dst0.data == dst.data );
// C-API wrapper around cv::getRotationMatrix2D: the computed matrix is
// converted into the caller-supplied `matrix`, which must have the same size.
5250 cv2DRotationMatrix( CvPoint2D32f center, double angle,
5251 double scale, CvMat* matrix )
5253 cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
5254 CV_Assert( M.size() == M0.size() );
// Converts to the element type of the caller's matrix.
5255 M.convertTo(M0, M0.type());
// C-API wrapper around cv::getPerspectiveTransform: the computed matrix is
// converted into the caller-supplied `matrix`, which must have the same size.
5261 cvGetPerspectiveTransform( const CvPoint2D32f* src,
5262 const CvPoint2D32f* dst,
// The CvPoint2D32f arrays are reinterpreted as cv::Point2f arrays.
5265 cv::Mat M0 = cv::cvarrToMat(matrix),
5266 M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5267 CV_Assert( M.size() == M0.size() );
// Converts to the element type of the caller's matrix.
5268 M.convertTo(M0, M0.type());
// C-API wrapper around cv::getAffineTransform: the computed matrix is
// converted into the caller-supplied `matrix`, which must have the same size.
5274 cvGetAffineTransform( const CvPoint2D32f* src,
5275 const CvPoint2D32f* dst,
// The CvPoint2D32f arrays are reinterpreted as cv::Point2f arrays.
5278 cv::Mat M0 = cv::cvarrToMat(matrix),
5279 M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5280 CV_Assert( M.size() == M0.size() );
// Converts to the element type of the caller's matrix.
5281 M.convertTo(M0, M0.type());
// C-API wrapper around cv::convertMaps. The first input and output maps are
// always wrapped; map2 and dstmap2 start out empty.
5287 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
5289 cv::Mat map1 = cv::cvarrToMat(arr1), map2;
5290 cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
// NOTE(review): the conditions guarding the next assignments are not visible
// in this excerpt (presumably null checks on arr2 / dstarr2).
5293 map2 = cv::cvarrToMat(arr2);
5296 dstmap2 = cv::cvarrToMat(dstarr2);
// A CV_16SC1 second output is re-wrapped as CV_16UC1 over the same data and
// step, without copying.
5297 if( dstmap2.type() == CV_16SC1 )
5298 dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
// The requested output map type is the type of the first destination map.
5301 cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
5304 /****************************************************************************************\
5305 * Log-Polar Transform *
5306 \****************************************************************************************/
5308 /* now it is done via Remap; more correct implementation should use
5309 some super-sampling technique outside of the "fovea" circle */
// Log-polar transform implemented by building two CV_32F coordinate maps of
// the destination size and calling cvRemap. `center` is the transform centre
// and M the magnitude scale; CV_WARP_INVERSE_MAP in flags selects which of the
// two map-building branches below is used.
5311 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
5312 CvPoint2D32f center, double M, int flags )
5314 cv::Ptr<CvMat> mapx, mapy;
5316 CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
5317 CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
5318 CvSize ssize, dsize;
// Source and destination must share the same type.
5320 if( !CV_ARE_TYPES_EQ( src, dst ))
5321 CV_Error( CV_StsUnmatchedFormats, "" );
5324 CV_Error( CV_StsOutOfRange, "M should be >0" );
5326 ssize = cvGetMatSize(src);
5327 dsize = cvGetMatSize(dst);
// The maps are owned by cv::Ptr and released automatically.
5329 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5330 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
// Branch taken without CV_WARP_INVERSE_MAP: destination column rho is a
// radius index and destination row phi an angle index.
5332 if( !(flags & CV_WARP_INVERSE_MAP) )
// Table of radii exp(rho/M), one entry per destination column.
5335 cv::AutoBuffer<double> _exp_tab(dsize.width);
5336 double* exp_tab = _exp_tab;
5338 for( rho = 0; rho < dst->width; rho++ )
5339 exp_tab[rho] = std::exp(rho/M);
5341 for( phi = 0; phi < dsize.height; phi++ )
// Row phi corresponds to the angle phi*2*pi/dsize.height.
5343 double cp = cos(phi*2*CV_PI/dsize.height);
5344 double sp = sin(phi*2*CV_PI/dsize.height);
5345 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5346 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5348 for( rho = 0; rho < dsize.width; rho++ )
// Cartesian coordinates of the point at radius r and the row's angle.
5350 double r = exp_tab[rho];
5351 double x = r*cp + center.x;
5352 double y = r*sp + center.y;
// Branch taken with CV_WARP_INVERSE_MAP: each destination pixel (x, y) is
// converted to (rho, phi) coordinates.
5362 CvMat bufx, bufy, bufp, bufa;
// Scale factor from an angle to a row index (angle * ssize.height / (2*pi)).
5363 double ascale = ssize.height/(2*CV_PI);
// One allocation split into four row buffers: x offsets, y offsets,
// magnitudes and angles.
5364 cv::AutoBuffer<float> _buf(4*dsize.width);
5367 bufx = cvMat( 1, dsize.width, CV_32F, buf );
5368 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5369 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5370 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
// x offsets from the centre are the same for every row, so fill them once.
5372 for( x = 0; x < dsize.width; x++ )
5373 bufx.data.fl[x] = (float)x - center.x;
5375 for( y = 0; y < dsize.height; y++ )
5377 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5378 float* my = (float*)(mapy->data.ptr + y*mapy->step);
5380 for( x = 0; x < dsize.width; x++ )
5381 bufy.data.fl[x] = (float)y - center.y;
// Magnitude and angle of every offset in the row.
5384 cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
// Adding 1 before the logarithm keeps log() defined at the centre.
5386 for( x = 0; x < dsize.width; x++ )
5387 bufp.data.fl[x] += 1.f;
5389 cvLog( &bufp, &bufp );
5391 for( x = 0; x < dsize.width; x++ )
5393 double rho = bufp.data.fl[x]*M;
5394 double phi = bufa.data.fl[x]*ascale;
// NOTE(review): this second loop recomputes log(magnitude + 1)*M and the
// angle with scalar math; presumably an alternative code path whose
// preprocessor guards are not visible in this excerpt.
5400 for( x = 0; x < dsize.width; x++ )
5402 double xx = bufx.data.fl[x];
5403 double yy = bufy.data.fl[x];
5405 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
5406 double a = atan2(yy,xx);
// Sample the source through the maps; the fill value is all zeros.
5418 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5421 void cv::logPolar( InputArray _src, OutputArray _dst,
5422 Point2f center, double M, int flags )
5424 Mat src = _src.getMat();
5425 _dst.create( src.size(), src.type() );
5426 CvMat c_src = src, c_dst = _dst.getMat();
5427 cvLogPolar( &c_src, &c_dst, center, M, flags );
5430 /****************************************************************************************
5431 Linear-Polar Transform
5432 J.L. Blanco, Apr 2009
5433 ****************************************************************************************/
// Linear-polar transform implemented by building two CV_32F coordinate maps
// of the destination size and calling cvRemap. `center` is the transform
// centre and maxRadius the radius scale; CV_WARP_INVERSE_MAP in flags selects
// which of the two map-building branches below is used.
// NOTE(review): no check that maxRadius is non-zero is visible before it is
// used as a divisor in pscale — confirm callers guarantee this.
5435 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
5436 CvPoint2D32f center, double maxRadius, int flags )
5438 cv::Ptr<CvMat> mapx, mapy;
// The cast initialisers are overwritten by the cvGetMat calls just below.
5440 CvMat srcstub, *src = (CvMat*)srcarr;
5441 CvMat dststub, *dst = (CvMat*)dstarr;
5442 CvSize ssize, dsize;
5444 src = cvGetMat( srcarr, &srcstub,0,0 );
5445 dst = cvGetMat( dstarr, &dststub,0,0 );
// Source and destination must share the same type.
5447 if( !CV_ARE_TYPES_EQ( src, dst ))
5448 CV_Error( CV_StsUnmatchedFormats, "" );
5450 ssize.width = src->cols;
5451 ssize.height = src->rows;
5452 dsize.width = dst->cols;
5453 dsize.height = dst->rows;
// The maps are owned by cv::Ptr and released automatically.
5455 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5456 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
// Branch taken without CV_WARP_INVERSE_MAP: destination column rho is a
// radius index and destination row phi an angle index.
5458 if( !(flags & CV_WARP_INVERSE_MAP) )
5462 for( phi = 0; phi < dsize.height; phi++ )
// Row phi corresponds to the angle phi*2*pi/dsize.height.
5464 double cp = cos(phi*2*CV_PI/dsize.height);
5465 double sp = sin(phi*2*CV_PI/dsize.height);
5466 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5467 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5469 for( rho = 0; rho < dsize.width; rho++ )
// Radius grows linearly with the column; the last column reaches maxRadius.
5471 double r = maxRadius*(rho+1)/dsize.width;
5472 double x = r*cp + center.x;
5473 double y = r*sp + center.y;
// Branch taken with CV_WARP_INVERSE_MAP: each destination pixel (x, y) is
// converted to (rho, phi) coordinates.
5483 CvMat bufx, bufy, bufp, bufa;
// Scale factors from angle to row index and from radius to column index.
5484 const double ascale = ssize.height/(2*CV_PI);
5485 const double pscale = ssize.width/maxRadius;
// One allocation split into four row buffers: x offsets, y offsets,
// magnitudes and angles.
5487 cv::AutoBuffer<float> _buf(4*dsize.width);
5490 bufx = cvMat( 1, dsize.width, CV_32F, buf );
5491 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5492 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5493 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
// x offsets from the centre are the same for every row, so fill them once.
5495 for( x = 0; x < dsize.width; x++ )
5496 bufx.data.fl[x] = (float)x - center.x;
5498 for( y = 0; y < dsize.height; y++ )
5500 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5501 float* my = (float*)(mapy->data.ptr + y*mapy->step);
5503 for( x = 0; x < dsize.width; x++ )
5504 bufy.data.fl[x] = (float)y - center.y;
// Magnitude and angle of every offset in the row.
5506 cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
// NOTE(review): 1 is added to every magnitude before scaling, mirroring the
// log-polar code where it protects log(); confirm the offset is intended for
// the linear mapping.
5508 for( x = 0; x < dsize.width; x++ )
5509 bufp.data.fl[x] += 1.f;
5511 for( x = 0; x < dsize.width; x++ )
5513 double rho = bufp.data.fl[x]*pscale;
5514 double phi = bufa.data.fl[x]*ascale;
// Sample the source through the maps; the fill value is all zeros.
5521 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5524 void cv::linearPolar( InputArray _src, OutputArray _dst,
5525 Point2f center, double maxRadius, int flags )
5527 Mat src = _src.getMat();
5528 _dst.create( src.size(), src.type() );
5529 CvMat c_src = src, c_dst = _dst.getMat();
5530 cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );