1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 /* ////////////////////////////////////////////////////////////////////
45 // Geometrical transforms on images and matrices: rotation, zoom etc.
49 #include "precomp.hpp"
50 #include "opencl_kernels_imgproc.hpp"
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
// One-time IPP runtime-dispatch initialization at load time; the returned
// status is intentionally unused.
static IppStatus sts = ippInit();
#if IPP_VERSION_X100 >= 701
// Function-pointer signatures of the IPP >= 7.1 resize entry points, used to
// select the concrete ippiResize_* variant at runtime.
typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
// NOTE(review): the trailing "&& 0" compiles this IPP warp/set path out;
// the typedefs below are kept for reference only.
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
// Fill an IPP image region with a per-channel constant taken from a
// cv::Scalar, via the supplied ippiSet_*_C3R/C4R function pointer.
// Returns true when the wrapped IPP call reports success (status >= 0).
template <int channels, typename Type>
bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    // Convert the Scalar's doubles to the target pixel type with saturation.
    Type values[channels];
    for( int i = 0; i < channels; i++ )
        values[i] = saturate_cast<Type>(value[i]);
    return func(values, dataPointer, step, size) >= 0;
// Fill an image region with a constant border value using the IPP set
// primitives, dispatching on channel count and depth.
// Returns true on IPP success so the caller can fall back otherwise.
static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
        // Single channel: call the scalar ippiSet_* variant directly.
        return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
        return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
        return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
        // Three channels: go through the IPPSetSimple helper.
        return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
        return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
        return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
    else if( channels == 4 )
        // Four-channel variants.
        return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
        return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
        return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
/************** interpolation formulas and tables ***************/

// Fixed-point precision used by the resize kernels (11 fractional bits).
const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

// Fixed-point precision used by the remap kernels (15 fractional bits).
const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;

// Nearest-neighbor rounding deltas per sub-pixel offset.
static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

// 2x2 bilinear kernels, float and fixed-point, one per sub-pixel offset pair.
static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
// 4-channel-interleaved bilinear fixed-point table for the SIMD kernels.
// The buffer is over-allocated by 2 entries so it can be 16-byte aligned.
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);

// 4x4 bicubic kernels.
static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

// 8x8 Lanczos-4 kernels.
static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
// Linear (tent) interpolation weights for a sample at fractional offset
// x in [0,1): coeffs[0] weights the left neighbor, coeffs[1] the right one.
// The two weights always sum to exactly 1.
// (Body restored: the garbled listing dropped the function's interior lines.)
static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}
// Cubic-convolution interpolation weights (Keys' kernel with A = -0.75, the
// classic bicubic) for a sample at fractional offset x in [0,1).
// Produces 4 tap weights; the last is derived so the kernel sums to 1.
// (Enclosing braces restored: the garbled listing dropped them.)
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    // Force exact partition of unity despite float rounding.
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
// Lanczos-4 interpolation weights (8 taps) for fractional offset x.
// Uses the angle-addition identity sin(y0 + i*pi/4) expanded via the cs[]
// table so all eight sinc lobes come from one sin/cos evaluation.
static inline void interpolateLanczos4( float x, float* coeffs )
    // sin(45deg) == cos(45deg); cs[i] = {cos(i*pi/4), sin(i*pi/4)} up to sign.
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    // Near-integer position: degenerate to a single unit tap
    // (NOTE(review): the branch body is elided in this listing).
    if( x < FLT_EPSILON )
        for( int i = 0; i < 8; i++ )

    // General case: evaluate the windowed sinc at each of the 8 taps.
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));

    // Final pass over the taps — presumably normalization so the weights
    // sum to 1; the loop body is elided in this listing (TODO confirm).
    for(int i = 0; i < 8; i++ )
// Fill a 1-D interpolation weight table: tabsz entries, each holding ksize
// weights, where ksize is 2 (linear), 4 (cubic) or 8 (lanczos4).
// 'tab' must have room for tabsz*ksize floats.
static void initInterTab1D(int method, float* tab, int tabsz)
    float scale = 1.f/tabsz;   // fractional step between successive entries
    if( method == INTER_LINEAR )
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    else if( method == INTER_CUBIC )
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    else if( method == INTER_LANCZOS4 )
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
        // Fallback (else branch): any other method is rejected.
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
// Build (lazily, once per method) the 2-D separable interpolation tables:
// float kernels ("tab") and fixed-point short kernels ("itab", scaled by
// INTER_REMAP_COEF_SCALE). Returns a pointer to the fixed-point table when
// 'fixpt' is true, otherwise to the float table.
static const void* initInterTab2D( int method, bool fixpt )
    static bool inittab[INTER_MAX+1] = {false};   // per-method "built" flags
    // Select destination tables and kernel size for this method.
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        // Outer product of the 1-D weights yields the 2-D kernel for every
        // (i,j) sub-pixel offset pair.
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
                // Nearest-neighbor rounding deltas for this sub-pixel offset.
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
                for( k1 = 0; k1 < ksize; k1++ )
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        // Track the fixed-point sum to correct rounding drift.
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                if( isum != INTER_REMAP_COEF_SCALE )
                    // Push the accumulated rounding error into the extremal
                    // central coefficients so each fixed-point kernel sums
                    // exactly to INTER_REMAP_COEF_SCALE.
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                            // Locate min (mk) and max (Mk) central coefficients.
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                    // Apply the correction (branch structure elided in listing).
                    itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
        // Rewind the cursors back to the table starts before publishing.
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
            // Replicate the bilinear fixed-point weights 4x for the
            // interleaved 4-channel SIMD remap kernel.
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
        inittab[method] = true;
    return fixpt ? (const void*)itab : (const void*)tab;
// Eagerly build every interpolation table (both float and fixed-point
// variants) so the first resize/remap call does not pay the setup cost.
static bool initAllInterTab2D()
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );

// volatile so the static-initialization call cannot be optimized away.
static volatile bool doInitAllInterTab2D = initAllInterTab2D();
// Functor converting ST to DT with saturation; used by the resize/remap
// kernels as the non-fixed-point output cast.
template<typename ST, typename DT> struct Cast
    DT operator()(ST val) const { return saturate_cast<DT>(val); }
// Functor converting a fixed-point accumulator to DT: adds DELTA (half of
// the scale) for round-to-nearest, shifts out the 'bits' fractional bits,
// then saturates to the destination type.
template<typename ST, typename DT, int bits> struct FixedPtCast
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
321 /****************************************************************************************\
323 \****************************************************************************************/
// Parallel body for nearest-neighbor resize: each worker maps its range of
// destination rows back to source rows (via ify) and copies pixels using the
// precomputed per-column byte offsets in x_ofs, with a size-specialized copy
// loop per pixel size (the switch-on-pix_size arms are elided in this listing).
class resizeNNInvoker :
    public ParallelLoopBody
    // _x_ofs: per-destination-column byte offset into a source row.
    // _pix_size4: pixel size in sizeof(int) units, for the generic fallback.
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),

    virtual void operator() (const Range& range) const
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
            uchar* D = dst.data + dst.step*y;
            // Source row for this destination row, clamped to the last row.
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            // 1-byte pixels, unrolled by two.
            for( x = 0; x <= dsize.width - 2; x += 2 )
                uchar t0 = S[x_ofs[x]];
                uchar t1 = S[x_ofs[x+1]];
            for( ; x < dsize.width; x++ )

            // 2-byte pixels.
            for( x = 0; x < dsize.width; x++ )
                *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);

            // 3-byte pixels.
            for( x = 0; x < dsize.width; x++, D += 3 )
                const uchar* _tS = S + x_ofs[x];
                D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];

            // 4-byte pixels.
            for( x = 0; x < dsize.width; x++ )
                *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);

            // 6-byte pixels (e.g. 3 x 16-bit channels).
            for( x = 0; x < dsize.width; x++, D += 6 )
                const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                ushort* _tD = (ushort*)D;
                _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];

            // 8-byte pixels, copied as two ints.
            for( x = 0; x < dsize.width; x++, D += 8 )
                const int* _tS = (const int*)(S + x_ofs[x]);
                _tD[0] = _tS[0]; _tD[1] = _tS[1];

            // 12-byte pixels, copied as three ints.
            for( x = 0; x < dsize.width; x++, D += 12 )
                const int* _tS = (const int*)(S + x_ofs[x]);
                _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];

            // Generic fallback: copy pix_size4 ints per pixel.
            for( x = 0; x < dsize.width; x++, D += pix_size )
                const int* _tS = (const int*)(S + x_ofs[x]);
                for( int k = 0; k < pix_size4; k++ )

    int* x_ofs, pix_size4;

    // Non-copyable: workers share the same Mat headers and offset table.
    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
// Nearest-neighbor resize driver: precomputes the per-destination-column
// source byte offsets once, then runs resizeNNInvoker over all destination
// rows in parallel.
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;   // inverse scales: dst -> src mapping

    for( x = 0; x < dsize.width; x++ )
        int sx = cvFloor(x*ifx);
        // Clamp to the last source column and convert to a byte offset.
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    // Split the work into chunks of roughly 2^16 destination pixels.
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
    // Do-nothing vector operator: reports 0 elements processed so the
    // caller's scalar loop handles the entire row.
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }

    // Same idea for the two-plane (remap-style) overload.
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
// SSE2 vertical pass of bilinear resize: blends two rows of int32
// accumulators with the fixed-point beta weights and packs to 8-bit output.
// Returns the number of elements processed; the caller finishes the tail
// with scalar code.
struct VResizeLinearVec_32s8u
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE2) )

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];

        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);   // rounding bias for the final >>2

        // Aligned fast path when both source rows are 16-byte aligned.
        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
                __m128i x0, x1, x2, y0, y1, y2;
                // Pre-shift by 4 so the 32-bit sums fit 16 bits for mulhi.
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                // Weighted blend in 16-bit fixed point (mulhi keeps high bits).
                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                // Round, descale and saturate-pack to unsigned 8-bit.
                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));

        // Unaligned path: identical math with unaligned loads.
            for( ; x <= width - 16; x += 16 )
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));

        // 4-wide tail for leftovers of the 16-wide loops.
        for( ; x < width - 4; x += 4 )
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
// SSE2 vertical pass of bilinear resize for float rows packed to 16-bit
// output. 'shiftval' biases the signed pack: SHRT_MIN for unsigned ushort
// output (presumably emulating unsigned saturation via the signed pack —
// TODO confirm), 0 for signed short output.
template<int shiftval> struct VResizeLinearVec_32f16
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE2) )

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        // Aligned fast path.
        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
                __m128 x0, x1, y0, y1;

                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                // Weighted blend of the two rows, then bias + pack to 16-bit.
                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);

        // Unaligned path: same math with unaligned loads.
            for( ; x <= width - 16; x += 16 )
                __m128 x0, x1, y0, y1;

                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);

        // 4-wide tail.
        for( ; x < width - 4; x += 4 )
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);

// SHRT_MIN bias selects the ushort variant; 0 the short variant.
typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
// SSE vertical pass of bilinear resize for float rows with float output:
// dst = S0*beta0 + S1*beta1, 8 elements per iteration.
struct VResizeLinearVec_32f
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE) )

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        // Aligned fast path.
        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);

        // Unaligned path.
            for( ; x <= width - 8; x += 8 )
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
// SSE2 vertical pass of bicubic resize: combines four rows of int32
// accumulators with float beta weights (pre-divided by the squared resize
// scale) and packs to 8-bit output.
struct VResizeCubicVec_32s8u
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE2) )

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];

        // Fold the two fixed-point stages (horizontal and vertical) into the
        // float weights so one multiply removes both scales.
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        // Aligned fast path over all four rows.
        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                // Round to int and saturate-pack down to unsigned 8-bit.
                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));

        // Unaligned path: identical math with unaligned loads.
            for( ; x <= width - 8; x += 8 )
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
// SSE2 vertical pass of bicubic resize for float rows packed to 16-bit
// output; 'shiftval' plays the same signed/unsigned-pack role as in
// VResizeLinearVec_32f16 above.
template<int shiftval> struct VResizeCubicVec_32f16
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE2) )

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
            __m128 x0, x1, y0, y1, s0, s1;

            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            // Accumulate the 4-tap weighted sum of the four rows.
            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            // Bias, pack to 16-bit, un-bias, store.
            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);

// SHRT_MIN bias selects the ushort variant; 0 the short variant.
typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
// SSE vertical pass of bicubic resize with float output:
// dst = S0*b0 + S1*b1 + S2*b2 + S3*b3, 8 elements per iteration.
struct VResizeCubicVec_32f
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        if( !checkHardwareSupport(CV_CPU_SSE) )

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);

// No vectorized Lanczos-4 vertical pass on this code path: fall back to
// the scalar implementation via VResizeNoVec.
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;
// NEON vertical pass of bilinear resize: blends two int32 accumulator rows
// into 8-bit output, 16 elements per iteration. Mirrors the SSE2 variant's
// fixed-point scheme (pre-shift by 4, doubling multiply-high, round, >>2).
struct VResizeLinearVec_32s8u
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
            // Pre-shift so the accumulators fit 16 bits for the doubling mulhi.
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            // vqdmulh doubles the product, so shift right once to compensate.
            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            // Saturate to unsigned 8-bit and store all 16 results at once.
            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
// NEON vertical pass of bilinear resize: float rows blended and rounded to
// unsigned 16-bit output, 8 elements per iteration.
struct VResizeLinearVec_32f16u
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            // Fused multiply-accumulate: S0*b0 + S1*b1.
            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            // Round to u32, saturate-narrow to u16, store.
            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
// NEON vertical pass of bilinear resize: float rows blended and rounded to
// signed 16-bit output, 8 elements per iteration.
struct VResizeLinearVec_32f16s
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            // S0*b0 + S1*b1 via fused multiply-accumulate.
            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            // Round to s32, saturate-narrow to s16, store.
            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
// NEON vertical pass of bilinear resize with float output:
// dst = S0*b0 + S1*b1, 8 elements per iteration.
struct VResizeLinearVec_32f
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));

// No NEON int32->u8 bicubic vertical pass: use the scalar fallback.
typedef VResizeNoVec VResizeCubicVec_32s8u;
// NEON vertical pass of bicubic resize: four float rows combined with the
// beta weights, rounded and saturated to unsigned 16-bit output.
struct VResizeCubicVec_32f16u
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
            v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
            // Chained multiply-accumulate: b0*S0 + b1*S1 + b2*S2 + b3*S3.
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                               v_b1, vld1q_f32(S1 + x + 4)),
                                                     v_b2, vld1q_f32(S2 + x + 4)),
                                           v_b3, vld1q_f32(S3 + x + 4));

            // Round to u32, saturate-narrow to u16, store.
            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
// NEON vertical pass of bicubic resize: four float rows combined with the
// beta weights, rounded and saturated to signed 16-bit output.
struct VResizeCubicVec_32f16s
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
            v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
            // Chained multiply-accumulate: b0*S0 + b1*S1 + b2*S2 + b3*S3.
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                               v_b1, vld1q_f32(S1 + x + 4)),
                                                     v_b2, vld1q_f32(S2 + x + 4)),
                                           v_b3, vld1q_f32(S3 + x + 4));

            // Round to s32, saturate-narrow to s16, store.
            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
// Vertical pass of bicubic resize, NEON path, float rows -> float row.
// No narrowing/rounding needed: stores the weighted sum of the 4 source rows
// directly, 8 floats per iteration.
// NOTE(review): chunk is elided — braces, x initialization and return are
// not visible here.
1088 struct VResizeCubicVec_32f
1090     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1092         const float** src = (const float**)_src;
1093         const float* beta = (const float*)_beta;
1094         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1095         float* dst = (float*)_dst;
1097         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1098             v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
1100         for( ; x <= width - 8; x += 8 )
1102             vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1103                                                               v_b1, vld1q_f32(S1 + x)),
1104                                                    v_b2, vld1q_f32(S2 + x)),
1105                                          v_b3, vld1q_f32(S3 + x)));
1106             vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1107                                                                   v_b1, vld1q_f32(S1 + x + 4)),
1108                                                        v_b2, vld1q_f32(S2 + x + 4)),
1109                                              v_b3, vld1q_f32(S3 + x + 4)));
// Vertical pass of Lanczos4 resize, NEON path, float rows -> ushort row.
// Lanczos4 uses 8 source rows S0..S7 with weights beta[0..7]; the 8-tap sum
// is split into two 4-tap partial sums (v_dst0 for rows 0-3, v_dst1 for rows
// 4-7) that are then added, for each half of an 8-pixel step.
// NOTE(review): chunk is elided — braces, x initialization and return are
// not visible here.
1116 struct VResizeLanczos4Vec_32f16u
1118     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1120         const float** src = (const float**)_src;
1121         const float* beta = (const float*)_beta;
1122         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1123                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1124         ushort * dst = (ushort*)_dst;
1126         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1127                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
1128                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
1129                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
1131         for( ; x <= width - 8; x += 8 )
1133             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1134                                                                 v_b1, vld1q_f32(S1 + x)),
1135                                                      v_b2, vld1q_f32(S2 + x)),
1136                                            v_b3, vld1q_f32(S3 + x));
1137             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
1138                                                                 v_b5, vld1q_f32(S5 + x)),
1139                                                      v_b6, vld1q_f32(S6 + x)),
1140                                            v_b7, vld1q_f32(S7 + x));
// v_dst holds the completed 8-tap sum for lanes x..x+3.
1141             float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
// Second half of the step: lanes x+4..x+7 reuse v_dst0/v_dst1 as scratch.
1143             v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1144                                                     v_b1, vld1q_f32(S1 + x + 4)),
1145                                          v_b2, vld1q_f32(S2 + x + 4)),
1146                                v_b3, vld1q_f32(S3 + x + 4));
1147             v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
1148                                                     v_b5, vld1q_f32(S5 + x + 4)),
1149                                          v_b6, vld1q_f32(S6 + x + 4)),
1150                                v_b7, vld1q_f32(S7 + x + 4));
1151             v_dst1 = vaddq_f32(v_dst0, v_dst1);
// Round and saturate both halves to ushort in one 8-lane store.
1153             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
1154                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
// Vertical pass of Lanczos4 resize, NEON path, float rows -> signed short row.
// Identical structure to VResizeLanczos4Vec_32f16u (8-tap weighted sum of 8
// rows, computed as two 4-tap halves), but rounds to s32 and saturation-
// narrows to s16.
// NOTE(review): chunk is elided — braces, x initialization and return are
// not visible here.
1161 struct VResizeLanczos4Vec_32f16s
1163     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1165         const float** src = (const float**)_src;
1166         const float* beta = (const float*)_beta;
1167         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1168                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1169         short * dst = (short*)_dst;
1171         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1172                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
1173                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
1174                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
1176         for( ; x <= width - 8; x += 8 )
1178             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1179                                                                 v_b1, vld1q_f32(S1 + x)),
1180                                                      v_b2, vld1q_f32(S2 + x)),
1181                                            v_b3, vld1q_f32(S3 + x));
1182             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
1183                                                                 v_b5, vld1q_f32(S5 + x)),
1184                                                      v_b6, vld1q_f32(S6 + x)),
1185                                            v_b7, vld1q_f32(S7 + x));
1186             float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
1188             v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
1189                                                     v_b1, vld1q_f32(S1 + x + 4)),
1190                                          v_b2, vld1q_f32(S2 + x + 4)),
1191                                v_b3, vld1q_f32(S3 + x + 4));
1192             v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
1193                                                     v_b5, vld1q_f32(S5 + x + 4)),
1194                                          v_b6, vld1q_f32(S6 + x + 4)),
1195                                v_b7, vld1q_f32(S7 + x + 4));
1196             v_dst1 = vaddq_f32(v_dst0, v_dst1);
1198             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
1199                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
// Vertical pass of Lanczos4 resize, NEON path, float rows -> float row.
// Stores the 8-tap weighted sum of S0..S7 directly; processes 4 floats per
// iteration (narrower step than the 16u/16s variants since no narrowing
// pairs two vectors).
// NOTE(review): chunk is elided — braces, x initialization and return are
// not visible here.
1206 struct VResizeLanczos4Vec_32f
1208     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
1210         const float** src = (const float**)_src;
1211         const float* beta = (const float*)_beta;
1212         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
1213                     *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
1214         float* dst = (float*)_dst;
1216         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
1217                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
1218                     v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
1219                     v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
1221         for( ; x <= width - 4; x += 4 )
1223             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
1224                                                                 v_b1, vld1q_f32(S1 + x)),
1225                                                      v_b2, vld1q_f32(S2 + x)),
1226                                            v_b3, vld1q_f32(S3 + x));
1227             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
1228                                                                 v_b5, vld1q_f32(S5 + x)),
1229                                                      v_b6, vld1q_f32(S6 + x)),
1230                                            v_b7, vld1q_f32(S7 + x));
1231             vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
// Scalar fallbacks: when no SIMD path is compiled in, all vertical/horizontal
// resize "vector ops" alias the no-op implementations (which return 0, i.e.
// "no pixels processed", leaving all work to the scalar loops).
// NOTE(review): the surrounding #if/#elif/#endif preprocessor guards are
// elided in this chunk — presumably these typedefs sit in the non-SIMD
// branch; verify against the full file.
1240 typedef VResizeNoVec VResizeLinearVec_32s8u;
1241 typedef VResizeNoVec VResizeLinearVec_32f16u;
1242 typedef VResizeNoVec VResizeLinearVec_32f16s;
1243 typedef VResizeNoVec VResizeLinearVec_32f;
1245 typedef VResizeNoVec VResizeCubicVec_32s8u;
1246 typedef VResizeNoVec VResizeCubicVec_32f16u;
1247 typedef VResizeNoVec VResizeCubicVec_32f16s;
1248 typedef VResizeNoVec VResizeCubicVec_32f;
1250 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
1251 typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
1252 typedef VResizeNoVec VResizeLanczos4Vec_32f;
// Horizontal linear resize has no SIMD implementation in this chunk at all.
1256 typedef HResizeNoVec HResizeLinearVec_8u32s;
1257 typedef HResizeNoVec HResizeLinearVec_16u32f;
1258 typedef HResizeNoVec HResizeLinearVec_16s32f;
1259 typedef HResizeNoVec HResizeLinearVec_32f;
1260 typedef HResizeNoVec HResizeLinearVec_64f;
// Horizontal pass of bilinear resize. For each of `count` source rows,
// produces a row of intermediate (buf_type) values:
//   D[dx] = S[sx]*alpha[2*dx] + S[sx+cn]*alpha[2*dx+1]
// where sx indexes via xofs (the xofs[dx] read itself is elided in this
// chunk — presumably `int sx = xofs[dx];` — verify against the full file).
// Rows are processed two at a time for better locality, with a tail loop
// for the last row when count is odd. Past xmax (right border), pixels are
// replicated: D[dx] = S[sx]*ONE (ONE is the fixed-point unit scale).
// NOTE(review): chunk is elided — braces, loop variable declarations and
// the `VecOp vecOp` member are not visible here.
1263 template<typename T, typename WT, typename AT, int ONE, class VecOp>
1264 struct HResizeLinear
1266     typedef T value_type;
1267     typedef WT buf_type;
1268     typedef AT alpha_type;
1270     void operator()(const T** src, WT** dst, int count,
1271                     const int* xofs, const AT* alpha,
1272                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
// Let the SIMD op process a prefix of each row; dx0 is where scalar code resumes.
1277         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
1278             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
1280         for( k = 0; k <= count - 2; k++ )
1282             const T *S0 = src[k], *S1 = src[k+1];
1283             WT *D0 = dst[k], *D1 = dst[k+1];
1284             for( dx = dx0; dx < xmax; dx++ )
1287                 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
1288                 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
1289                 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
1290                 D0[dx] = t0; D1[dx] = t1;
// Right-border replication for both rows.
1293             for( ; dx < dwidth; dx++ )
1296                 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
// Tail: odd remaining row processed alone.
1300         for( ; k < count; k++ )
1302             const T *S = src[k];
1304             for( dx = 0; dx < xmax; dx++ )
1307                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
1310             for( ; dx < dwidth; dx++ )
1311                 D[dx] = WT(S[xofs[dx]]*ONE);
// Vertical pass of bilinear resize: blends two intermediate rows S0, S1 with
// weights b0 = beta[0], b1 = beta[1] and casts each result to the destination
// type via CastOp. VecOp handles a SIMD prefix; the scalar loop (4x unrolled
// under CV_ENABLE_UNROLLED) finishes the row.
// NOTE(review): chunk is elided — braces, the CastOp/t0/t1 declarations and
// the closing #endif of the unrolled section are not visible here.
1317 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1318 struct VResizeLinear
1320     typedef T value_type;
1321     typedef WT buf_type;
1322     typedef AT alpha_type;
1324     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1326         WT b0 = beta[0], b1 = beta[1];
1327         const WT *S0 = src[0], *S1 = src[1];
1331         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1332         #if CV_ENABLE_UNROLLED
1333         for( ; x <= width - 4; x += 4 )
1336             t0 = S0[x]*b0 + S1[x]*b1;
1337             t1 = S0[x+1]*b0 + S1[x+1]*b1;
1338             dst[x] = castOp(t0); dst[x+1] = castOp(t1);
1339             t0 = S0[x+2]*b0 + S1[x+2]*b1;
1340             t1 = S0[x+3]*b0 + S1[x+3]*b1;
1341             dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
1344         for( ; x < width; x++ )
1345             dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
// Fixed-point specialization of the vertical bilinear pass for the common
// 8-bit case (intermediate rows are int, weights are short). Each output is
//   dst[x] = (((b0*(S0[x]>>4))>>16) + ((b1*(S1[x]>>4))>>16) + 2) >> 2
// i.e. the intermediate values are pre-shifted down 4 bits so the 32-bit
// products don't overflow, then the two >>16 partial results are combined
// with +2 rounding and a final >>2.
// NOTE(review): chunk is elided — `template<>` line, braces and the closing
// #endif of the unrolled section are not visible here.
1350 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
1352     typedef uchar value_type;
1353     typedef int buf_type;
1354     typedef short alpha_type;
1356     void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
1358         alpha_type b0 = beta[0], b1 = beta[1];
1359         const buf_type *S0 = src[0], *S1 = src[1];
1360         VResizeLinearVec_32s8u vecOp;
1362         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1363         #if CV_ENABLE_UNROLLED
1364         for( ; x <= width - 4; x += 4 )
1366             dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
1367             dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
1368             dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
1369             dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
1372         for( ; x < width; x++ )
1373             dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
// Horizontal pass of bicubic resize (4-tap). The struct name line is elided
// in this chunk — presumably `struct HResizeCubic`; verify against the full
// file. For each row:
//   * dx < xmin (left border): gather the 4 taps one by one, clamping each
//     tap index sxj into [0, swidth) by stepping by cn (border replication);
//   * xmin <= dx < xmax (interior): direct 4-tap dot product
//     D[dx] = S[sx-cn]*a0 + S[sx]*a1 + S[sx+cn]*a2 + S[sx+cn*2]*a3.
// The right-border handling and the limit/loop plumbing between the two
// phases are elided here.
1378 template<typename T, typename WT, typename AT>
1381     typedef T value_type;
1382     typedef WT buf_type;
1383     typedef AT alpha_type;
1385     void operator()(const T** src, WT** dst, int count,
1386                     const int* xofs, const AT* alpha,
1387                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1389         for( int k = 0; k < count; k++ )
1391             const T *S = src[k];
1393             int dx = 0, limit = xmin;
1396             for( ; dx < limit; dx++, alpha += 4 )
1398                 int j, sx = xofs[dx] - cn;
1400                 for( j = 0; j < 4; j++ )
1402                     int sxj = sx + j*cn;
// Clamp out-of-range tap to the nearest valid pixel of the same channel.
1403                     if( (unsigned)sxj >= (unsigned)swidth )
1407                         while( sxj >= swidth )
1410                     v += S[sxj]*alpha[j];
1414             if( limit == dwidth )
1416             for( ; dx < xmax; dx++, alpha += 4 )
1419                 D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
1420                     S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
// Vertical pass of bicubic resize (struct name line elided — presumably
// `struct VResizeCubic`; verify against the full file). Blends 4 intermediate
// rows with weights beta[0..3], casting each sum via CastOp. VecOp processes
// a SIMD prefix; the scalar loop finishes the row.
1430 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1433     typedef T value_type;
1434     typedef WT buf_type;
1435     typedef AT alpha_type;
1437     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1439         WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
1440         const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
1444         int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1445         for( ; x < width; x++ )
1446             dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
// Horizontal pass of Lanczos4 resize (8-tap). Mirrors HResizeCubic's
// structure but with 8 taps centered at sx (offsets -3*cn .. +4*cn) and 8
// weights per destination pixel:
//   * left border (dx < xmin): per-tap clamping of sxj into [0, swidth);
//   * interior (xmin <= dx < xmax): direct 8-tap dot product.
// Right-border handling and some loop plumbing are elided in this chunk.
1451 template<typename T, typename WT, typename AT>
1452 struct HResizeLanczos4
1454     typedef T value_type;
1455     typedef WT buf_type;
1456     typedef AT alpha_type;
1458     void operator()(const T** src, WT** dst, int count,
1459                     const int* xofs, const AT* alpha,
1460                     int swidth, int dwidth, int cn, int xmin, int xmax ) const
1462         for( int k = 0; k < count; k++ )
1464             const T *S = src[k];
1466             int dx = 0, limit = xmin;
1469             for( ; dx < limit; dx++, alpha += 8 )
1471                 int j, sx = xofs[dx] - cn*3;
1473                 for( j = 0; j < 8; j++ )
1475                     int sxj = sx + j*cn;
1476                     if( (unsigned)sxj >= (unsigned)swidth )
1480                         while( sxj >= swidth )
1483                     v += S[sxj]*alpha[j];
1487             if( limit == dwidth )
1489             for( ; dx < xmax; dx++, alpha += 8 )
1492                 D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
1493                     S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
1494                     S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
1495                     S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
// Vertical pass of Lanczos4 resize: 8-tap weighted sum of 8 intermediate
// rows. VecOp processes a SIMD prefix; the unrolled scalar loop accumulates
// 4 output pixels at a time across the 8 rows; the tail loop writes the
// fully-expanded 8-term expression per pixel.
// NOTE(review): chunk is elided — braces, the CastOp/beta `b` declarations
// and the unrolled section's #endif are not visible here.
1505 template<typename T, typename WT, typename AT, class CastOp, class VecOp>
1506 struct VResizeLanczos4
1508     typedef T value_type;
1509     typedef WT buf_type;
1510     typedef AT alpha_type;
1512     void operator()(const WT** src, T* dst, const AT* beta, int width ) const
1516         int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
1517         #if CV_ENABLE_UNROLLED
1518         for( ; x <= width - 4; x += 4 )
// Initialize 4 accumulators from row 0, then fold in rows 1..7.
1521             const WT* S = src[0];
1522             WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
1524             for( k = 1; k < 8; k++ )
1526                 b = beta[k]; S = src[k];
1527                 s0 += S[x]*b; s1 += S[x+1]*b;
1528                 s2 += S[x+2]*b; s3 += S[x+3]*b;
1531             dst[x] = castOp(s0); dst[x+1] = castOp(s1);
1532             dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
1535         for( ; x < width; x++ )
1537             dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
1538                 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
1539                 src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
// Clamp x into the half-open range [a, b): values below a map to a, values at
// or above b map to b-1. Used to clamp source-row indices to the image height.
// (Assumes a < b; for a >= b the result is a, matching the original ternary.)
// NOTE(review): reconstructed as a well-formed function — the braces and the
// embedded extraction artifacts of this chunk were removed; logic unchanged:
//   return x >= a ? (x < b ? x : b-1) : a;
static inline int clip(int x, int a, int b)
{
    if( x < a )
        return a;
    return x < b ? x : b - 1;
}
1550 static const int MAX_ESIZE=16;
// Parallel body driving a generic separable resize: for each destination row
// range, it horizontally resizes the needed ksize source rows into a
// row buffer (hresize), then vertically blends them into the destination
// (vresize). Already-computed intermediate rows are reused across successive
// dy via the prev_sy[] cache to avoid redundant horizontal passes.
// NOTE(review): chunk is elided — braces, several declarations (hresize/
// vresize instances, prev_sy initialization) and some statements are not
// visible here.
1552 template <typename HResize, typename VResize>
1553 class resizeGeneric_Invoker :
1554     public ParallelLoopBody
1557     typedef typename HResize::value_type T;
1558     typedef typename HResize::buf_type WT;
1559     typedef typename HResize::alpha_type AT;
1561     resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
1562         const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
1563         int _ksize, int _xmin, int _xmax) :
1564         ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
1565         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
1566         ksize(_ksize), xmin(_xmin), xmax(_xmax)
// The fixed-size srows/rows/prev_sy arrays below require ksize <= MAX_ESIZE.
1568         CV_Assert(ksize <= MAX_ESIZE);
// GCC 4.8 emits spurious -Warray-bounds warnings on the array accesses below.
1571 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1572 # pragma GCC diagnostic push
1573 # pragma GCC diagnostic ignored "-Warray-bounds"
1575     virtual void operator() (const Range& range) const
1577         int dy, cn = src.channels();
// One bufstep-aligned scratch row per kernel tap.
1581         int bufstep = (int)alignSize(dsize.width, 16);
1582         AutoBuffer<WT> _buffer(bufstep*ksize);
1583         const T* srows[MAX_ESIZE]={0};
1584         WT* rows[MAX_ESIZE]={0};
1585         int prev_sy[MAX_ESIZE];
1587         for(int k = 0; k < ksize; k++ )
1590             rows[k] = (WT*)_buffer + bufstep*k;
// beta holds ksize vertical weights per destination row.
1593         const AT* beta = _beta + ksize * range.start;
1595         for( dy = range.start; dy < range.end; dy++, beta += ksize )
1597             int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
1599             for(int k = 0; k < ksize; k++ )
// Clamp the tap's source row into the image; border rows are replicated.
1601                 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
1602                 for( k1 = std::max(k1, k); k1 < ksize; k1++ )
1604                     if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
1607                         memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
1612                     k0 = std::min(k0, k); // remember the first row that needs to be computed
1613                 srows[k] = src.template ptr<T>(sy);
// Horizontally resize only the rows not served from the cache, then blend.
1618             hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
1619                     ssize.width, dsize.width, cn, xmin, xmax );
1620             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
1623 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
1624 # pragma GCC diagnostic pop
1630     const int* xofs, *yofs;
1631     const AT* alpha, *_beta;
1633     const int ksize, xmin, xmax;
// Non-copyable (const members make assignment ill-formed anyway).
1635     resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
// Entry point for the generic separable resize: wraps the precomputed
// interpolation tables (xofs/yofs offsets, alpha/beta weights) into a
// resizeGeneric_Invoker and runs it over all destination rows in parallel.
// The nstripes hint dst.total()/2^16 keeps per-stripe work reasonably sized.
1638 template<class HResize, class VResize>
1639 static void resizeGeneric_( const Mat& src, Mat& dst,
1640                             const int* xofs, const void* _alpha,
1641                             const int* yofs, const void* _beta,
1642                             int xmin, int xmax, int ksize )
1644     typedef typename HResize::alpha_type AT;
1646     const AT* beta = (const AT*)_beta;
1647     Size ssize = src.size(), dsize = dst.size();
1648     int cn = src.channels();
1653     // image resize is a separable operation. In case of not too strong
1655     Range range(0, dsize.height);
1656     resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
1657         ssize, dsize, ksize, xmin, xmax);
1658     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// No-op SIMD stand-in for the fast (integer-scale) area resize: both
// constructor overloads accept and ignore the configuration, and operator()
// processes nothing (its `return 0;` body is elided in this chunk), leaving
// all work to the scalar fallback loops.
1661 template <typename T, typename WT>
1662 struct ResizeAreaFastNoVec
1664     ResizeAreaFastNoVec(int, int) { }
1665     ResizeAreaFastNoVec(int, int, int, int) { }
1666     int operator() (const T*, T*, int) const
// NEON 2x2 area-average downscale for 8-bit pixels: each output is
// (p00 + p01 + p10 + p11 + 2) >> 2 over a 2x2 source block from rows S0/S1.
// Two vectorized paths are visible; the cn-dependent branch conditions
// selecting between them are elided in this chunk — presumably the first
// (vld2q_u8 de-interleave) serves cn==1 and the second serves cn==4;
// verify against the full file.
1672 class ResizeAreaFastVec_SIMD_8u
1675     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1676         cn(_cn), step(_step)
1680     int operator() (const uchar* S, uchar* D, int w) const
1683         const uchar* S0 = S, * S1 = S0 + step;
// +2 before >>2 implements round-half-up on the 4-pixel sum.
1685         uint16x8_t v_2 = vdupq_n_u16(2);
1689             for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
// De-interleave even/odd columns, widen to u16 and sum horizontally+vertically.
1691                 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
1693                 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
1694                 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
1695                 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
1697                 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
1698                 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
1699                 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
1701                 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
1706             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1708                 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
1710                 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
1711                 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
1712                 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
1713                 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
// Pair adjacent 4-lane groups so each lane holds a 2x2 block's 4-pixel sum.
1715                 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
1716                                            vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
1717                 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
1718                                            vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
1719                 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
1721                 vst1_u8(D, vmovn_u16(v_dst));
// NEON 2x2 area-average downscale for 16-bit unsigned pixels, same rounding
// scheme as the 8u version ((sum + 2) >> 2) but accumulating in u32.
// The cn-dependent branch selecting between the two loops is elided in this
// chunk — presumably de-interleaving path for cn==1, grouped path for cn==4.
1732 class ResizeAreaFastVec_SIMD_16u
1735     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1736         cn(_cn), step(_step)
1740     int operator() (const ushort * S, ushort * D, int w) const
// step is in bytes, hence the uchar* arithmetic for the second row.
1743         const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
1745         uint32x4_t v_2 = vdupq_n_u32(2);
1749             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1751                 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
1753                 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
1754                 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
1755                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
1757                 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
1758                 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
1759                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
1761                 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
1766             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1768                 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
1769                 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
1770                                              vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
1771                 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
// NEON 2x2 area-average downscale for 16-bit signed pixels — the signed
// twin of ResizeAreaFastVec_SIMD_16u, accumulating in s32 with the same
// (sum + 2) >> 2 rounding. cn-dependent branch conditions are elided in
// this chunk.
1782 class ResizeAreaFastVec_SIMD_16s
1785     ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
1786         cn(_cn), step(_step)
1790     int operator() (const short * S, short * D, int w) const
1793         const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
1795         int32x4_t v_2 = vdupq_n_s32(2);
1799             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1801                 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
1803                 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
1804                 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
1805                 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
1807                 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
1808                 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
1809                 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
1811                 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
1816             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1818                 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
1819                 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
1820                                             vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
1821                 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
// NEON 2x2 area-average downscale for float pixels: averages each 2x2 block
// by multiplying the 4-pixel sum with 0.25f (no rounding bias needed for
// floats). fast_mode restricts use to exact 2x2 scaling with 1/3/4 channels.
// cn-dependent branch conditions between the two loops are elided in this
// chunk — presumably de-interleaving path for cn==1, grouped path for cn==4.
1832 struct ResizeAreaFastVec_SIMD_32f
1834     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
1835         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
1837         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
1840     int operator() (const float * S, float * D, int w) const
1845         const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
1848             float32x4_t v_025 = vdupq_n_f32(0.25f);
1852             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1854                 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
1856                 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
1857                 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
1859                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1864             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1866                 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
1867                 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
1869                 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1877     int scale_x, scale_y;
// SSE2 variant of the 2x2 area-average downscale for 8-bit pixels.
// Guarded at runtime by checkHardwareSupport(CV_CPU_SSE2); use_simd gates
// the vector paths. Three visible paths — the cn-dependent conditions
// selecting among them are elided in this chunk (presumably cn==1, cn==3,
// cn==4 in that order; verify against the full file).
1885 class ResizeAreaFastVec_SIMD_8u
1888     ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
1889         cn(_cn), step(_step)
1891         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1894     int operator() (const uchar* S, uchar* D, int w) const
1900         const uchar* S0 = S;
1901         const uchar* S1 = S0 + step;
1902         __m128i zero = _mm_setzero_si128();
1903         __m128i delta2 = _mm_set1_epi16(2);
// Path 1 (single channel): split each row into even/odd bytes via mask+shift,
// giving horizontal pair sums; add the two rows, +2 round, >>2, pack to u8.
1907             __m128i masklow = _mm_set1_epi16(0x00ff);
1908             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1910                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1911                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1913                 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
1914                 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
1915                 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
1916                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1918                 _mm_storel_epi64((__m128i*)D, s0);
// Path 2 (3 channels): each iteration consumes two 2x6-byte blocks (two RGB
// pixel pairs per row) and produces 6 output bytes; the byte shifts by 6
// align the second pixel's channels under the first for the pair sum.
1922             for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
1924                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1925                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1927                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1928                 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
1929                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1930                 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
1932                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
1933                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
1934                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1935                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1936                 _mm_storel_epi64((__m128i*)D, s0);
1938                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
1939                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
1940                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1941                 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
1942                 _mm_storel_epi64((__m128i*)(D+3), s0);
// Path 3 (4 channels): widen to u16, shift-by-8 pairs adjacent RGBA pixels,
// then blend the two 4-byte results into one 8-byte store via the mask.
1947             int v[] = { 0, 0, -1, -1 };
1948             __m128i mask = _mm_loadu_si128((const __m128i*)v);
1950             for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
1952                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
1953                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
1955                 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
1956                 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
1957                 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
1958                 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
1960                 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
1961                 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
1962                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1963                 __m128i res0 = _mm_srli_epi16(s0, 2);
1965                 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
1966                 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
1967                 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
1968                 __m128i res1 = _mm_srli_epi16(s0, 2);
1969                 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
1970                                                    _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
1971                 _mm_storel_epi64((__m128i*)(D), s0);
// SSE2 variant of the 2x2 area-average downscale for 16-bit unsigned pixels.
// SSE2 lacks an unsigned 32->16 saturating pack, so a local macro emulates
// _mm_packus_epi32 by sign-extending the low 16 bits and using the signed
// pack (valid here because averaged values fit in 16 bits).
// cn-dependent branch conditions among the three visible loops are elided
// in this chunk (presumably cn==1, cn==3, cn==4; verify against the full
// file).
1984 class ResizeAreaFastVec_SIMD_16u
1987     ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
1988         cn(_cn), step(_step)
1990         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1993     int operator() (const ushort* S, ushort* D, int w) const
1999         const ushort* S0 = (const ushort*)S;
2000         const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
2001         __m128i masklow = _mm_set1_epi32(0x0000ffff);
2002         __m128i zero = _mm_setzero_si128();
2003         __m128i delta2 = _mm_set1_epi32(2);
2005 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
// Path 1 (single channel): even/odd u16 lanes summed via mask+shift.
2009             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2011                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2012                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2014                 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
2015                 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
2016                 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
2017                 s0 = _mm_srli_epi32(s0, 2);
2018                 s0 = _mm_packus_epi32(s0, zero);
2020                 _mm_storel_epi64((__m128i*)D, s0);
// Path 2 (3 channels): byte-shift by 6 aligns the second pixel's channels
// under the first; 3 output values per iteration.
2024             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
2026                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2027                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2029                 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
2030                 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
2031                 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
2032                 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
2034                 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
2035                 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
2036                 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
2037                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2038                 _mm_storel_epi64((__m128i*)D, s0);
// Path 3 (4 channels): unpack lo/hi halves pairs adjacent RGBA pixels.
2043             for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2045                 __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
2046                 __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
2048                 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
2049                 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
2050                 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
2051                 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
2053                 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
2054                 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
2055                 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
2056                 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
2057                 _mm_storel_epi64((__m128i*)D, s0);
// Keep the emulation macro local to this class.
2061 #undef _mm_packus_epi32
// Scalar no-op fallbacks for the fast area-resize SIMD ops on platforms
// without the corresponding vector paths. NOTE(review): the #if/#elif/#endif
// guards partitioning these typedefs are elided in this chunk — the first
// pair presumably completes the SSE2 branch (which lacks 16s/32f versions)
// and the second group is the no-SIMD branch; verify against the full file.
2072 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
2073 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
2077 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
2078 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
2079 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
2080 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
// Scalar wrapper around a SIMD 2x2 area-average op: the vector op processes a
// prefix, then per-channel scalar loops finish the row with the same
// (p00 + p01 + p10 + p11 + 2) >> 2 average. Separate loops are visible for
// cn==1, cn==3 and cn==4; the branch conditions selecting among them and the
// per-iteration `index` computation are elided in this chunk (presumably
// index = dx*2 scaled by cn — verify against the full file).
// fast_mode restricts use to exact 2x2 scaling with 1/3/4 channels.
2084 template<typename T, typename SIMDVecOp>
2085 struct ResizeAreaFastVec
2087     ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
2088         scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
2090         fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
2093     int operator() (const T* S, T* D, int w) const
2098         const T* nextS = (const T*)((const uchar*)S + step);
2099         int dx = vecOp(S, D, w);
2102             for( ; dx < w; ++dx )
2105                 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
2108             for( ; dx < w; dx += 3 )
2111                 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
2112                 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
2113                 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
2118             for( ; dx < w; dx += 4 )
2121                 D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
2122                 D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
2123                 D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
2124                 D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
2132     int scale_x, scale_y;
// Parallel body for INTER_AREA resize with integer scale factors ("fast"
// path). For each destination row:
//   * interior pixels (dx < w): average the scale_x*scale_y source block via
//     precomputed intra-block offsets ofs[] and per-column starts xofs[];
//     the SIMD VecOp handles a prefix when the block is exactly 2x2;
//   * boundary pixels (right/bottom edge where the block sticks out): average
//     only the in-image samples, counting them explicitly.
// NOTE(review): chunk is elided — braces, several declarations (dy/dx/sum/
// count increments) and some statements are not visible here.
2139 template <typename T, typename WT, typename VecOp>
2140 class resizeAreaFast_Invoker :
2141     public ParallelLoopBody
2144     resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
2145         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
2146         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
2147         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
2151     virtual void operator() (const Range& range) const
2153         Size ssize = src.size(), dsize = dst.size();
2154         int cn = src.channels();
2155         int area = scale_x*scale_y;
2156         float scale = 1.f/(area);
// dwidth1: number of destination columns whose source block lies fully inside.
2157         int dwidth1 = (ssize.width/scale_x)*cn;
2162         VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
2164         for( dy = range.start; dy < range.end; dy++ )
2166             T* D = (T*)(dst.data + dst.step*dy);
2167             int sy0 = dy*scale_y;
// w==0 when the block extends past the bottom: the whole row is "boundary".
2168             int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
2170             if( sy0 >= ssize.height )
2172                 for( dx = 0; dx < dsize.width; dx++ )
// Fast interior path: SIMD prefix, then scalar block sums via ofs[].
2177             dx = vop(src.template ptr<T>(sy0), D, w);
2178             for( ; dx < w; dx++ )
2180                 const T* S = src.template ptr<T>(sy0) + xofs[dx];
2183                 #if CV_ENABLE_UNROLLED
2184                 for( ; k <= area - 4; k += 4 )
2185                     sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
2187                 for( ; k < area; k++ )
2190                 D[dx] = saturate_cast<T>(sum * scale);
// Boundary path: average only the samples that fall inside the image.
2193             for( ; dx < dsize.width; dx++ )
2196                 int count = 0, sx0 = xofs[dx];
2197                 if( sx0 >= ssize.width )
2200                 for( int sy = 0; sy < scale_y; sy++ )
2202                     if( sy0 + sy >= ssize.height )
2204                     const T* S = src.template ptr<T>(sy0 + sy) + sx0;
2205                     for( int sx = 0; sx < scale_x*cn; sx += cn )
2207                         if( sx0 + sx >= ssize.width )
2214                 D[dx] = saturate_cast<T>((float)sum/count);
2222     int scale_x, scale_y;
2223     const int *ofs, *xofs;
// Entry point for the integer-scale area resize: runs resizeAreaFast_Invoker
// over all destination rows in parallel (same nstripes heuristic as
// resizeGeneric_).
2226 template<typename T, typename WT, typename VecOp>
2227 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
2228                              int scale_x, int scale_y )
2230     Range range(0, dst.rows);
2231     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
2232         scale_y, ofs, xofs);
2233     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2236 struct DecimateAlpha
// Parallel body for the general (non-integer-scale) INTER_AREA resize using
// precomputed contribution tables: xtab maps source columns to destination
// columns with weights, ytab does the same for rows, and tabofs gives the
// range of ytab entries per destination-row stripe. For each contributing
// source row: horizontally accumulate it into buf[], then fold buf into the
// running vertical accumulator sum[] with the row weight beta; when the
// destination row index changes, flush sum[] to the output.
// NOTE(review): chunk is elided — braces, the cn-dependent branch conditions
// selecting among the unrolled horizontal loops, the dy!=prev_dy test, and
// some assignments are not visible here.
2243 template<typename T, typename WT> class ResizeArea_Invoker :
2244     public ParallelLoopBody
2247     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
2248                         const DecimateAlpha* _xtab, int _xtab_size,
2249                         const DecimateAlpha* _ytab, int _ytab_size,
2250                         const int* _tabofs )
2255         xtab_size0 = _xtab_size;
2257         ytab_size = _ytab_size;
2261     virtual void operator() (const Range& range) const
2263         Size dsize = dst->size();
2264         int cn = dst->channels();
// buf: horizontal accumulator for the current source row;
// sum: vertical accumulator for the current destination row.
2266         AutoBuffer<WT> _buffer(dsize.width*2);
2267         const DecimateAlpha* xtab = xtab0;
2268         int xtab_size = xtab_size0;
2269         WT *buf = _buffer, *sum = buf + dsize.width;
2270         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
2272         for( dx = 0; dx < dsize.width; dx++ )
2275         for( j = j_start; j < j_end; j++ )
2277             WT beta = ytab[j].alpha;
2278             int dy = ytab[j].di;
2279             int sy = ytab[j].si;
2282                 const T* S = src->template ptr<T>(sy);
2283                 for( dx = 0; dx < dsize.width; dx++ )
// Generic single-channel horizontal accumulation.
2287                     for( k = 0; k < xtab_size; k++ )
2289                         int dxn = xtab[k].di;
2290                         WT alpha = xtab[k].alpha;
2291                         buf[dxn] += S[xtab[k].si]*alpha;
// Unrolled variants for cn==2 (2 components per entry)...
2294                 for( k = 0; k < xtab_size; k++ )
2296                     int sxn = xtab[k].si;
2297                     int dxn = xtab[k].di;
2298                     WT alpha = xtab[k].alpha;
2299                     WT t0 = buf[dxn] + S[sxn]*alpha;
2300                     WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2301                     buf[dxn] = t0; buf[dxn+1] = t1;
// ...cn==3 (3 components)...
2304                 for( k = 0; k < xtab_size; k++ )
2306                     int sxn = xtab[k].si;
2307                     int dxn = xtab[k].di;
2308                     WT alpha = xtab[k].alpha;
2309                     WT t0 = buf[dxn] + S[sxn]*alpha;
2310                     WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2311                     WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
2312                     buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
// ...cn==4 (4 components)...
2316                 for( k = 0; k < xtab_size; k++ )
2318                     int sxn = xtab[k].si;
2319                     int dxn = xtab[k].di;
2320                     WT alpha = xtab[k].alpha;
2321                     WT t0 = buf[dxn] + S[sxn]*alpha;
2322                     WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
2323                     buf[dxn] = t0; buf[dxn+1] = t1;
2324                     t0 = buf[dxn+2] + S[sxn+2]*alpha;
2325                     t1 = buf[dxn+3] + S[sxn+3]*alpha;
2326                     buf[dxn+2] = t0; buf[dxn+3] = t1;
// ...and the generic any-cn inner loop.
2331                 for( k = 0; k < xtab_size; k++ )
2333                     int sxn = xtab[k].si;
2334                     int dxn = xtab[k].di;
2335                     WT alpha = xtab[k].alpha;
2336                     for( int c = 0; c < cn; c++ )
2337                         buf[dxn + c] += S[sxn + c]*alpha;
// New destination row started: flush sum[] (previous row's total) to the
// output and restart the vertical accumulation with the current row.
2344                 T* D = dst->template ptr<T>(prev_dy);
2346                 for( dx = 0; dx < dsize.width; dx++ )
2348                     D[dx] = saturate_cast<T>(sum[dx]);
2349                     sum[dx] = beta*buf[dx];
// Same destination row: just accumulate.
2355                 for( dx = 0; dx < dsize.width; dx++ )
2356                     sum[dx] += beta*buf[dx];
// Final flush for the last destination row of the stripe.
2361             T* D = dst->template ptr<T>(prev_dy);
2362             for( dx = 0; dx < dsize.width; dx++ )
2363                 D[dx] = saturate_cast<T>(sum[dx]);
2370     const DecimateAlpha* xtab0;
2371     const DecimateAlpha* ytab;
2372     int xtab_size0, ytab_size;
// Entry point for true INTER_AREA resize: runs ResizeArea_Invoker over the
// full destination-row range, ~64K output pixels of work per stripe.
// xtab/ytab are DecimateAlpha weight tables built by computeResizeAreaTab;
// tabofs maps each destination row to its span in ytab.
2377 template <typename T, typename WT>
2378 static void resizeArea_( const Mat& src, Mat& dst,
2379 const DecimateAlpha* xtab, int xtab_size,
2380 const DecimateAlpha* ytab, int ytab_size,
2383 parallel_for_(Range(0, dst.rows),
2384 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
2385 dst.total()/((double)(1 << 16)));
// Dispatch signature for the generic (linear/cubic/lanczos4) resize kernels.
2389 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
2390 const int* xofs, const void* alpha,
2391 const int* yofs, const void* beta,
2392 int xmin, int xmax, int ksize );
// Dispatch signature for the integer-scale "fast" INTER_AREA kernels.
2394 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
2395 const int* ofs, const int *xofs,
2396 int scale_x, int scale_y );
// Dispatch signature for the true (weight-table driven) INTER_AREA kernels.
2398 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
2399 const DecimateAlpha* xtab, int xtab_size,
2400 const DecimateAlpha* ytab, int ytab_size,
// Builds the DecimateAlpha weight table for one axis of true INTER_AREA
// resize. Each destination cell covers the source interval [fsx1, fsx2);
// one entry is emitted per overlapped source pixel with
// alpha = (overlap fraction) / cellWidth, so the weights of a cell sum
// to 1. 'cn' stretches the stored indices to interleaved-channel offsets.
// Presumably returns the number of entries written (k) — the function's
// tail is elided from this excerpt.
2404 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
2407 for(int dx = 0; dx < dsize; dx++ )
2409 double fsx1 = dx * scale;
2410 double fsx2 = fsx1 + scale;
2411 double cellWidth = std::min(scale, ssize - fsx1);
2413 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2415 sx2 = std::min(sx2, ssize - 1);
2416 sx1 = std::min(sx1, sx2);
// Partially covered leftmost source pixel (1e-3 guards float noise).
2418 if( sx1 - fsx1 > 1e-3 )
2420 assert( k < ssize*2 );
2421 tab[k].di = dx * cn;
2422 tab[k].si = (sx1 - 1) * cn;
2423 tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
// Fully covered interior source pixels.
2426 for(int sx = sx1; sx < sx2; sx++ )
2428 assert( k < ssize*2 );
2429 tab[k].di = dx * cn;
2430 tab[k].si = sx * cn;
2431 tab[k++].alpha = float(1.0 / cellWidth);
// Partially covered rightmost source pixel.
2434 if( fsx2 - sx2 > 1e-3 )
2436 assert( k < ssize*2 );
2437 tab[k].di = dx * cn;
2438 tab[k].si = sx2 * cn;
2439 tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
// Helper macros for the IPP resize path. They are only valid inside
// IPPresizeInvoker methods: they reference the local/member names 'ok',
// 'func', 'specBuf', 'pSpec', 'mode', 'srcSize', 'dstSize', 'specSize',
// 'initSize', 'getBufferSizeFunc' and 'getSrcOffsetFunc', and bail out of
// the enclosing void function on failure.
// Abort the enclosing method, flagging failure, when an IPP call fails.
2445 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
// Select the IPP linear-resize function for TYPE/CN, allocate the resize
// spec and initialize it (32f spec variant).
2447 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
2448 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2449 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2450 specBuf.allocate(specSize);\
2451 pSpec = (uchar*)specBuf;\
2452 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
// 64f variant: only linear is supported (cubic is rejected up front); also
// binds the buffer-size and source-offset query functions.
2454 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
2455 if (mode == (int)ippCubic) { *ok = false; return; } \
2456 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
2457 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2458 specBuf.allocate(specSize);\
2459 pSpec = (uchar*)specBuf;\
2460 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
2461 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
2462 getSrcOffsetFunc = (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
// Cubic variant: B=0, C=0.75 matches OpenCV's bicubic kernel; needs an
// extra init working buffer of 'initSize' bytes.
2464 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
2465 func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
2466 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
2467 specBuf.allocate(specSize);\
2468 pSpec = (uchar*)specBuf;\
2469 AutoBuffer<uchar> buf(initSize);\
2470 uchar* pInit = (uchar*)buf;\
2471 CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
// Dispatch on 'mode' (linear/cubic) and bind the query functions; any
// other mode aborts with failure.
2473 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
2474 if (mode == (int)ippLinear) { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
2475 else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
2476 else { *ok = false; return; } \
2477 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
2478 getSrcOffsetFunc = (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
2480 #if IPP_VERSION_X100 >= 701
// Parallel body delegating resize stripes to Intel IPP. The constructor
// picks the IPP function for the Mat type and builds the resize spec;
// operator() maps its source-row range to a destination ROI, queries the
// matching source offset, and invokes the IPP resize on that stripe.
// Failures set *ok = false so the caller can fall back to the plain path.
// NOTE(review): braces, the switch(type) header, #endif lines and some
// members (src, dst, scales, pSpec, mode, ok) are elided in this excerpt.
2481 class IPPresizeInvoker :
2482 public ParallelLoopBody
2485 IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
2486 ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
2487 inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
2488 func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
2491 IppiSize srcSize, dstSize;
2492 int type = src.type(), specSize = 0, initSize = 0;
2493 srcSize.width = src.cols;
2494 srcSize.height = src.rows;
2495 dstSize.width = dst.cols;
2496 dstSize.height = dst.rows;
// Per-type selection of the IPP function + spec initialization.
2500 #if 0 // disabled since it breaks tests for CascadeClassifier
2501 case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break;
2502 case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break;
2503 case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break;
2505 case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
2506 case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
2507 case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
2508 case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
2509 case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
2510 case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
2511 case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
2512 case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
2513 case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
2514 case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
2515 case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
2516 case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
2517 default: { *ok = false; return; } break;
2525 virtual void operator() (const Range& range) const
2530 int cn = src.channels();
// Map the worker's source-row range [start, end) onto destination rows.
2531 int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
2532 int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols);
2533 int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
2535 IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
2536 IppiSize dstSize = { dstwidth, dstheight - dsty };
2537 int bufsize = 0, itemSize = (int)src.elemSize1();
// Ask IPP which source offset corresponds to this destination ROI.
2539 CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
2540 CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
2542 const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
2543 Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
// 32-byte-aligned scratch buffer of the size IPP requested.
2545 AutoBuffer<uchar> buf(bufsize + 64);
2546 uchar* bufptr = alignPtr((uchar*)buf, 32);
2548 if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
2552 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2561 AutoBuffer<uchar> specBuf;
2563 ippiResizeFunc func;
2564 ippiResizeGetBufferSize getBufferSizeFunc;
2565 ippiResizeGetSrcOffset getSrcOffsetFunc;
// Non-copyable: declared, not defined.
2567 const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
// OpenCL counterpart of computeResizeAreaTab: fills flat map/alpha tables
// (source index + weight per entry) plus ofs_tab, which records the start
// of each destination cell's entry span for the kernel. Same partial/full
// coverage logic as the CPU version. NOTE(review): braces, the
// declarations of k/dx and the ofs_tab writes are elided in this excerpt.
2574 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
2575 float * const alpha_tab, int * const ofs_tab)
2578 for ( ; dx < dsize; dx++)
2582 double fsx1 = dx * scale;
2583 double fsx2 = fsx1 + scale;
2584 double cellWidth = std::min(scale, ssize - fsx1);
2586 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
2588 sx2 = std::min(sx2, ssize - 1);
2589 sx1 = std::min(sx1, sx2);
// Partial leftmost source element.
2591 if (sx1 - fsx1 > 1e-3)
2593 map_tab[k] = sx1 - 1;
2594 alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
// Fully covered source elements.
2597 for (int sx = sx1; sx < sx2; sx++)
2600 alpha_tab[k++] = float(1.0 / cellWidth);
// Partial rightmost source element.
2603 if (fsx2 - sx2 > 1e-3)
2606 alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
// OpenCL implementation of cv::resize. Returns true when a kernel was
// successfully created and enqueued, false to fall back to the CPU path.
// fx/fy are the (already-resolved) inverse scale factors dst/src; inv_fx,
// inv_fy below are therefore src/dst ("scale" in CPU-path terms).
// NOTE(review): braces, some early-exit checks and variable declarations
// (k, buf, cvt, ...) are elided in this excerpt.
2612 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
2613 double fx, double fy, int interpolation)
2615 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2617 double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
2618 float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
// BUGFIX: iscale_y was computed from inv_fx (copy-paste typo), breaking
// the integer-scale detection whenever fx != fy; it must use inv_fy.
2619 int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
2620 bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
2621 std::abs(inv_fy - iscale_y) < DBL_EPSILON;
2623 // in case of scale_x && scale_y is equal to 2
2624 // INTER_AREA (fast) also is equal to INTER_LINEAR
2625 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
2626 /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
2629 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
2630 (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
2633 UMat src = _src.getUMat();
2634 _dst.create(dsize, type);
2635 UMat dst = _dst.getUMat();
2637 Size ssize = src.size();
2639 size_t globalsize[] = { dst.cols, dst.rows };
2641 ocl::Image2D srcImage;
2643 // See if this could be done with a sampler. We stick with integer
2644 // datatypes because the observed error is low.
2645 bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
2646 ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
2647 ocl::Image2D::isFormatSupported(depth, cn, true) &&
2651 int wdepth = std::max(depth, CV_32S);
2653 cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
2654 "-D convertToDT=%s -D cn=%d",
2655 depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2656 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2658 k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
2664 // Convert the input into an OpenCL image type, using normalized channel data types
2665 // and aliasing the UMat.
2666 srcImage = ocl::Image2D(src, true, true);
2667 k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
2668 (float)inv_fx, (float)inv_fy);
// Buffer-based bilinear path (no image sampler available/selected).
2672 if (interpolation == INTER_LINEAR && !useSampler)
2676 // integer path is slower because of CPU part, so it's disabled
2677 if (depth == CV_8U && ((void)0, 0))
// Precompute fixed-point per-column/per-row coefficients on the host.
2679 AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
2680 int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
2681 short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
2685 for (int dx = 0; dx < dsize.width; dx++)
2687 fxx = (float)((dx+0.5)*inv_fx - 0.5);
2694 if (sx >= ssize.width-1)
2695 fxx = 0, sx = ssize.width-1;
2698 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
2699 ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
2702 for (int dy = 0; dy < dsize.height; dy++)
2704 fyy = (float)((dy+0.5)*inv_fy - 0.5);
2709 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
2710 ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
2713 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2715 Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
2717 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2718 format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
2719 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2720 "-D INTER_RESIZE_COEF_BITS=%d",
2721 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2722 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2723 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2724 cn, INTER_RESIZE_COEF_BITS));
2728 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2729 ocl::KernelArg::PtrReadOnly(coeffs));
// Floating-point bilinear: coefficients computed inside the kernel.
2733 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
2734 k.create("resizeLN", ocl::imgproc::resize_oclsrc,
2735 format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
2736 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
2737 "-D INTER_RESIZE_COEF_BITS=%d",
2738 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2739 ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
2740 ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
2741 cn, INTER_RESIZE_COEF_BITS));
2745 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2746 (float)inv_fx, (float)inv_fy);
2749 else if (interpolation == INTER_NEAREST)
2751 k.create("resizeNN", ocl::imgproc::resize_oclsrc,
2752 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
2753 ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
2757 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
2758 (float)inv_fx, (float)inv_fy);
2760 else if (interpolation == INTER_AREA)
2762 int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
2763 int wtype = CV_MAKE_TYPE(wdepth, cn);
2766 String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
2767 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
2768 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
2770 UMat alphaOcl, tabofsOcl, mapOcl;
// Integer scale factors: specialized fast-area kernel, scales baked in.
2775 int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
2776 buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
2777 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
2778 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
2779 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
2780 iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
2782 k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
// General area resize: host-built weight tables uploaded to the device.
2788 buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
2789 k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
2793 int xytab_size = (ssize.width + ssize.height) << 1;
2794 int tabofs_size = dsize.height + dsize.width + 2;
2796 AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
2797 AutoBuffer<float> _xyalpha_tab(xytab_size);
2798 int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
2799 float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
2800 int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
2802 ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
2803 ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
2805 // loading precomputed arrays to GPU
2806 Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
2807 Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
2808 Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
2811 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
2814 k.args(srcarg, dstarg);
2816 k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
2817 ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
2819 return k.run(2, globalsize, NULL, false);
2822 return k.run(2, globalsize, 0, false);
2829 //////////////////////////////////////////////////////////////////////////////////////////
// Public entry point for image resizing. Resolves dsize/scales, tries the
// OpenCL, Tegra and IPP accelerated paths, then dispatches the CPU
// implementation: nearest neighbor, fast integer-scale area, true area
// decimation, or the generic separable (linear/cubic/lanczos4) pipeline.
// NOTE(review): braces, array-initializer delimiters and several
// statements are elided from this excerpt.
2831 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
2832 double inv_scale_x, double inv_scale_y, int interpolation )
// Per-depth dispatch tables (index = CV_MAT_DEPTH). The 0 slots in
// area_tab mark unsupported depths (CV_8S, CV_32S, user type).
2834 static ResizeFunc linear_tab[] =
2837 HResizeLinear<uchar, int, short,
2838 INTER_RESIZE_COEF_SCALE,
2839 HResizeLinearVec_8u32s>,
2840 VResizeLinear<uchar, int, short,
2841 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2842 VResizeLinearVec_32s8u> >,
2845 HResizeLinear<ushort, float, float, 1,
2846 HResizeLinearVec_16u32f>,
2847 VResizeLinear<ushort, float, float, Cast<float, ushort>,
2848 VResizeLinearVec_32f16u> >,
2850 HResizeLinear<short, float, float, 1,
2851 HResizeLinearVec_16s32f>,
2852 VResizeLinear<short, float, float, Cast<float, short>,
2853 VResizeLinearVec_32f16s> >,
2856 HResizeLinear<float, float, float, 1,
2857 HResizeLinearVec_32f>,
2858 VResizeLinear<float, float, float, Cast<float, float>,
2859 VResizeLinearVec_32f> >,
2861 HResizeLinear<double, double, float, 1,
2863 VResizeLinear<double, double, float, Cast<double, double>,
2868 static ResizeFunc cubic_tab[] =
2871 HResizeCubic<uchar, int, short>,
2872 VResizeCubic<uchar, int, short,
2873 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2874 VResizeCubicVec_32s8u> >,
2877 HResizeCubic<ushort, float, float>,
2878 VResizeCubic<ushort, float, float, Cast<float, ushort>,
2879 VResizeCubicVec_32f16u> >,
2881 HResizeCubic<short, float, float>,
2882 VResizeCubic<short, float, float, Cast<float, short>,
2883 VResizeCubicVec_32f16s> >,
2886 HResizeCubic<float, float, float>,
2887 VResizeCubic<float, float, float, Cast<float, float>,
2888 VResizeCubicVec_32f> >,
2890 HResizeCubic<double, double, float>,
2891 VResizeCubic<double, double, float, Cast<double, double>,
2896 static ResizeFunc lanczos4_tab[] =
2898 resizeGeneric_<HResizeLanczos4<uchar, int, short>,
2899 VResizeLanczos4<uchar, int, short,
2900 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
2903 resizeGeneric_<HResizeLanczos4<ushort, float, float>,
2904 VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
2905 VResizeLanczos4Vec_32f16u> >,
2906 resizeGeneric_<HResizeLanczos4<short, float, float>,
2907 VResizeLanczos4<short, float, float, Cast<float, short>,
2908 VResizeLanczos4Vec_32f16s> >,
2910 resizeGeneric_<HResizeLanczos4<float, float, float>,
2911 VResizeLanczos4<float, float, float, Cast<float, float>,
2912 VResizeLanczos4Vec_32f> >,
2913 resizeGeneric_<HResizeLanczos4<double, double, float>,
2914 VResizeLanczos4<double, double, float, Cast<double, double>,
2919 static ResizeAreaFastFunc areafast_tab[] =
2921 resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
2923 resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
2924 resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
2926 resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
2927 resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
2931 static ResizeAreaFunc area_tab[] =
2933 resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
2934 resizeArea_<short, float>, 0, resizeArea_<float, float>,
2935 resizeArea_<double, double>, 0
// Resolve dsize from the scale factors, or the scale factors from dsize —
// exactly one of the two may be left unspecified.
2938 Size ssize = _src.size();
2940 CV_Assert( ssize.area() > 0 );
2941 CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
2942 if( dsize.area() == 0 )
2944 dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
2945 saturate_cast<int>(ssize.height*inv_scale_y));
2946 CV_Assert( dsize.area() > 0 );
2950 inv_scale_x = (double)dsize.width/ssize.width;
2951 inv_scale_y = (double)dsize.height/ssize.height;
// Try the OpenCL path for plain 2-D UMat destinations of reasonable size.
2954 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
2955 ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
2957 Mat src = _src.getMat();
2958 _dst.create(dsize, src.type());
2959 Mat dst = _dst.getMat();
// Tegra-accelerated path, when built in.
2961 #ifdef HAVE_TEGRA_OPTIMIZATION
2962 if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
2966 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// scale_* are src/dst ratios; is_area_fast detects exact integer downscale.
2967 double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
2968 int k, sx, sy, dx, dy;
2970 int iscale_x = saturate_cast<int>(scale_x);
2971 int iscale_y = saturate_cast<int>(scale_y);
2973 bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
2974 std::abs(scale_y - iscale_y) < DBL_EPSILON;
// IPP path: only when the given scales reproduce dsize almost exactly, for
// linear/cubic, excluding the 8-bit 2x2 case handled better below.
2976 #if IPP_VERSION_X100 >= 701
2979 #define IPP_RESIZE_EPS 1e-10
2981 double ex = fabs((double)dsize.width / src.cols - inv_scale_x) / inv_scale_x;
2982 double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y;
2984 if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
2985 (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2986 !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U))
2989 if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
2991 else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
2994 if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) &&
2995 (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
2996 (depth == CV_64F && mode == ippLinear)))
2999 Range range(0, src.rows);
3000 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
3001 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
3004 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// IPP failed somewhere: clear the error and fall through to the CPU path.
3007 setIppErrorStatus();
3010 #undef IPP_RESIZE_EPS
3014 if( interpolation == INTER_NEAREST )
3016 resizeNN( src, dst, inv_scale_x, inv_scale_y );
3021 // in case of scale_x && scale_y is equal to 2
3022 // INTER_AREA (fast) also is equal to INTER_LINEAR
3023 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
3024 interpolation = INTER_AREA;
3026 // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
3027 // In other cases it is emulated using some variant of bilinear interpolation
3028 if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
// Fast integer-scale area path: average over an iscale_x * iscale_y block.
3032 int area = iscale_x*iscale_y;
3033 size_t srcstep = src.step / src.elemSize1();
3034 AutoBuffer<int> _ofs(area + dsize.width*cn);
3036 int* xofs = ofs + area;
3037 ResizeAreaFastFunc func = areafast_tab[depth];
3038 CV_Assert( func != 0 );
// Element offsets of every sample inside one averaging block.
3040 for( sy = 0, k = 0; sy < iscale_y; sy++ )
3041 for( sx = 0; sx < iscale_x; sx++ )
3042 ofs[k++] = (int)(sy*srcstep + sx*cn);
// Per-destination-column starting offsets (channel-interleaved).
3044 for( dx = 0; dx < dsize.width; dx++ )
3048 for( k = 0; k < cn; k++ )
3049 xofs[j + k] = sx + k;
3052 func( src, dst, ofs, xofs, iscale_x, iscale_y );
// True area decimation: build per-axis DecimateAlpha weight tables plus
// the tabofs row-span index, then run the parallel area kernel.
3056 ResizeAreaFunc func = area_tab[depth];
3057 CV_Assert( func != 0 && cn <= 4 );
3059 AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
3060 DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
3062 int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
3063 int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
3065 AutoBuffer<int> _tabofs(dsize.height + 1);
3066 int* tabofs = _tabofs;
3067 for( k = 0, dy = 0; k < ytab_size; k++ )
3069 if( k == 0 || ytab[k].di != ytab[k-1].di )
3071 assert( ytab[k].di == dy );
3075 tabofs[dy] = ytab_size;
3077 func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
// Generic separable pipeline (linear/cubic/lanczos4, and INTER_AREA
// emulated via bilinear on upscale). fixpt selects the 8-bit fixed-point
// coefficient path.
3082 int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
3083 bool area_mode = interpolation == INTER_AREA;
3084 bool fixpt = depth == CV_8U;
3087 int ksize=0, ksize2;
3088 if( interpolation == INTER_CUBIC )
3089 ksize = 4, func = cubic_tab[depth];
3090 else if( interpolation == INTER_LANCZOS4 )
3091 ksize = 8, func = lanczos4_tab[depth];
3092 else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
3093 ksize = 2, func = linear_tab[depth];
3095 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
3098 CV_Assert( func != 0 );
// One flat buffer for x/y source offsets plus float (alpha/beta) and
// fixed-point (ialpha/ibeta) filter coefficients.
3100 AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
3101 int* xofs = (int*)(uchar*)_buffer;
3102 int* yofs = xofs + width;
3103 float* alpha = (float*)(yofs + dsize.height);
3104 short* ialpha = (short*)alpha;
3105 float* beta = alpha + width*ksize;
3106 short* ibeta = ialpha + width*ksize;
3107 float cbuf[MAX_ESIZE];
// Horizontal pass setup: source column + filter weights per dst column.
3109 for( dx = 0; dx < dsize.width; dx++ )
// Standard (non-area) mapping: sample at pixel centers.
3113 fx = (float)((dx+0.5)*scale_x - 0.5);
// Area-emulation mapping.
3119 sx = cvFloor(dx*scale_x);
3120 fx = (float)((dx+1) - (sx+1)*inv_scale_x);
3121 fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
// Clamp at the image borders; cubic/lanczos handle borders in-kernel.
3127 if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3131 if( sx + ksize2 >= ssize.width )
3133 xmax = std::min( xmax, dx );
3134 if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
3135 fx = 0, sx = ssize.width-1;
3138 for( k = 0, sx *= cn; k < cn; k++ )
3139 xofs[dx*cn + k] = sx + k;
3141 if( interpolation == INTER_CUBIC )
3142 interpolateCubic( fx, cbuf );
3143 else if( interpolation == INTER_LANCZOS4 )
3144 interpolateLanczos4( fx, cbuf );
// Store coefficients, replicated per channel (fixed-point or float).
3152 for( k = 0; k < ksize; k++ )
3153 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3154 for( ; k < cn*ksize; k++ )
3155 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
3159 for( k = 0; k < ksize; k++ )
3160 alpha[dx*cn*ksize + k] = cbuf[k];
3161 for( ; k < cn*ksize; k++ )
3162 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
// Vertical pass setup: source row + filter weights per dst row.
3166 for( dy = 0; dy < dsize.height; dy++ )
3170 fy = (float)((dy+0.5)*scale_y - 0.5);
3176 sy = cvFloor(dy*scale_y);
3177 fy = (float)((dy+1) - (sy+1)*inv_scale_y);
3178 fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
3182 if( interpolation == INTER_CUBIC )
3183 interpolateCubic( fy, cbuf );
3184 else if( interpolation == INTER_LANCZOS4 )
3185 interpolateLanczos4( fy, cbuf );
3194 for( k = 0; k < ksize; k++ )
3195 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
3199 for( k = 0; k < ksize; k++ )
3200 beta[dy*ksize + k] = cbuf[k];
// Run the selected separable kernel with the precomputed tables.
3204 func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
3205 fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
3209 /****************************************************************************************\
3210 * General warping (affine, perspective, remap) *
3211 \****************************************************************************************/
// Nearest-neighbor remap: for each destination pixel, copies the source
// pixel at the integer coordinates stored in the _xy map (interleaved
// short x,y pairs). Out-of-image coordinates are resolved per borderType;
// BORDER_TRANSPARENT leaves the destination pixel untouched. There is a
// specialized cn == 1 loop and a multi-channel loop with unrolled cn == 3
// and cn == 4 cases. NOTE(review): braces and a few statements (the
// BORDER_CONSTANT bodies, D1 setup) are elided in this excerpt.
3216 template<typename T>
3217 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
3218 int borderType, const Scalar& _borderValue )
3220 Size ssize = _src.size(), dsize = _dst.size();
3221 int cn = _src.channels();
3222 const T* S0 = _src.ptr<T>();
3223 size_t sstep = _src.step/sizeof(S0[0]);
// Border value converted to the element type, one entry per channel.
3224 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3225 saturate_cast<T>(_borderValue[1]),
3226 saturate_cast<T>(_borderValue[2]),
3227 saturate_cast<T>(_borderValue[3]));
// Unsigned compare performs the 0 <= v < size test in one comparison.
3230 unsigned width1 = ssize.width, height1 = ssize.height;
// Continuous data: process everything as one long row.
3232 if( _dst.isContinuous() && _xy.isContinuous() )
3234 dsize.width *= dsize.height;
3238 for( dy = 0; dy < dsize.height; dy++ )
3240 T* D = _dst.ptr<T>(dy);
3241 const short* XY = _xy.ptr<short>(dy);
// Single-channel fast path.
3245 for( dx = 0; dx < dsize.width; dx++ )
3247 int sx = XY[dx*2], sy = XY[dx*2+1];
3248 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
3249 D[dx] = S0[sy*sstep + sx];
3252 if( borderType == BORDER_REPLICATE )
3254 sx = clip(sx, 0, ssize.width);
3255 sy = clip(sy, 0, ssize.height);
3256 D[dx] = S0[sy*sstep + sx];
3258 else if( borderType == BORDER_CONSTANT )
3260 else if( borderType != BORDER_TRANSPARENT )
3262 sx = borderInterpolate(sx, ssize.width, borderType);
3263 sy = borderInterpolate(sy, ssize.height, borderType);
3264 D[dx] = S0[sy*sstep + sx];
// Multi-channel path.
3271 for( dx = 0; dx < dsize.width; dx++, D += cn )
3273 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3275 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
// cn == 3 unrolled copy.
3279 S = S0 + sy*sstep + sx*3;
3280 D[0] = S[0], D[1] = S[1], D[2] = S[2];
// cn == 4 unrolled copy.
3284 S = S0 + sy*sstep + sx*4;
3285 D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
// Generic channel-count copy.
3289 S = S0 + sy*sstep + sx*cn;
3290 for( k = 0; k < cn; k++ )
3294 else if( borderType != BORDER_TRANSPARENT )
3296 if( borderType == BORDER_REPLICATE )
3298 sx = clip(sx, 0, ssize.width);
3299 sy = clip(sy, 0, ssize.height);
3300 S = S0 + sy*sstep + sx*cn;
3302 else if( borderType == BORDER_CONSTANT )
3306 sx = borderInterpolate(sx, ssize.width, borderType);
3307 sy = borderInterpolate(sy, ssize.height, borderType);
3308 S = S0 + sy*sstep + sx*cn;
3310 for( k = 0; k < cn; k++ )
// No-op SIMD stub for remap: always reports 0 pixels processed so the
// scalar loop handles the whole row. (The enclosing struct's declaration
// is elided in this excerpt — presumably the non-vectorized fallback.)
3321 int operator()( const Mat&, void*, const short*, const ushort*,
3322 const void*, int ) const { return 0; }
3329 int operator()( const Mat& _src, void* _dst, const short* XY,
3330 const ushort* FXY, const void* _wtab, int width ) const
3332 int cn = _src.channels(), x = 0, sstep = (int)_src.step;
3334 if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
3338 const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
3339 const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
3340 uchar* D = (uchar*)_dst;
3341 __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
3342 __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
3343 __m128i z = _mm_setzero_si128();
3344 int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
3348 for( ; x <= width - 8; x += 8 )
3350 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3351 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
3352 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
3355 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3356 xy1 = _mm_madd_epi16( xy1, xy2ofs );
3357 _mm_store_si128( (__m128i*)iofs0, xy0 );
3358 _mm_store_si128( (__m128i*)iofs1, xy1 );
3360 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
3361 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
3362 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3363 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
3364 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
3365 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3366 v0 = _mm_unpacklo_epi8(v0, z);
3367 v1 = _mm_unpacklo_epi8(v1, z);
3369 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
3370 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
3371 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
3372 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
3373 b0 = _mm_unpacklo_epi64(a0, a1);
3374 b1 = _mm_unpackhi_epi64(a0, a1);
3375 v0 = _mm_madd_epi16(v0, b0);
3376 v1 = _mm_madd_epi16(v1, b1);
3377 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
3379 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
3380 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
3381 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3382 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
3383 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
3384 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
3385 v2 = _mm_unpacklo_epi8(v2, z);
3386 v3 = _mm_unpacklo_epi8(v3, z);
3388 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
3389 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
3390 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
3391 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
3392 b0 = _mm_unpacklo_epi64(a0, a1);
3393 b1 = _mm_unpackhi_epi64(a0, a1);
3394 v2 = _mm_madd_epi16(v2, b0);
3395 v3 = _mm_madd_epi16(v3, b1);
3396 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
3398 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
3399 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
3400 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
3401 _mm_storel_epi64( (__m128i*)(D + x), v0 );
3406 for( ; x <= width - 5; x += 4, D += 12 )
3408 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3409 __m128i u0, v0, u1, v1;
3411 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3412 _mm_store_si128( (__m128i*)iofs0, xy0 );
3413 const __m128i *w0, *w1;
3414 w0 = (const __m128i*)(wtab + FXY[x]*16);
3415 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3417 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3418 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
3419 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3420 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
3421 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3422 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
3423 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3424 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
3425 u0 = _mm_unpacklo_epi8(u0, z);
3426 v0 = _mm_unpacklo_epi8(v0, z);
3427 u1 = _mm_unpacklo_epi8(u1, z);
3428 v1 = _mm_unpacklo_epi8(v1, z);
3429 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3430 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3431 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3432 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3433 u0 = _mm_slli_si128(u0, 4);
3434 u0 = _mm_packs_epi32(u0, u1);
3435 u0 = _mm_packus_epi16(u0, u0);
3436 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
3438 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3439 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3441 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3442 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
3443 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3444 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
3445 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3446 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
3447 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3448 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
3449 u0 = _mm_unpacklo_epi8(u0, z);
3450 v0 = _mm_unpacklo_epi8(v0, z);
3451 u1 = _mm_unpacklo_epi8(u1, z);
3452 v1 = _mm_unpacklo_epi8(v1, z);
3453 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3454 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3455 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3456 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3457 u0 = _mm_slli_si128(u0, 4);
3458 u0 = _mm_packs_epi32(u0, u1);
3459 u0 = _mm_packus_epi16(u0, u0);
3460 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
3465 for( ; x <= width - 4; x += 4, D += 16 )
3467 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
3468 __m128i u0, v0, u1, v1;
3470 xy0 = _mm_madd_epi16( xy0, xy2ofs );
3471 _mm_store_si128( (__m128i*)iofs0, xy0 );
3472 const __m128i *w0, *w1;
3473 w0 = (const __m128i*)(wtab + FXY[x]*16);
3474 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
3476 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
3477 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
3478 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
3479 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
3480 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
3481 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
3482 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
3483 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
3484 u0 = _mm_unpacklo_epi8(u0, z);
3485 v0 = _mm_unpacklo_epi8(v0, z);
3486 u1 = _mm_unpacklo_epi8(u1, z);
3487 v1 = _mm_unpacklo_epi8(v1, z);
3488 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3489 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3490 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3491 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3492 u0 = _mm_packs_epi32(u0, u1);
3493 u0 = _mm_packus_epi16(u0, u0);
3494 _mm_storel_epi64((__m128i*)D, u0);
3496 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
3497 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
3499 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
3500 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
3501 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
3502 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
3503 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
3504 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
3505 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
3506 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
3507 u0 = _mm_unpacklo_epi8(u0, z);
3508 v0 = _mm_unpacklo_epi8(v0, z);
3509 u1 = _mm_unpacklo_epi8(u1, z);
3510 v1 = _mm_unpacklo_epi8(v1, z);
3511 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
3512 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
3513 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
3514 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
3515 u0 = _mm_packs_epi32(u0, u1);
3516 u0 = _mm_packus_epi16(u0, u0);
3517 _mm_storel_epi64((__m128i*)(D + 8), u0);
// Fallback when the SSE2 vectorized path is not compiled in: the scalar
// RemapNoVec (which does no work and returns 0 processed pixels) is used
// as the VecOp for 8-bit bilinear remap.
3527 typedef RemapNoVec RemapVec_8u;
// Bilinear remap kernel. For every destination pixel it reads the 2x2 source
// neighborhood addressed by the integer map _xy and blends it with the 4-tap
// weight row selected by the fractional index _fxy out of the precomputed
// table _wtab. CastOp converts the weighted sum back to the image element
// type T, VecOp supplies an optional SIMD fast path for the inlier spans,
// and AT is the weight element type (short for fixed-point, float otherwise).
// Out-of-range samples are resolved per borderType / _borderValue.
3532 template<class CastOp, class VecOp, typename AT>
3533 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3534 const Mat& _fxy, const void* _wtab,
3535 int borderType, const Scalar& _borderValue )
3537 typedef typename CastOp::rtype T;
3538 typedef typename CastOp::type1 WT;
3539 Size ssize = _src.size(), dsize = _dst.size();
3540 int cn = _src.channels();
3541 const AT* wtab = (const AT*)_wtab;
3542 const T* S0 = _src.ptr<T>();
// sstep is in elements of T, not bytes (step is divided by the element size).
3543 size_t sstep = _src.step/sizeof(S0[0]);
// Border color saturated to the image type; only the first cn entries are used.
3544 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3545 saturate_cast<T>(_borderValue[1]),
3546 saturate_cast<T>(_borderValue[2]),
3547 saturate_cast<T>(_borderValue[3]));
// width1/height1 bound the "inlier" region where the full 2x2 neighborhood
// is guaranteed to be inside the image (unsigned compare handles sx<0 too).
3552 unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3553 CV_Assert( cn <= 4 && ssize.area() > 0 );
// The 8UC3 SIMD path reads 4 bytes per pixel, so it needs one extra column
// of slack; shrink the inlier region accordingly.
3555 if( _src.type() == CV_8UC3 )
3556 width1 = std::max(ssize.width-2, 0);
3559 for( dy = 0; dy < dsize.height; dy++ )
3561 T* D = _dst.ptr<T>(dy);
3562 const short* XY = _xy.ptr<short>(dy);
3563 const ushort* FXY = _fxy.ptr<ushort>(dy);
3565 bool prevInlier = false;
// Scan one past the row end so the final run is flushed; each run of
// consecutive inlier (or outlier) pixels is processed as a span.
3567 for( dx = 0; dx <= dsize.width; dx++ )
3569 bool curInlier = dx < dsize.width ?
3570 (unsigned)XY[dx*2] < width1 &&
3571 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3572 if( curInlier == prevInlier )
3578 prevInlier = curInlier;
// Inlier span: let the SIMD VecOp consume what it can, then finish
// the tail with scalar per-channel code below.
3582 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
// cn == 1 scalar tail: single 4-tap blend per pixel.
3588 for( ; dx < X1; dx++, D++ )
3590 int sx = XY[dx*2], sy = XY[dx*2+1];
// Each table row holds the 4 bilinear weights for one fractional offset.
3591 const AT* w = wtab + FXY[dx]*4;
3592 const T* S = S0 + sy*sstep + sx;
3593 *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
// cn == 2 scalar tail (interleaved channel pairs).
3597 for( ; dx < X1; dx++, D += 2 )
3599 int sx = XY[dx*2], sy = XY[dx*2+1];
3600 const AT* w = wtab + FXY[dx]*4;
3601 const T* S = S0 + sy*sstep + sx*2;
3602 WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3603 WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3604 D[0] = castOp(t0); D[1] = castOp(t1);
// cn == 3 scalar tail.
3607 for( ; dx < X1; dx++, D += 3 )
3609 int sx = XY[dx*2], sy = XY[dx*2+1];
3610 const AT* w = wtab + FXY[dx]*4;
3611 const T* S = S0 + sy*sstep + sx*3;
3612 WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3613 WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3614 WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3615 D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
// cn == 4 scalar tail.
3618 for( ; dx < X1; dx++, D += 4 )
3620 int sx = XY[dx*2], sy = XY[dx*2+1];
3621 const AT* w = wtab + FXY[dx]*4;
3622 const T* S = S0 + sy*sstep + sx*4;
3623 WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3624 WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3625 D[0] = castOp(t0); D[1] = castOp(t1);
3626 t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3627 t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3628 D[2] = castOp(t0); D[3] = castOp(t1);
// Outlier span: transparent border leaves destination pixels untouched
// (except the cn == 3 special case handled below).
3633 if( borderType == BORDER_TRANSPARENT && cn != 3 )
// Single-channel outlier handling.
3641 for( ; dx < X1; dx++, D++ )
3643 int sx = XY[dx*2], sy = XY[dx*2+1];
// Entirely outside the image with a constant border: emit cval.
3644 if( borderType == BORDER_CONSTANT &&
3645 (sx >= ssize.width || sx+1 < 0 ||
3646 sy >= ssize.height || sy+1 < 0) )
3652 int sx0, sx1, sy0, sy1;
3654 const AT* w = wtab + FXY[dx]*4;
3655 if( borderType == BORDER_REPLICATE )
// Replicate: just clamp the 2x2 sample coordinates into the image.
3657 sx0 = clip(sx, 0, ssize.width);
3658 sx1 = clip(sx+1, 0, ssize.width);
3659 sy0 = clip(sy, 0, ssize.height);
3660 sy1 = clip(sy+1, 0, ssize.height);
3661 v0 = S0[sy0*sstep + sx0];
3662 v1 = S0[sy0*sstep + sx1];
3663 v2 = S0[sy1*sstep + sx0];
3664 v3 = S0[sy1*sstep + sx1];
// Other border modes: remap each coordinate; a negative result means
// "constant border pixel" (only possible with BORDER_CONSTANT).
3668 sx0 = borderInterpolate(sx, ssize.width, borderType);
3669 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3670 sy0 = borderInterpolate(sy, ssize.height, borderType);
3671 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3672 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3673 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3674 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
3675 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
3677 D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
// Multi-channel outlier handling: same logic, pointers per corner.
3681 for( ; dx < X1; dx++, D += cn )
3683 int sx = XY[dx*2], sy = XY[dx*2+1], k;
3684 if( borderType == BORDER_CONSTANT &&
3685 (sx >= ssize.width || sx+1 < 0 ||
3686 sy >= ssize.height || sy+1 < 0) )
3688 for( k = 0; k < cn; k++ )
3693 int sx0, sx1, sy0, sy1;
3694 const T *v0, *v1, *v2, *v3;
3695 const AT* w = wtab + FXY[dx]*4;
3696 if( borderType == BORDER_REPLICATE )
3698 sx0 = clip(sx, 0, ssize.width);
3699 sx1 = clip(sx+1, 0, ssize.width);
3700 sy0 = clip(sy, 0, ssize.height);
3701 sy1 = clip(sy+1, 0, ssize.height);
3702 v0 = S0 + sy0*sstep + sx0*cn;
3703 v1 = S0 + sy0*sstep + sx1*cn;
3704 v2 = S0 + sy1*sstep + sx0*cn;
3705 v3 = S0 + sy1*sstep + sx1*cn;
// Transparent + partially outside: skip the pixel entirely.
3707 else if( borderType == BORDER_TRANSPARENT &&
3708 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
3709 (unsigned)sy >= (unsigned)(ssize.height-1)))
3713 sx0 = borderInterpolate(sx, ssize.width, borderType);
3714 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3715 sy0 = borderInterpolate(sy, ssize.height, borderType);
3716 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
// Negative coordinate => constant-border color for that corner.
3717 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
3718 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
3719 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
3720 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
3722 for( k = 0; k < cn; k++ )
3723 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
// Bicubic remap kernel: a 4x4 source neighborhood is blended with the 16-tap
// weight row selected by _fxy (each table entry occupies 16 AT values).
// ONE is the fixed-point representation of weight-sum unity
// (INTER_REMAP_COEF_SCALE for the 8-bit path, 1 for the float paths); it is
// used in the border path so that constant-border contributions are exact.
3732 template<class CastOp, typename AT, int ONE>
3733 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
3734 const Mat& _fxy, const void* _wtab,
3735 int borderType, const Scalar& _borderValue )
3737 typedef typename CastOp::rtype T;
3738 typedef typename CastOp::type1 WT;
3739 Size ssize = _src.size(), dsize = _dst.size();
3740 int cn = _src.channels();
3741 const AT* wtab = (const AT*)_wtab;
3742 const T* S0 = _src.ptr<T>();
// Element (not byte) stride of the source rows.
3743 size_t sstep = _src.step/sizeof(S0[0]);
3744 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3745 saturate_cast<T>(_borderValue[1]),
3746 saturate_cast<T>(_borderValue[2]),
3747 saturate_cast<T>(_borderValue[3]));
// Transparent border still needs valid sample coords for partially-inside
// pixels, so fall back to REFLECT_101 for coordinate remapping.
3750 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
// Inlier region where the full 4x4 window (top-left at sx,sy) fits inside.
3752 unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
// If all three planes are continuous, process them as one long row.
3754 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3756 dsize.width *= dsize.height;
3760 for( dy = 0; dy < dsize.height; dy++ )
3762 T* D = _dst.ptr<T>(dy);
3763 const short* XY = _xy.ptr<short>(dy);
3764 const ushort* FXY = _fxy.ptr<ushort>(dy);
3766 for( dx = 0; dx < dsize.width; dx++, D += cn )
// Shift by -1: XY addresses the interpolation center, the window's
// top-left corner is one pixel up-left of it.
3768 int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
3769 const AT* w = wtab + FXY[dx]*16;
3771 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
// Fast path: 4 rows x 4 taps, fully inside the image.
3773 const T* S = S0 + sy*sstep + sx*cn;
3774 for( k = 0; k < cn; k++ )
3776 WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
3778 sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
3780 sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
3782 sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
// Border path. Transparent border: leave the pixel untouched when the
// interpolation center falls outside the image.
3790 if( borderType == BORDER_TRANSPARENT &&
3791 ((unsigned)(sx+1) >= (unsigned)ssize.width ||
3792 (unsigned)(sy+1) >= (unsigned)ssize.height) )
// Entire 4x4 window outside with a constant border: emit cval directly.
3795 if( borderType1 == BORDER_CONSTANT &&
3796 (sx >= ssize.width || sx+4 <= 0 ||
3797 sy >= ssize.height || sy+4 <= 0))
3799 for( k = 0; k < cn; k++ )
// Remap each of the 4 window columns/rows through the border rule;
// negative results denote the constant-border color.
3804 for( i = 0; i < 4; i++ )
3806 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3807 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
// Accumulate as cv*ONE + sum((S - cv)*w): taps that land on the constant
// border contribute exactly cv, since the weights sum to ONE.
3810 for( k = 0; k < cn; k++, S0++, w -= 16 )
3812 WT cv = cval[k], sum = cv*ONE;
3813 for( i = 0; i < 4; i++, w += 4 )
3816 const T* S = S0 + yi*sstep;
3820 sum += (S[x[0]] - cv)*w[0];
3822 sum += (S[x[1]] - cv)*w[1];
3824 sum += (S[x[2]] - cv)*w[2];
3826 sum += (S[x[3]] - cv)*w[3];
// Lanczos-4 remap kernel: 8x8 source neighborhood blended with the 64-tap
// weight row selected by _fxy (each table entry occupies 64 AT values).
// Same structure as remapBicubic, with a 4x larger window; ONE is the
// fixed-point unity of the weight sum, used by the border path.
3837 template<class CastOp, typename AT, int ONE>
3838 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
3839 const Mat& _fxy, const void* _wtab,
3840 int borderType, const Scalar& _borderValue )
3842 typedef typename CastOp::rtype T;
3843 typedef typename CastOp::type1 WT;
3844 Size ssize = _src.size(), dsize = _dst.size();
3845 int cn = _src.channels();
3846 const AT* wtab = (const AT*)_wtab;
3847 const T* S0 = _src.ptr<T>();
3848 size_t sstep = _src.step/sizeof(S0[0]);
3849 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
3850 saturate_cast<T>(_borderValue[1]),
3851 saturate_cast<T>(_borderValue[2]),
3852 saturate_cast<T>(_borderValue[3]));
// Transparent border reuses REFLECT_101 for coordinate remapping.
3855 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
// Inlier region where the full 8x8 window fits inside the image.
3857 unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
// Continuous planes can be flattened into a single long row.
3859 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
3861 dsize.width *= dsize.height;
3865 for( dy = 0; dy < dsize.height; dy++ )
3867 T* D = _dst.ptr<T>(dy);
3868 const short* XY = _xy.ptr<short>(dy);
3869 const ushort* FXY = _fxy.ptr<ushort>(dy);
3871 for( dx = 0; dx < dsize.width; dx++, D += cn )
// The window's top-left corner is 3 pixels up-left of the map center.
3873 int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
3874 const AT* w = wtab + FXY[dx]*64;
3875 const T* S = S0 + sy*sstep + sx*cn;
3877 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
// Fast path: 8 rows x 8 taps, fully inside the image.
3879 for( k = 0; k < cn; k++ )
3882 for( int r = 0; r < 8; r++, S += sstep, w += 8 )
3883 sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
3884 S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
// Transparent border: skip pixels whose interpolation center is outside.
3893 if( borderType == BORDER_TRANSPARENT &&
3894 ((unsigned)(sx+3) >= (unsigned)ssize.width ||
3895 (unsigned)(sy+3) >= (unsigned)ssize.height) )
// Whole 8x8 window outside with a constant border: emit cval directly.
3898 if( borderType1 == BORDER_CONSTANT &&
3899 (sx >= ssize.width || sx+8 <= 0 ||
3900 sy >= ssize.height || sy+8 <= 0))
3902 for( k = 0; k < cn; k++ )
// Remap each window coordinate; negative means constant-border color.
3907 for( i = 0; i < 8; i++ )
3909 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
3910 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
// Accumulate as cv*ONE + sum((S - cv)*w), so border taps contribute cv
// exactly (the 64 weights sum to ONE).
3913 for( k = 0; k < cn; k++, S0++, w -= 64 )
3915 WT cv = cval[k], sum = cv*ONE;
3916 for( i = 0; i < 8; i++, w += 8 )
3919 const T* S1 = S0 + yi*sstep;
3923 sum += (S1[x[0]] - cv)*w[0];
3925 sum += (S1[x[1]] - cv)*w[1];
3927 sum += (S1[x[2]] - cv)*w[2];
3929 sum += (S1[x[3]] - cv)*w[3];
3931 sum += (S1[x[4]] - cv)*w[4];
3933 sum += (S1[x[5]] - cv)*w[5];
3935 sum += (S1[x[6]] - cv)*w[6];
3937 sum += (S1[x[7]] - cv)*w[7];
// Function-pointer type for nearest-neighbor remap kernels: integer map only,
// no fractional weights.
3948 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3949 int borderType, const Scalar& _borderValue );
// Function-pointer type for interpolating remap kernels (bilinear/bicubic/
// lanczos4): integer map _xy plus fractional-offset indices _fxy into the
// precomputed weight table _wtab.
3951 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
3952 const Mat& _fxy, const void* _wtab,
3953 int borderType, const Scalar& _borderValue);
// Parallel body for cv::remap: each worker converts its strip of the map(s)
// into the internal CV_16SC2 (+CV_16UC1 fractional) representation in a
// tile-sized scratch buffer, then calls the selected kernel (nnfunc for
// nearest-neighbor, otherwise ifunc) on each tile.
3955 class RemapInvoker :
3956 public ParallelLoopBody
// All source/destination/map references are stored by pointer; planar_input
// distinguishes the two-plane CV_32FC1 map layout from interleaved CV_32FC2.
3959 RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
3960 const Mat *_m2, int _borderType, const Scalar &_borderValue,
3961 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
3962 ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
3963 borderType(_borderType), borderValue(_borderValue),
3964 planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
3968 virtual void operator() (const Range& range) const
// Tile the assigned row range so the scratch buffers stay ~16K entries.
3971 const int buf_size = 1 << 14;
3972 int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
3973 int bcols0 = std::min(buf_size/brows0, dst->cols);
3974 brows0 = std::min(buf_size/bcols0, dst->rows);
3976 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// _bufxy holds integer coordinates, _bufa the fractional-table indices.
3979 Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
3981 _bufa.create(brows0, bcols0, CV_16UC1);
3983 for( y = range.start; y < range.end; y += brows0 )
3985 for( x = 0; x < dst->cols; x += bcols0 )
3987 int brows = std::min(brows0, range.end - y);
3988 int bcols = std::min(bcols0, dst->cols - x);
3989 Mat dpart(*dst, Rect(x, y, bcols, brows));
3990 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
// Nearest-neighbor branch: only integer coordinates are needed.
3994 if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
3995 bufxy = (*m1)(Rect(x, y, bcols, brows));
3996 else if( map_depth != CV_32F )
// Fixed-point maps: round the 16SC2+16UC1 pair to the nearest integer
// pixel via the precomputed NNDeltaTab_i rounding deltas.
3998 for( y1 = 0; y1 < brows; y1++ )
4000 short* XY = bufxy.ptr<short>(y1);
4001 const short* sXY = m1->ptr<short>(y+y1) + x*2;
4002 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4004 for( x1 = 0; x1 < bcols; x1++ )
4006 int a = sA[x1] & (INTER_TAB_SIZE2-1);
4007 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
4008 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
// Interleaved CV_32FC2 map: a saturating convertTo does the rounding.
4012 else if( !planar_input )
4013 (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
// Planar CV_32FC1 x/y maps: convert and interleave manually.
4016 for( y1 = 0; y1 < brows; y1++ )
4018 short* XY = bufxy.ptr<short>(y1);
4019 const float* sX = m1->ptr<float>(y+y1) + x;
4020 const float* sY = m2->ptr<float>(y+y1) + x;
// SSE2: convert 8 x/y pairs at a time and interleave via pack/unpack.
4026 for( ; x1 <= bcols - 8; x1 += 8 )
4028 __m128 fx0 = _mm_loadu_ps(sX + x1);
4029 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4030 __m128 fy0 = _mm_loadu_ps(sY + x1);
4031 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4032 __m128i ix0 = _mm_cvtps_epi32(fx0);
4033 __m128i ix1 = _mm_cvtps_epi32(fx1);
4034 __m128i iy0 = _mm_cvtps_epi32(fy0);
4035 __m128i iy1 = _mm_cvtps_epi32(fy1);
4036 ix0 = _mm_packs_epi32(ix0, ix1);
4037 iy0 = _mm_packs_epi32(iy0, iy1);
4038 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4039 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4040 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4041 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
// Scalar tail of the planar conversion.
4046 for( ; x1 < bcols; x1++ )
4048 XY[x1*2] = saturate_cast<short>(sX[x1]);
4049 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
4053 nnfunc( *src, dpart, bufxy, borderType, borderValue );
// Interpolating branch: also build the fractional-index buffer bufa.
4057 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
4058 for( y1 = 0; y1 < brows; y1++ )
4060 short* XY = bufxy.ptr<short>(y1);
4061 ushort* A = bufa.ptr<ushort>(y1);
// Maps already in internal fixed-point form: just mask the indices.
4063 if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
4065 bufxy = (*m1)(Rect(x, y, bcols, brows));
4067 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
// NEON masking of 8 indices at once.
4071 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
4072 for ( ; x1 <= bcols - 8; x1 += 8)
4073 vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
4076 for( ; x1 < bcols; x1++ )
4077 A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
// Planar float maps: scale by INTER_TAB_SIZE, split integer part into
// XY and fractional part (packed y*TAB+x) into A.
4079 else if( planar_input )
4081 const float* sX = m1->ptr<float>(y+y1) + x;
4082 const float* sY = m2->ptr<float>(y+y1) + x;
// SSE2 vectorized conversion, 8 pixels per iteration.
4088 __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
4089 __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
4090 for( ; x1 <= bcols - 8; x1 += 8 )
4092 __m128 fx0 = _mm_loadu_ps(sX + x1);
4093 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4094 __m128 fy0 = _mm_loadu_ps(sY + x1);
4095 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4096 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
4097 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
4098 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
4099 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
// Low INTER_BITS bits form the fractional table index...
4100 __m128i mx0 = _mm_and_si128(ix0, mask);
4101 __m128i mx1 = _mm_and_si128(ix1, mask);
4102 __m128i my0 = _mm_and_si128(iy0, mask);
4103 __m128i my1 = _mm_and_si128(iy1, mask);
4104 mx0 = _mm_packs_epi32(mx0, mx1);
4105 my0 = _mm_packs_epi32(my0, my1);
4106 my0 = _mm_slli_epi16(my0, INTER_BITS);
4107 mx0 = _mm_or_si128(mx0, my0);
4108 _mm_storeu_si128((__m128i*)(A + x1), mx0);
// ...high bits are the integer pixel coordinates.
4109 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
4110 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
4111 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
4112 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
4113 ix0 = _mm_packs_epi32(ix0, ix1);
4114 iy0 = _mm_packs_epi32(iy0, iy1);
4115 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4116 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4117 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4118 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
// NEON variant of the same planar conversion, 4 pixels at a time.
4122 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
4123 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4125 for( ; x1 <= bcols - 4; x1 += 4 )
4127 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
4128 v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
4129 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4130 vandq_s32(v_sy, v_scale2));
4131 vst1_u16(A + x1, vqmovun_s32(v_v));
4133 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4134 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4135 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
// Scalar tail of the planar conversion.
4139 for( ; x1 < bcols; x1++ )
4141 int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
4142 int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
4143 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4144 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4145 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
// Interleaved CV_32FC2 map: same split, reading x/y pairs together.
4151 const float* sXY = m1->ptr<float>(y+y1) + x*2;
// NEON de-interleaving load (vld2q) then the same integer/fraction split.
4155 float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
4156 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4158 for( ; x1 <= bcols - 4; x1 += 4 )
4160 float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
4161 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
4162 int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
4163 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4164 vandq_s32(v_sy, v_scale2));
4165 vst1_u16(A + x1, vqmovun_s32(v_v));
4167 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4168 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4169 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
// Scalar path for the interleaved map.
4173 for( x1 = 0; x1 < bcols; x1++ )
4175 int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
4176 int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
4177 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4178 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4179 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
// Run the interpolating kernel on this tile with the prepared buffers.
4184 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
// OpenCL implementation of remap. Returns false to fall back to the CPU path
// for unsupported configurations (transparent border, non-NN/linear
// interpolation, or CV_16SC1 secondary maps). Picks a kernel variant by map
// layout and passes interpolation/border/type information as build options.
4203 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
4204 int interpolation, int borderType, const Scalar& borderValue)
4206 const ocl::Device & dev = ocl::Device::getDefault();
4207 int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
4208 rowsPerWI = dev.isIntel() ? 4 : 1;
// Unsupported on this path: bail out so the caller uses the CPU kernels.
4210 if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
4211 || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
4214 UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
// Normalize argument order so map1 always holds the CV_16SC2 plane.
4216 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
4217 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
4219 if (map1.type() != CV_16SC2)
4220 std::swap(map1, map2);
4223 CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4225 _dst.create(map1.size(), type);
4226 UMat dst = _dst.getUMat();
// Kernel name encodes the map layout variant compiled in remap.cl.
4228 String kernelName = "remap";
4229 if (map1.type() == CV_32FC2 && map2.empty())
4230 kernelName += "_32FC2";
4231 else if (map1.type() == CV_16SC2)
4233 kernelName += "_16SC2";
4235 kernelName += "_16UC1";
4237 else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
4238 kernelName += "_2_32FC1";
4240 CV_Error(Error::StsBadArg, "Unsupported map types");
// Indexed by the interpolation/border enum values passed in.
4242 static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4243 static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4244 "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4245 String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4246 interMap[interpolation], borderMap[borderType],
4247 ocl::typeToStr(type), rowsPerWI);
// Interpolating kernels need a wider working type and conversion macros.
4249 if (interpolation != INTER_NEAREST)
4252 int wdepth = std::max(CV_32F, depth);
4253 buildOptions = buildOptions
4254 + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4255 " -D convertToWT2=%s -D WT2=%s",
4256 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4257 ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4258 ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4259 ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4260 ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
// 3-channel scalars are padded to 4 elements for the border value.
4262 int scalarcn = cn == 3 ? 4 : cn;
4263 int sctype = CV_MAKETYPE(depth, scalarcn);
4264 buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4265 ocl::typeToStr(type), ocl::typeToStr(depth),
4266 cn, ocl::typeToStr(sctype), depth);
4268 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
// Border value marshalled as a tiny Mat and passed as a constant argument.
4270 Mat scalar(1, 1, sctype, borderValue);
4271 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4272 map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4273 scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
4276 k.args(srcarg, dstarg, map1arg, scalararg);
4278 k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
// One work item covers rowsPerWI rows; round the row count up accordingly.
4280 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4281 return k.run(2, globalThreads, NULL, false);
// NOTE: the trailing "&& 0" disables this whole IPP block at compile time;
// it is kept for reference / possible re-enabling.
4286 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
// Signature shared by the ippiRemap_<type> family (8u/16u/32f, C1/C3/C4).
4288 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4289 const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4290 void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
// Parallel body that delegates row ranges of the destination to an IPP
// remap function; *ok is cleared on any failure so the caller can fall back.
4292 class IPPRemapInvoker :
4293 public ParallelLoopBody
4296 IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4297 int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4298 ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4299 ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
4304 virtual void operator() (const Range & range) const
4306 IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4307 Mat dstRoi = dst.rowRange(range);
4308 IppiSize dstRoiSize = ippiSize(dstRoi.size());
4309 int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// Constant border: prefill the destination strip with the border color.
4311 if (borderType == BORDER_CONSTANT &&
4312 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
// Negative IPP status means failure; signal fallback via *ok.
4318 if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4319 map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4320 dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
4324 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4329 Mat & src, & dst, & map1, & map2;
4331 int ippInterpolation, borderType;
// Public remap entry point: dispatches to the OpenCL path when possible,
// otherwise selects the per-depth CPU kernel (nearest/bilinear/bicubic/
// lanczos4), normalizes the map representation, and runs the work in
// parallel via RemapInvoker.
4340 void cv::remap( InputArray _src, OutputArray _dst,
4341 InputArray _map1, InputArray _map2,
4342 int interpolation, int borderType, const Scalar& borderValue )
// Kernel tables indexed by source depth (CV_8U..CV_64F); 0 entries mark
// unsupported depths and are guarded by the asserts below.
4344 static RemapNNFunc nn_tab[] =
4346 remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4347 remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4350 static RemapFunc linear_tab[] =
// Only the 8U bilinear path has a SIMD VecOp (RemapVec_8u) and uses
// fixed-point short weights; other depths use float weights.
4352 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4353 remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4354 remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4355 remapBilinear<Cast<float, float>, RemapNoVec, float>,
4356 remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4359 static RemapFunc cubic_tab[] =
4361 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4362 remapBicubic<Cast<float, ushort>, float, 1>,
4363 remapBicubic<Cast<float, short>, float, 1>, 0,
4364 remapBicubic<Cast<float, float>, float, 1>,
4365 remapBicubic<Cast<double, double>, float, 1>, 0
4368 static RemapFunc lanczos4_tab[] =
4370 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4371 remapLanczos4<Cast<float, ushort>, float, 1>,
4372 remapLanczos4<Cast<float, short>, float, 1>, 0,
4373 remapLanczos4<Cast<float, float>, float, 1>,
4374 remapLanczos4<Cast<double, double>, float, 1>, 0
4377 CV_Assert( _map1.size().area() > 0 );
4378 CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
// Try the OpenCL implementation first when the destination is a UMat.
4380 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4381 ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4383 Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
// Destination geometry comes from the map, element type from the source.
4384 _dst.create( map1.size(), src.type() );
4385 Mat dst = _dst.getMat();
// In-place remap is not supported; the kernels read while writing.
4386 if( dst.data == src.data )
// INTER_AREA has no dedicated remap kernel; treat it as linear.
4389 if( interpolation == INTER_AREA )
4390 interpolation = INTER_LINEAR;
4392 int type = src.type(), depth = CV_MAT_DEPTH(type);
// Disabled IPP acceleration branch (note the "&& 0" in the guard).
4394 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
4397 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4398 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4399 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4401 int ippInterpolation =
4402 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4403 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
4406 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4407 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4408 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4409 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4410 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4411 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4412 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4413 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4414 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4419 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4420 borderType, borderValue, &ok);
4421 Range range(0, dst.rows);
4422 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4426 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4429 setIppErrorStatus();
4435 RemapNNFunc nnfunc = 0;
4436 RemapFunc ifunc = 0;
4437 const void* ctab = 0;
// 8-bit images use the fixed-point weight tables.
4438 bool fixpt = depth == CV_8U;
4439 bool planar_input = false;
4441 if( interpolation == INTER_NEAREST )
4443 nnfunc = nn_tab[depth];
4444 CV_Assert( nnfunc != 0 );
4448 if( interpolation == INTER_LINEAR )
4449 ifunc = linear_tab[depth];
4450 else if( interpolation == INTER_CUBIC )
4451 ifunc = cubic_tab[depth];
4452 else if( interpolation == INTER_LANCZOS4 )
4453 ifunc = lanczos4_tab[depth];
4455 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4456 CV_Assert( ifunc != 0 );
// Precomputed 2D interpolation weight table for the selected method.
4457 ctab = initInterTab2D( interpolation, fixpt );
4460 const Mat *m1 = &map1, *m2 = &map2;
// Normalize map order so m1 is always the CV_16SC2 coordinate plane when
// the fixed-point representation is supplied.
4462 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4463 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4465 if( map1.type() != CV_16SC2 )
4470 CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4471 (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
// Two single-channel float maps => planar x/y layout.
4472 planar_input = map1.channels() == 1;
4475 RemapInvoker invoker(src, dst, m1, m2,
4476 borderType, borderValue, planar_input, nnfunc, ifunc,
// Work is split by destination rows; the cost hint scales the chunking.
4478 parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
// Converts remap() coordinate maps between the supported representations:
// absolute float maps (a CV_32FC1 pair, or one CV_32FC2 map) and the faster
// fixed-point form (CV_16SC2 integer coordinates + CV_16UC1 index into the
// INTER_TAB_SIZE x INTER_TAB_SIZE interpolation table).
// NOTE(review): the embedded original line numbers are non-contiguous here
// (e.g. 4485, 4490, 4495 are absent), so braces and statements have been
// elided from this copy; restore the block from upstream before compiling.
4482 void cv::convertMaps( InputArray _map1, InputArray _map2,
4483 OutputArray _dstmap1, OutputArray _dstmap2,
4484 int dstm1type, bool nninterpolate )
4486 Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4487 Size size = map1.size();
4488 const Mat *m1 = &map1, *m2 = &map2;
4489 int m1type = m1->type(), m2type = m2->type();
// Accept only the documented map-type pairings.
4491 CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4492 (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4493 (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4494 (m1type == CV_32FC2 && m2->empty()) );
// Normalize so that m1 always points at the CV_16SC2 map when one is present.
4496 if( m2type == CV_16SC2 )
4498 std::swap( m1, m2 );
4499 std::swap( m1type, m2type );
// dstm1type <= 0 means "pick the opposite representation of the input".
4502 if( dstm1type <= 0 )
4503 dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4504 CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4505 _dstmap1.create( size, dstm1type );
4506 dstmap1 = _dstmap1.getMat();
// The secondary map is only needed for interpolated (non-NN) fixed-point
// or planar-float output.
4508 if( !nninterpolate && dstm1type != CV_32FC2 )
4510 _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4511 dstmap2 = _dstmap2.getMat();
// Fast path: same layout (or NN-mode 2-channel <-> 2-channel) — a plain
// convertTo/copyTo suffices, no per-pixel re-encoding.
4516 if( m1type == dstm1type || (nninterpolate &&
4517 ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4518 (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4520 m1->convertTo( dstmap1, dstmap1.type() );
4521 if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4522 m2->copyTo( dstmap2 );
// Planar float -> interleaved float is a simple channel merge.
4526 if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
4528 Mat vdata[] = { *m1, *m2 };
4529 merge( vdata, 2, dstmap1 );
// Interleaved float -> planar float (the split() call itself was elided).
4533 if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
4535 Mat mv[] = { dstmap1, dstmap2 };
// If all maps are continuous, treat the whole image as one long row.
4540 if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
4541 dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
4543 size.width *= size.height;
4547 const float scale = 1.f/INTER_TAB_SIZE;
4549 for( y = 0; y < size.height; y++ )
// Aliased row pointers: the same bytes viewed as float or as fixed-point.
4551 const float* src1f = m1->ptr<float>(y);
4552 const float* src2f = m2->ptr<float>(y);
4553 const short* src1 = (const short*)src1f;
4554 const ushort* src2 = (const ushort*)src2f;
4556 float* dst1f = dstmap1.ptr<float>(y);
4557 float* dst2f = dstmap2.ptr<float>(y);
4558 short* dst1 = (short*)dst1f;
4559 ushort* dst2 = (ushort*)dst2f;
// Planar float -> fixed point.
4561 if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
// NN variant: round the coordinates; no fractional table index is needed.
4564 for( x = 0; x < size.width; x++ )
4566 dst1[x*2] = saturate_cast<short>(src1f[x]);
4567 dst1[x*2+1] = saturate_cast<short>(src2f[x]);
// Interpolated variant: split each coordinate into an integer part and a
// packed (fy*INTER_TAB_SIZE + fx) index into the interpolation table.
4570 for( x = 0; x < size.width; x++ )
4572 int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
4573 int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
4574 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4575 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4576 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
// Interleaved float -> fixed point (same math, interleaved source layout).
4579 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
4582 for( x = 0; x < size.width; x++ )
4584 dst1[x*2] = saturate_cast<short>(src1f[x*2]);
4585 dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
4588 for( x = 0; x < size.width; x++ )
4590 int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
4591 int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
4592 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
4593 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
4594 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
// Fixed point -> planar float: add back the fractional part; src2 may be
// absent (NN-encoded input), in which case the fraction is zero.
4597 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
4599 for( x = 0; x < size.width; x++ )
4601 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
4602 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4603 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
// Fixed point -> interleaved float.
4606 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
4608 for( x = 0; x < size.width; x++ )
4610 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
4611 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
4612 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
4616 CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
// ParallelLoopBody that computes warpAffine over a horizontal band of rows.
// It converts the affine transform into fixed-point per-block coordinate maps
// (XY = integer coords, A = interpolation-table indices) and delegates the
// actual resampling to remap().
// NOTE(review): embedded original line numbers are non-contiguous — braces,
// the #if CV_SSE2/CV_NEON guards, and some members (M, borderValue) have been
// elided from this copy; restore from upstream before compiling.
4624 class WarpAffineInvoker :
4625 public ParallelLoopBody
4628 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
4629 const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
4630 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
4631 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
4636 virtual void operator() (const Range& range) const
4638 const int BLOCK_SZ = 64;
// Stack-allocated per-block maps: XY holds interleaved (x,y) shorts,
// A holds the packed fractional interpolation indices.
4639 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
// AB_BITS fixed-point precision for the precomputed adelta/bdelta tables.
4640 const int AB_BITS = MAX(10, (int)INTER_BITS);
4641 const int AB_SCALE = 1 << AB_BITS;
// round_delta implements round-to-nearest for the chosen interpolation mode.
4642 int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
4644 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// Choose a block shape of ~BLOCK_SZ^2 pixels that fits the destination.
4647 int bh0 = std::min(BLOCK_SZ/2, dst.rows);
4648 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
4649 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
4651 for( y = range.start; y < range.end; y += bh0 )
4653 for( x = 0; x < dst.cols; x += bw0 )
4655 int bw = std::min( bw0, dst.cols - x);
4656 int bh = std::min( bh0, range.end - y);
// Wrap the stack buffers as Mat headers for the remap() call below.
4658 Mat _XY(bh, bw, CV_16SC2, XY), matA;
4659 Mat dpart(dst, Rect(x, y, bw, bh));
4661 for( y1 = 0; y1 < bh; y1++ )
4663 short* xy = XY + y1*bw*2;
// Row base coordinates in AB_BITS fixed point; the per-column terms
// M[0]*x and M[3]*x are precomputed in adelta/bdelta.
4664 int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
4665 int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
4667 if( interpolation == INTER_NEAREST )
// NEON path (guard elided): 4 pixels per iteration.
4671 int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
4672 for( ; x1 <= bw - 4; x1 += 4 )
4674 int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
4675 int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
4677 vst1q_s16(xy + (x1 << 1), vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS)),
4678 vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS))));
// Scalar tail for INTER_NEAREST: whole-pixel coordinates only.
4681 for( ; x1 < bw; x1++ )
4683 int X = (X0 + adelta[x+x1]) >> AB_BITS;
4684 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
4685 xy[x1*2] = saturate_cast<short>(X);
4686 xy[x1*2+1] = saturate_cast<short>(Y);
// Interpolated modes also fill the fractional-index row `alpha`.
4691 short* alpha = A + y1*bw;
// SSE2 path (guard elided): 8 pixels per iteration.
4696 __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
4697 __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
4698 for( ; x1 <= bw - 8; x1 += 8 )
4700 __m128i tx0, tx1, ty0, ty1;
4701 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
4702 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
4703 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
4704 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
// Keep INTER_BITS fractional bits after the fixed-point add.
4706 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
4707 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
4708 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
4709 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
// Separate fractional parts (fx_, fy_) from integer parts (tx0, ty0).
4711 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
4712 _mm_and_si128(tx1, fxy_mask));
4713 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
4714 _mm_and_si128(ty1, fxy_mask));
4715 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
4716 _mm_srai_epi32(tx1, INTER_BITS));
4717 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
4718 _mm_srai_epi32(ty1, INTER_BITS));
// Pack (fy << INTER_BITS) | fx into one table index per pixel.
4719 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
4721 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
4722 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
4723 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
// Disabled NEON equivalent of the SSE2 loop above (kept for reference).
4727 // int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
4728 // for( ; x1 <= bw - 4; x1 += 4 )
4730 // int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
4731 // int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
4733 // vst1q_s16(xy + (x1 << 1), vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS)),
4734 // vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS))));
4735 // vst1_s16(alpha + x1, vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y, v_mask), INTER_BITS),
4736 // vandq_s32(v_X, v_mask))));
// Scalar tail for the interpolated modes.
4739 for( ; x1 < bw; x1++ )
4741 int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
4742 int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
4743 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
4744 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
4745 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
4746 (X & (INTER_TAB_SIZE-1)));
// Resample this block through the generic remap machinery.
4751 if( interpolation == INTER_NEAREST )
4752 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
4755 Mat _matA(bh, bw, CV_16U, A);
4756 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
// Data members (several declarations elided in this copy).
4765 int interpolation, borderType;
4767 int *adelta, *bdelta;
// IPP-accelerated warpAffine worker. NOTE(review): the trailing `&& 0` in the
// preprocessor condition permanently disables this whole class — presumably a
// deliberate kill-switch for a known IPP defect (see the comment at 4802);
// confirm against upstream history before re-enabling.
#if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4773 class IPPWarpAffineInvoker :
4774 public ParallelLoopBody
4777 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
4778 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
4779 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
4780 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
4785 virtual void operator() (const Range& range) const
4787 IppiSize srcsize = { src.cols, src.rows };
4788 IppiRect srcroi = { 0, 0, src.cols, src.rows };
// Each worker handles only its band of destination rows.
4789 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
4790 int cnn = src.channels();
// IPP does not fill constant borders itself; pre-fill the band.
4791 if( borderType == BORDER_CONSTANT )
4793 IppiSize setSize = { dst.cols, range.end - range.start };
4794 void *dataPointer = dst.ptr(range.start);
4795 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
// Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
4803 IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
4804 (int)dst.step[0], dstroi, coeffs, mode );
4809 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Members: reference to the caller's 2x3 coefficient array and result flag.
4816 double (&coeffs)[2][3];
4819 ippiWarpAffineBackFunc func;
// Non-copyable: declared, never defined.
4821 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
// Selector passed to ocl_warpTransform: affine (2x3 M) vs perspective (3x3 M).
4827 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
// OpenCL implementation shared by warpAffine and warpPerspective.
// Returns false when the request cannot be handled on the device so the
// caller falls back to the CPU path.
// NOTE(review): lines are elided in this copy (non-contiguous embedded
// numbers); the `opts` declaration, some braces and early `return false`
// statements are missing — restore from upstream.
4829 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
4830 Size dsize, int flags, int borderType, const Scalar& borderValue,
4833 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
4834 const ocl::Device & dev = ocl::Device::getDefault();
4836 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// NOTE(review): declared double but holds a boolean condition — harmless,
// yet `bool` would state the intent; upstream later changed this.
4837 double doubleSupport = dev.doubleFPConfig() > 0;
4839 int interpolation = flags & INTER_MAX;
4840 if( interpolation == INTER_AREA )
4841 interpolation = INTER_LINEAR;
// Intel devices process several rows per work-item for simple affine cases.
4842 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
// Bail out for unsupported border/interpolation/type combinations.
4844 if ( !(borderType == cv::BORDER_CONSTANT &&
4845 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
4846 (!doubleSupport && depth == CV_64F) || cn > 4)
4849 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
4850 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
4851 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
4852 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
// OpenCL has no 3-element vectors with the needed alignment; widen to 4.
4854 int scalarcn = cn == 3 ? 4 : cn;
4855 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
// Work type: integer fixed-point or float accumulation depending on device.
4856 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
4857 int sctype = CV_MAKETYPE(wdepth, scalarcn);
// Build the kernel's -D option string for the chosen configuration.
4861 if (interpolation == INTER_NEAREST)
4863 opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
4864 ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
4865 ocl::typeToStr(CV_MAT_DEPTH(type)),
4866 ocl::typeToStr(sctype), cn, rowsPerWI);
4871 opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
4872 " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
4873 interpolationMap[interpolation], ocl::typeToStr(type),
4874 ocl::typeToStr(CV_MAT_DEPTH(type)),
4875 ocl::typeToStr(sctype),
4876 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
4877 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
4878 ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
4879 doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
4882 k.create(kernelName, program, opts);
// Border value marshalled as raw bytes of the scalar work type.
4886 double borderBuf[] = { 0, 0, 0, 0 };
4887 scalarToRawData(borderValue, borderBuf, sctype);
4889 UMat src = _src.getUMat(), M0;
4890 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4891 UMat dst = _dst.getUMat();
// Validate and load the user transform into a local 2x3 / 3x3 double matrix.
4894 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
4895 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
4896 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
4897 M1.rows == matRows && M1.cols == 3 );
4898 M1.convertTo(matM, matM.type());
// The kernels expect the inverse (dst->src) map; invert unless the caller
// already passed WARP_INVERSE_MAP.
4900 if( !(flags & WARP_INVERSE_MAP) )
4902 if (op_type == OCL_OP_PERSPECTIVE)
// Closed-form inversion of the 2x3 affine matrix, done in place on M.
4906 double D = M[0]*M[4] - M[1]*M[3];
4907 D = D != 0 ? 1./D : 0;
4908 double A11 = M[4]*D, A22=M[0]*D;
4909 M[0] = A11; M[1] *= -D;
4910 M[3] *= -D; M[4] = A22;
4911 double b1 = -M[0]*M[2] - M[1]*M[5];
4912 double b2 = -M[3]*M[2] - M[4]*M[5];
4913 M[2] = b1; M[5] = b2;
4916 matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
4918 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
4919 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
4921 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4922 return k.run(2, globalThreads, NULL, false);
// Applies a 2x3 affine transform to an image. Dispatch order: OpenCL, Tegra,
// IPP (currently compiled out via `&& 0`), then the generic WarpAffineInvoker.
// NOTE(review): lines are elided in this copy (non-contiguous embedded
// numbers) — braces, `double M[6];`, `src = src.clone();` etc. are missing;
// restore from upstream before compiling.
4930 void cv::warpAffine( InputArray _src, OutputArray _dst,
4931 InputArray _M0, Size dsize,
4932 int flags, int borderType, const Scalar& borderValue )
// Try the OpenCL path first when the destination is a UMat.
4934 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4935 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
4936 borderValue, OCL_OP_AFFINE))
4938 Mat src = _src.getMat(), M0 = _M0.getMat();
4939 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
4940 Mat dst = _dst.getMat();
4941 CV_Assert( src.cols > 0 && src.rows > 0 );
// In-place operation is not supported; the elided branch clones src.
4942 if( dst.data == src.data )
4946 Mat matM(2, 3, CV_64F, M);
4947 int interpolation = flags & INTER_MAX;
4948 if( interpolation == INTER_AREA )
4949 interpolation = INTER_LINEAR;
4951 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
4952 M0.convertTo(matM, matM.type());
4954 #ifdef HAVE_TEGRA_OPTIMIZATION
4955 if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
// Invert the transform unless WARP_INVERSE_MAP was requested — the worker
// needs the dst->src mapping.
4959 if( !(flags & WARP_INVERSE_MAP) )
4961 double D = M[0]*M[4] - M[1]*M[3];
4962 D = D != 0 ? 1./D : 0;
4963 double A11 = M[4]*D, A22=M[0]*D;
4964 M[0] = A11; M[1] *= -D;
4965 M[3] *= -D; M[4] = A22;
4966 double b1 = -M[0]*M[2] - M[1]*M[5];
4967 double b2 = -M[3]*M[2] - M[4]*M[5];
4968 M[2] = b1; M[5] = b2;
// Per-column fixed-point increments M[0]*x and M[3]*x, shared by all rows.
4972 AutoBuffer<int> _abdelta(dst.cols*2);
4973 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
4974 const int AB_BITS = MAX(10, (int)INTER_BITS);
4975 const int AB_SCALE = 1 << AB_BITS;
// IPP path — permanently disabled by the `&& 0`; see IPPWarpAffineInvoker.
4977 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
4980 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4981 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
4982 ( cn == 1 || cn == 3 || cn == 4 ) &&
4983 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
4984 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
4986 ippiWarpAffineBackFunc ippFunc = 0;
// "Back" variants consume the already-inverted (dst->src) matrix.
4987 if ((flags & WARP_INVERSE_MAP) != 0)
4990 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
4991 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
4992 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
4993 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
4994 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
4995 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
4996 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
4997 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
4998 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
5004 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
5005 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
5006 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
5007 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
5008 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
5009 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
5010 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
5011 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
5012 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
// Map OpenCV interpolation flags to IPP constants.
5016 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5017 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5018 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
5020 CV_Assert(mode && ippFunc);
5022 double coeffs[2][3];
5023 for( int i = 0; i < 2; i++ )
5024 for( int j = 0; j < 3; j++ )
5025 coeffs[i][j] = matM.at<double>(i, j);
5028 Range range(0, dst.rows);
5029 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5030 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5033 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// On IPP failure, clear the sticky error and fall through to the CPU path.
5036 setIppErrorStatus();
// Generic CPU path: precompute column deltas, then run the parallel worker.
5041 for( x = 0; x < dst.cols; x++ )
5043 adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
5044 bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
5047 Range range(0, dst.rows);
5048 WarpAffineInvoker invoker(src, dst, interpolation, borderType,
5049 borderValue, adelta, bdelta, M);
5050 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
// ParallelLoopBody for warpPerspective: builds per-block fixed-point
// coordinate maps from the 3x3 homography (with per-pixel division by W)
// and delegates resampling to remap(). Unlike WarpAffineInvoker there are
// no SIMD fast paths here — the projective divide dominates.
// NOTE(review): braces and some members (M, borderValue) are elided in this
// copy (non-contiguous embedded line numbers); restore from upstream.
5057 class WarpPerspectiveInvoker :
5058 public ParallelLoopBody
5061 WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
5062 int _borderType, const Scalar &_borderValue) :
5063 ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
5064 borderType(_borderType), borderValue(_borderValue)
5068 virtual void operator() (const Range& range) const
5070 const int BLOCK_SZ = 32;
// Stack buffers: interleaved (x,y) shorts and interpolation-table indices.
5071 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5072 int x, y, x1, y1, width = dst.cols, height = dst.rows;
// Choose a block of ~BLOCK_SZ^2 pixels that fits the destination.
5074 int bh0 = std::min(BLOCK_SZ/2, height);
5075 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
5076 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
5078 for( y = range.start; y < range.end; y += bh0 )
5080 for( x = 0; x < width; x += bw0 )
5082 int bw = std::min( bw0, width - x);
5083 int bh = std::min( bh0, range.end - y); // height
5085 Mat _XY(bh, bw, CV_16SC2, XY), matA;
5086 Mat dpart(dst, Rect(x, y, bw, bh));
5088 for( y1 = 0; y1 < bh; y1++ )
5090 short* xy = XY + y1*bw*2;
// Homogeneous coordinates at the start of this row; stepping x1 adds
// M[0], M[3], M[6] respectively.
5091 double X0 = M[0]*x + M[1]*(y + y1) + M[2];
5092 double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
5093 double W0 = M[6]*x + M[7]*(y + y1) + M[8];
5095 if( interpolation == INTER_NEAREST )
5096 for( x1 = 0; x1 < bw; x1++ )
5098 double W = W0 + M[6]*x1;
// Clamp to int range before the saturate_cast to avoid UB on overflow.
5100 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
5101 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
5102 int X = saturate_cast<int>(fX);
5103 int Y = saturate_cast<int>(fY);
5105 xy[x1*2] = saturate_cast<short>(X);
5106 xy[x1*2+1] = saturate_cast<short>(Y);
// Interpolated modes: coordinates carry INTER_BITS fractional bits.
5110 short* alpha = A + y1*bw;
5111 for( x1 = 0; x1 < bw; x1++ )
5113 double W = W0 + M[6]*x1;
// Guard against division by zero on the line at infinity.
5114 W = W ? INTER_TAB_SIZE/W : 0;
5115 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
5116 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
5117 int X = saturate_cast<int>(fX);
5118 int Y = saturate_cast<int>(fY);
5120 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
5121 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
5122 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
5123 (X & (INTER_TAB_SIZE-1)));
// Resample the block through the generic remap machinery.
5128 if( interpolation == INTER_NEAREST )
5129 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
5132 Mat _matA(bh, bw, CV_16U, A);
5133 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
// Data members (several declarations elided in this copy).
5143 int interpolation, borderType;
// IPP-accelerated warpPerspective worker. NOTE(review): the trailing `&& 0`
// permanently disables this class, mirroring the affine IPP path; confirm
// against upstream history before re-enabling.
#if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
5149 class IPPWarpPerspectiveInvoker :
5150 public ParallelLoopBody
5153 IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
5154 int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
5155 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
5156 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
5161 virtual void operator() (const Range& range) const
5163 IppiSize srcsize = {src.cols, src.rows};
5164 IppiRect srcroi = {0, 0, src.cols, src.rows};
// Each worker handles only its band of destination rows.
5165 IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
5166 int cnn = src.channels();
// IPP does not fill constant borders itself; pre-fill the band.
5168 if( borderType == BORDER_CONSTANT )
5170 IppiSize setSize = {dst.cols, range.end - range.start};
5171 void *dataPointer = dst.ptr(range.start);
5172 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
5179 IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
5180 if (status != ippStsNoErr)
5184 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Members: reference to the caller's 3x3 coefficient array and result flag.
5191 double (&coeffs)[3][3];
5193 const Scalar borderValue;
5194 ippiWarpPerspectiveFunc func;
// Non-copyable: declared, never defined.
5197 const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
// Applies a 3x3 projective transform to an image. Dispatch order: OpenCL,
// Tegra, IPP (compiled out via `&& 0`), then WarpPerspectiveInvoker.
// NOTE(review): lines are elided in this copy (non-contiguous embedded
// numbers) — braces, `double M[9];`, `src = src.clone();`, `invert(matM, matM);`
// etc. are missing; restore from upstream before compiling.
5202 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
5203 Size dsize, int flags, int borderType, const Scalar& borderValue )
5205 CV_Assert( _src.total() > 0 );
// Try the OpenCL path first when the destination is a UMat.
5207 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
5208 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
5209 OCL_OP_PERSPECTIVE))
5211 Mat src = _src.getMat(), M0 = _M0.getMat();
5212 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5213 Mat dst = _dst.getMat();
// In-place operation is not supported; the elided branch clones src.
5215 if( dst.data == src.data )
5219 Mat matM(3, 3, CV_64F, M);
5220 int interpolation = flags & INTER_MAX;
5221 if( interpolation == INTER_AREA )
5222 interpolation = INTER_LINEAR;
5224 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
5225 M0.convertTo(matM, matM.type());
5227 #ifdef HAVE_TEGRA_OPTIMIZATION
5228 if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
// IPP path — permanently disabled by the `&& 0`.
5233 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
5236 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5237 if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
5238 (cn == 1 || cn == 3 || cn == 4) &&
5239 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
5240 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
5242 ippiWarpPerspectiveFunc ippFunc = 0;
// "Back" variants consume the already-inverted (dst->src) homography.
5243 if ((flags & WARP_INVERSE_MAP) != 0)
5245 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
5246 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
5247 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
5248 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
5249 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
5250 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
5251 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
5252 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
5253 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
5257 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
5258 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
5259 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
5260 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
5261 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
5262 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
5263 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
5264 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
5265 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
// Map OpenCV interpolation flags to IPP constants.
5268 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5269 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5270 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
5271 CV_Assert(mode && ippFunc);
5273 double coeffs[3][3];
5274 for( int i = 0; i < 3; i++ )
5275 for( int j = 0; j < 3; j++ )
5276 coeffs[i][j] = matM.at<double>(i, j);
5279 Range range(0, dst.rows);
5280 IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5281 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5284 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// On IPP failure, clear the sticky error and fall through to the CPU path.
5287 setIppErrorStatus();
// Generic CPU path: the worker needs the dst->src map, so the homography is
// inverted (the invert() call itself was elided) unless already inverse.
5292 if( !(flags & WARP_INVERSE_MAP) )
5295 Range range(0, dst.rows);
5296 WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
5297 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5301 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
5304 double alpha = cos(angle)*scale;
5305 double beta = sin(angle)*scale;
5307 Mat M(2, 3, CV_64F);
5308 double* m = M.ptr<double>();
5312 m[2] = (1-alpha)*center.x - beta*center.y;
5315 m[5] = beta*center.x + (1-alpha)*center.y;
5320 /* Calculates coefficients of perspective transformation
5321 * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
5323 * c00*xi + c01*yi + c02
5324 * ui = ---------------------
5325 * c20*xi + c21*yi + c22
5327 * c10*xi + c11*yi + c12
5328 * vi = ---------------------
5329 * c20*xi + c21*yi + c22
5331 * Coefficients are calculated by solving linear system:
5332 * / x0 y0 1 0 0 0 -x0*u0 -y0*u0 \ /c00\ /u0\
5333 * | x1 y1 1 0 0 0 -x1*u1 -y1*u1 | |c01| |u1|
5334 * | x2 y2 1 0 0 0 -x2*u2 -y2*u2 | |c02| |u2|
5335 * | x3 y3 1 0 0 0 -x3*u3 -y3*u3 |.|c10|=|u3|,
5336 * | 0 0 0 x0 y0 1 -x0*v0 -y0*v0 | |c11| |v0|
5337 * | 0 0 0 x1 y1 1 -x1*v1 -y1*v1 | |c12| |v1|
5338 * | 0 0 0 x2 y2 1 -x2*v2 -y2*v2 | |c20| |v2|
5339 * \ 0 0 0 x3 y3 1 -x3*v3 -y3*v3 / \c21/ \v3/
5342 * cij - matrix coefficients, c22 = 1
5344 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
5346 Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
5347 double a[8][8], b[8];
5348 Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
5350 for( int i = 0; i < 4; ++i )
5352 a[i][0] = a[i+4][3] = src[i].x;
5353 a[i][1] = a[i+4][4] = src[i].y;
5354 a[i][2] = a[i+4][5] = 1;
5355 a[i][3] = a[i][4] = a[i][5] =
5356 a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
5357 a[i][6] = -src[i].x*dst[i].x;
5358 a[i][7] = -src[i].y*dst[i].x;
5359 a[i+4][6] = -src[i].x*dst[i].y;
5360 a[i+4][7] = -src[i].y*dst[i].y;
5365 solve( A, B, X, DECOMP_SVD );
5366 M.ptr<double>()[8] = 1.;
5371 /* Calculates coefficients of affine transformation
5372 * which maps (xi,yi) to (ui,vi), (i=1,2,3):
5374 * ui = c00*xi + c01*yi + c02
5376 * vi = c10*xi + c11*yi + c12
5378 * Coefficients are calculated by solving linear system:
5379 * / x0 y0 1 0 0 0 \ /c00\ /u0\
5380 * | x1 y1 1 0 0 0 | |c01| |u1|
5381 * | x2 y2 1 0 0 0 | |c02| |u2|
5382 * | 0 0 0 x0 y0 1 | |c10| |v0|
5383 * | 0 0 0 x1 y1 1 | |c11| |v1|
5384 * \ 0 0 0 x2 y2 1 / |c12| |v2|
5387 * cij - matrix coefficients
5390 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
5392 Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
5393 double a[6*6], b[6];
5394 Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
5396 for( int i = 0; i < 3; i++ )
5400 a[j] = a[k+3] = src[i].x;
5401 a[j+1] = a[k+4] = src[i].y;
5402 a[j+2] = a[k+5] = 1;
5403 a[j+3] = a[j+4] = a[j+5] = 0;
5404 a[k] = a[k+1] = a[k+2] = 0;
5406 b[i*2+1] = dst[i].y;
5413 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
5415 Mat matM = _matM.getMat();
5416 CV_Assert(matM.rows == 2 && matM.cols == 3);
5417 __iM.create(2, 3, matM.type());
5418 Mat _iM = __iM.getMat();
5420 if( matM.type() == CV_32F )
5422 const float* M = matM.ptr<float>();
5423 float* iM = _iM.ptr<float>();
5424 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5426 double D = M[0]*M[step+1] - M[1]*M[step];
5427 D = D != 0 ? 1./D : 0;
5428 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5429 double b1 = -A11*M[2] - A12*M[step+2];
5430 double b2 = -A21*M[2] - A22*M[step+2];
5432 iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
5433 iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
5435 else if( matM.type() == CV_64F )
5437 const double* M = matM.ptr<double>();
5438 double* iM = _iM.ptr<double>();
5439 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
5441 double D = M[0]*M[step+1] - M[1]*M[step];
5442 D = D != 0 ? 1./D : 0;
5443 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
5444 double b1 = -A11*M[2] - A12*M[step+2];
5445 double b2 = -A21*M[2] - A22*M[step+2];
5447 iM[0] = A11; iM[1] = A12; iM[2] = b1;
5448 iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
5451 CV_Error( CV_StsUnsupportedFormat, "" );
5454 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
5456 Mat src = _src.getMat(), dst = _dst.getMat();
5457 CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
5458 return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5461 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
5463 Mat src = _src.getMat(), dst = _dst.getMat();
5464 CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
5465 return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
5469 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
5471 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5472 CV_Assert( src.type() == dst.type() );
5473 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
5474 (double)dst.rows/src.rows, method );
5479 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5480 int flags, CvScalar fillval )
5482 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5483 cv::Mat matrix = cv::cvarrToMat(marr);
5484 CV_Assert( src.type() == dst.type() );
5485 cv::warpAffine( src, dst, matrix, dst.size(), flags,
5486 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5491 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
5492 int flags, CvScalar fillval )
5494 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5495 cv::Mat matrix = cv::cvarrToMat(marr);
5496 CV_Assert( src.type() == dst.type() );
5497 cv::warpPerspective( src, dst, matrix, dst.size(), flags,
5498 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5503 cvRemap( const CvArr* srcarr, CvArr* dstarr,
5504 const CvArr* _mapx, const CvArr* _mapy,
5505 int flags, CvScalar fillval )
5507 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
5508 cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
5509 CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
5510 cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
5511 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
5513 CV_Assert( dst0.data == dst.data );
5518 cv2DRotationMatrix( CvPoint2D32f center, double angle,
5519 double scale, CvMat* matrix )
5521 cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
5522 CV_Assert( M.size() == M0.size() );
5523 M.convertTo(M0, M0.type());
5529 cvGetPerspectiveTransform( const CvPoint2D32f* src,
5530 const CvPoint2D32f* dst,
5533 cv::Mat M0 = cv::cvarrToMat(matrix),
5534 M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5535 CV_Assert( M.size() == M0.size() );
5536 M.convertTo(M0, M0.type());
5542 cvGetAffineTransform( const CvPoint2D32f* src,
5543 const CvPoint2D32f* dst,
5546 cv::Mat M0 = cv::cvarrToMat(matrix),
5547 M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
5548 CV_Assert( M.size() == M0.size() );
5549 M.convertTo(M0, M0.type());
5555 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
5557 cv::Mat map1 = cv::cvarrToMat(arr1), map2;
5558 cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
5561 map2 = cv::cvarrToMat(arr2);
5564 dstmap2 = cv::cvarrToMat(dstarr2);
5565 if( dstmap2.type() == CV_16SC1 )
5566 dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
5569 cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
5572 /****************************************************************************************\
5573 * Log-Polar Transform *
5574 \****************************************************************************************/
/* Currently implemented via cvRemap; a more accurate implementation would
   use a super-sampling technique outside of the "fovea" circle. */
5579 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
5580 CvPoint2D32f center, double M, int flags )
5582 cv::Ptr<CvMat> mapx, mapy;
5584 CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
5585 CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
5586 CvSize ssize, dsize;
5588 if( !CV_ARE_TYPES_EQ( src, dst ))
5589 CV_Error( CV_StsUnmatchedFormats, "" );
5592 CV_Error( CV_StsOutOfRange, "M should be >0" );
5594 ssize = cvGetMatSize(src);
5595 dsize = cvGetMatSize(dst);
5597 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5598 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5600 if( !(flags & CV_WARP_INVERSE_MAP) )
5603 cv::AutoBuffer<double> _exp_tab(dsize.width);
5604 double* exp_tab = _exp_tab;
5606 for( rho = 0; rho < dst->width; rho++ )
5607 exp_tab[rho] = std::exp(rho/M);
5609 for( phi = 0; phi < dsize.height; phi++ )
5611 double cp = cos(phi*2*CV_PI/dsize.height);
5612 double sp = sin(phi*2*CV_PI/dsize.height);
5613 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5614 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5616 for( rho = 0; rho < dsize.width; rho++ )
5618 double r = exp_tab[rho];
5619 double x = r*cp + center.x;
5620 double y = r*sp + center.y;
5630 CvMat bufx, bufy, bufp, bufa;
5631 double ascale = ssize.height/(2*CV_PI);
5632 cv::AutoBuffer<float> _buf(4*dsize.width);
5635 bufx = cvMat( 1, dsize.width, CV_32F, buf );
5636 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5637 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5638 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5640 for( x = 0; x < dsize.width; x++ )
5641 bufx.data.fl[x] = (float)x - center.x;
5643 for( y = 0; y < dsize.height; y++ )
5645 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5646 float* my = (float*)(mapy->data.ptr + y*mapy->step);
5648 for( x = 0; x < dsize.width; x++ )
5649 bufy.data.fl[x] = (float)y - center.y;
5652 cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
5654 for( x = 0; x < dsize.width; x++ )
5655 bufp.data.fl[x] += 1.f;
5657 cvLog( &bufp, &bufp );
5659 for( x = 0; x < dsize.width; x++ )
5661 double rho = bufp.data.fl[x]*M;
5662 double phi = bufa.data.fl[x]*ascale;
5668 for( x = 0; x < dsize.width; x++ )
5670 double xx = bufx.data.fl[x];
5671 double yy = bufy.data.fl[x];
5673 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
5674 double a = atan2(yy,xx);
5686 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5689 void cv::logPolar( InputArray _src, OutputArray _dst,
5690 Point2f center, double M, int flags )
5692 Mat src = _src.getMat();
5693 _dst.create( src.size(), src.type() );
5694 CvMat c_src = src, c_dst = _dst.getMat();
5695 cvLogPolar( &c_src, &c_dst, center, M, flags );
5698 /****************************************************************************************
5699 Linear-Polar Transform
5700 J.L. Blanco, Apr 2009
5701 ****************************************************************************************/
5703 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
5704 CvPoint2D32f center, double maxRadius, int flags )
5706 cv::Ptr<CvMat> mapx, mapy;
5708 CvMat srcstub, *src = (CvMat*)srcarr;
5709 CvMat dststub, *dst = (CvMat*)dstarr;
5710 CvSize ssize, dsize;
5712 src = cvGetMat( srcarr, &srcstub,0,0 );
5713 dst = cvGetMat( dstarr, &dststub,0,0 );
5715 if( !CV_ARE_TYPES_EQ( src, dst ))
5716 CV_Error( CV_StsUnmatchedFormats, "" );
5718 ssize.width = src->cols;
5719 ssize.height = src->rows;
5720 dsize.width = dst->cols;
5721 dsize.height = dst->rows;
5723 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5724 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
5726 if( !(flags & CV_WARP_INVERSE_MAP) )
5730 for( phi = 0; phi < dsize.height; phi++ )
5732 double cp = cos(phi*2*CV_PI/dsize.height);
5733 double sp = sin(phi*2*CV_PI/dsize.height);
5734 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
5735 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
5737 for( rho = 0; rho < dsize.width; rho++ )
5739 double r = maxRadius*(rho+1)/dsize.width;
5740 double x = r*cp + center.x;
5741 double y = r*sp + center.y;
5751 CvMat bufx, bufy, bufp, bufa;
5752 const double ascale = ssize.height/(2*CV_PI);
5753 const double pscale = ssize.width/maxRadius;
5755 cv::AutoBuffer<float> _buf(4*dsize.width);
5758 bufx = cvMat( 1, dsize.width, CV_32F, buf );
5759 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
5760 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
5761 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
5763 for( x = 0; x < dsize.width; x++ )
5764 bufx.data.fl[x] = (float)x - center.x;
5766 for( y = 0; y < dsize.height; y++ )
5768 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
5769 float* my = (float*)(mapy->data.ptr + y*mapy->step);
5771 for( x = 0; x < dsize.width; x++ )
5772 bufy.data.fl[x] = (float)y - center.y;
5774 cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
5776 for( x = 0; x < dsize.width; x++ )
5777 bufp.data.fl[x] += 1.f;
5779 for( x = 0; x < dsize.width; x++ )
5781 double rho = bufp.data.fl[x]*pscale;
5782 double phi = bufa.data.fl[x]*ascale;
5789 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
5792 void cv::linearPolar( InputArray _src, OutputArray _dst,
5793 Point2f center, double maxRadius, int flags )
5795 Mat src = _src.getMat();
5796 _dst.create( src.size(), src.type() );
5797 CvMat c_src = src, c_dst = _dst.getMat();
5798 cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );