/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"

namespace cv
{
#if IPP_VERSION_X100 >= 710
typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
template <int channels, typename Type>
bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
{
    CV_INSTRUMENT_REGION_IPP()

    Type values[channels];
    for( int i = 0; i < channels; i++ )
        values[i] = saturate_cast<Type>(value[i]);
    return func(values, dataPointer, step, size) >= 0;
}
static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
{
    CV_INSTRUMENT_REGION_IPP()

    if( channels == 1 )
    {
        switch( depth )
        {
        case CV_8U:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_8u_C1R,(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size)) >= 0;
        case CV_16U:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_16u_C1R,(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size)) >= 0;
        case CV_32F:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_32f_C1R,(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size)) >= 0;
        }
    }
    else
    {
        if( channels == 3 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
            case CV_16U:
                return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
            case CV_32F:
                return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
            }
        }
        else if( channels == 4 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
            case CV_16U:
                return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
            case CV_32F:
                return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
            }
        }
    }
    return false;
}
#endif
/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
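/* Exposition only (not part of the library): how these fixed-point scales are
   used downstream. Interpolation weights in [0,1] are quantized to
   w*INTER_RESIZE_COEF_SCALE, products accumulate in integers, and a biased
   right shift rounds back. A minimal hedged sketch with a hypothetical helper:

       static inline uchar blendFixedPt( uchar a, uchar b, float w )
       {
           const int BITS = 11, SCALE = 1 << BITS;       // INTER_RESIZE_COEF_BITS
           int wi = cvRound(w*SCALE);                    // quantized weight
           int acc = a*(SCALE - wi) + b*wi;              // integer accumulation
           return (uchar)((acc + (SCALE >> 1)) >> BITS); // round, rescale
       }
*/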
static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];
static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}
static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
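/* Worked check (exposition only): with A = -0.75 and x = 0.5 the formulas above
   give coeffs = { -0.09375, 0.59375, 0.59375, -0.09375 }, which sums to 1 by
   construction since coeffs[3] is defined as the residual:

       float c[4];
       interpolateCubic( 0.5f, c );  // c[0]+c[1]+c[2]+c[3] == 1.f
*/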
static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}
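/* Exposition only: the x < FLT_EPSILON branch makes the kernel an exact
   pass-through, and the final loop renormalizes so the eight taps sum to 1:

       float c[8];
       interpolateLanczos4( 0.f, c );   // c[3] == 1, all other taps == 0
*/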
static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}
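/* Usage sketch (exposition only): building a 1-D coefficient table. For
   INTER_LINEAR each of the tabsz entries holds the weight pair (1-t, t):

       float tab[2*INTER_TAB_SIZE];
       initInterTab1D( INTER_LINEAR, tab, INTER_TAB_SIZE );
       // tab[2*i]   == 1.f - i/(float)INTER_TAB_SIZE
       // tab[2*i+1] ==       i/(float)INTER_TAB_SIZE
*/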
static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}
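/* Indexing sketch (exposition only): a fractional offset (fx, fy) in [0,1)^2 is
   quantized to table cell (j, i) and selects one ksize*ksize weight patch:

       const float* wtab = (const float*)initInterTab2D( INTER_LINEAR, false );
       int i = cvFloor( fy*INTER_TAB_SIZE ), j = cvFloor( fx*INTER_TAB_SIZE );
       const float* w = wtab + (i*INTER_TAB_SIZE + j)*2*2;  // 2x2 bilinear weights
*/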
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};
template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
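/* Worked check (exposition only): FixedPtCast<int, uchar, 4> maps 35 to
   (35 + 8) >> 4 == 2, i.e. round-half-up division by 2^4 followed by a
   saturating cast:

       FixedPtCast<int, uchar, 4> cast;
       uchar r = cast(35);   // r == 2
*/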
/****************************************************************************************\
*                                         Resize                                        *
\****************************************************************************************/
class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
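/* Usage sketch (exposition only): this is the path cv::resize takes for
   nearest-neighbor scaling; x_ofs caches per-column byte offsets so each
   destination row becomes a simple gather:

       cv::Mat small;
       cv::resize( img, small, cv::Size(), 0.5, 0.5, cv::INTER_NEAREST );
*/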
struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
                   const uchar*, int, int, int, int, int) const { return 0; }
};
#if CV_SSE2

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};
template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
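/* Exposition only: the shiftval trick above emulates an unsigned saturating
   pack using SSE2-only instructions. For the 16u variant, SHRT_MIN biases the
   32-bit values into signed-16 range before _mm_packs_epi32, and adding the
   same bias again afterwards wraps modulo 2^16, recovering the unsigned value:

       int v = 40000 + SHRT_MIN;                  // 7232, packs without clipping
       ushort u = (ushort)(short)(v + SHRT_MIN);  // == 40000 again (mod 2^16)
*/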
struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};
struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
               b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};
template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
               b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
               b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};
#if CV_SSE4_1

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;

#endif
struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};
struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            _mm_storeu_ps(dst + x, v_dst);
        }

        return x;
    }
};

#elif CV_NEON
struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};
struct VResizeLinearVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeLinearVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        short* dst = (short*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);

            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
        }

        return x;
    }
};

typedef VResizeNoVec VResizeCubicVec_32s8u;
struct VResizeCubicVec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                               v_b1, vld1q_f32(S1 + x + 4)),
                                                     v_b2, vld1q_f32(S2 + x + 4)),
                                           v_b3, vld1q_f32(S3 + x + 4));

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                               v_b1, vld1q_f32(S1 + x + 4)),
                                                     v_b2, vld1q_f32(S2 + x + 4)),
                                           v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                             v_b1, vld1q_f32(S1 + x)),
                                                   v_b2, vld1q_f32(S2 + x)),
                                         v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                 v_b1, vld1q_f32(S1 + x + 4)),
                                                       v_b2, vld1q_f32(S2 + x + 4)),
                                             v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                               v_b5, vld1q_f32(S5 + x)),
                                                     v_b6, vld1q_f32(S6 + x)),
                                           v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                   v_b1, vld1q_f32(S1 + x + 4)),
                                         v_b2, vld1q_f32(S2 + x + 4)),
                               v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                   v_b5, vld1q_f32(S5 + x + 4)),
                                         v_b6, vld1q_f32(S6 + x + 4)),
                               v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                               v_b5, vld1q_f32(S5 + x)),
                                                     v_b6, vld1q_f32(S6 + x)),
                                           v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                   v_b1, vld1q_f32(S1 + x + 4)),
                                         v_b2, vld1q_f32(S2 + x + 4)),
                               v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                   v_b5, vld1q_f32(S5 + x + 4)),
                                         v_b6, vld1q_f32(S6 + x + 4)),
                               v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};
struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                               v_b1, vld1q_f32(S1 + x)),
                                                     v_b2, vld1q_f32(S2 + x)),
                                           v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                               v_b5, vld1q_f32(S5 + x)),
                                                     v_b6, vld1q_f32(S6 + x)),
                                           v_b7, vld1q_f32(S7 + x));
            vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
        }

        return x;
    }
};

#else
typedef VResizeNoVec VResizeLinearVec_32s8u;
typedef VResizeNoVec VResizeLinearVec_32f16u;
typedef VResizeNoVec VResizeLinearVec_32f16s;
typedef VResizeNoVec VResizeLinearVec_32f;

typedef VResizeNoVec VResizeCubicVec_32s8u;
typedef VResizeNoVec VResizeCubicVec_32f16u;
typedef VResizeNoVec VResizeCubicVec_32f16s;
typedef VResizeNoVec VResizeCubicVec_32f;

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;

#endif

typedef HResizeNoVec HResizeLinearVec_8u32s;
typedef HResizeNoVec HResizeLinearVec_16u32f;
typedef HResizeNoVec HResizeLinearVec_16s32f;
typedef HResizeNoVec HResizeLinearVec_32f;
typedef HResizeNoVec HResizeLinearVec_64f;
template<typename T, typename WT, typename AT, int ONE, class VecOp>
struct HResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        int dx, k;
        VecOp vecOp;

        int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
            xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );

        for( k = 0; k <= count - 2; k++ )
        {
            const T *S0 = src[k], *S1 = src[k+1];
            WT *D0 = dst[k], *D1 = dst[k+1];
            for( dx = dx0; dx < xmax; dx++ )
            {
                int sx = xofs[dx];
                WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
                WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
                WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
                D0[dx] = t0; D1[dx] = t1;
            }

            for( ; dx < dwidth; dx++ )
            {
                int sx = xofs[dx];
                D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
            }
        }

        for( ; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            for( dx = 0; dx < xmax; dx++ )
            {
                int sx = xofs[dx];
                D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
            }

            for( ; dx < dwidth; dx++ )
                D[dx] = WT(S[xofs[dx]]*ONE);
        }
    }
};
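/* Exposition only: the horizontal pass in scalar form. xofs[dx] is the offset
   of the left source sample for destination column dx (already multiplied by
   cn), and alpha stores two weights per column:

       for( int dx = 0; dx < dwidth; dx++ )
           D[dx] = S[xofs[dx]]*alpha[dx*2] + S[xofs[dx] + cn]*alpha[dx*2 + 1];
*/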
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        WT b0 = beta[0], b1 = beta[1];
        const WT *S0 = src[0], *S1 = src[1];
        CastOp castOp;
        VecOp vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            WT t0, t1;
            t0 = S0[x]*b0 + S1[x]*b1;
            t1 = S0[x+1]*b0 + S1[x+1]*b1;
            dst[x] = castOp(t0); dst[x+1] = castOp(t1);
            t0 = S0[x+2]*b0 + S1[x+2]*b1;
            t1 = S0[x+3]*b0 + S1[x+3]*b1;
            dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
        }
        #endif
        for( ; x < width; x++ )
            dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
    }
};
template<>
struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
{
    typedef uchar value_type;
    typedef int buf_type;
    typedef short alpha_type;

    void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
    {
        alpha_type b0 = beta[0], b1 = beta[1];
        const buf_type *S0 = src[0], *S1 = src[1];
        VResizeLinearVec_32s8u vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
            dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
            dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
            dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
        }
        #endif
        for( ; x < width; x++ )
            dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
    }
};
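/* Exposition only: the row buffers hold 2^11-scaled horizontal sums and beta
   is 2^11-scaled too, so the exact result is (b0*S0 + b1*S1) >> 22. The
   unrolled form pre-shifts by 4 and post-shifts by 16 (matching
   _mm_mulhi_epi16 in the SIMD path), leaving 2 fractional bits that the
   "+2 >> 2" rounds away:

       // ((b*(S >> 4)) >> 16)  ~=  (b*S) >> 20,  and the final >> 2 totals >> 22
*/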
template<typename T, typename WT, typename AT>
struct HResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 4 )
                {
                    int j, sx = xofs[dx] - cn;
                    WT v = 0;
                    for( j = 0; j < 4; j++ )
                    {
                        int sxj = sx + j*cn;
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 4 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
                        S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
                }
                limit = dwidth;
            }
            alpha -= dwidth*4;
        }
    }
};
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
        const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        CastOp castOp;
        VecOp vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        for( ; x < width; x++ )
            dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
    }
};
template<typename T, typename WT, typename AT>
struct HResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 8 )
                {
                    int j, sx = xofs[dx] - cn*3;
                    WT v = 0;
                    for( j = 0; j < 8; j++ )
                    {
                        int sxj = sx + j*cn;
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 8 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
                        S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
                        S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
                        S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
                }
                limit = dwidth;
            }
            alpha -= dwidth*8;
        }
    }
};
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        CastOp castOp;
        VecOp vecOp;
        int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            WT b = beta[0];
            const WT* S = src[0];
            WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;

            for( k = 1; k < 8; k++ )
            {
                b = beta[k]; S = src[k];
                s0 += S[x]*b; s1 += S[x+1]*b;
                s2 += S[x+2]*b; s3 += S[x+3]*b;
            }

            dst[x] = castOp(s0); dst[x+1] = castOp(s1);
            dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
        }
        #endif
        for( ; x < width; x++ )
        {
            dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
                src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
                src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
        }
    }
};
static inline int clip(int x, int a, int b)
{
    return x >= a ? (x < b ? x : b-1) : a;
}
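/* Exposition only: clip() clamps to the half-open range [a, b), so the upper
   bound is exclusive:

       // clip(-3, 0, 10) == 0;  clip(4, 0, 10) == 4;  clip(12, 0, 10) == 9
*/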
static const int MAX_ESIZE=16;
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
    public ParallelLoopBody
{
public:
    typedef typename HResize::value_type T;
    typedef typename HResize::buf_type WT;
    typedef typename HResize::alpha_type AT;

    resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
        const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
        int _ksize, int _xmin, int _xmax) :
        ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
        alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
        ksize(_ksize), xmin(_xmin), xmax(_xmax)
    {
        CV_Assert(ksize <= MAX_ESIZE);
    }

#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Warray-bounds"
#endif
    virtual void operator() (const Range& range) const
    {
        int dy, cn = src.channels();
        HResize hresize;
        VResize vresize;

        int bufstep = (int)alignSize(dsize.width, 16);
        AutoBuffer<WT> _buffer(bufstep*ksize);
        const T* srows[MAX_ESIZE]={0};
        WT* rows[MAX_ESIZE]={0};
        int prev_sy[MAX_ESIZE];

        for(int k = 0; k < ksize; k++ )
        {
            prev_sy[k] = -1;
            rows[k] = (WT*)_buffer + bufstep*k;
        }

        const AT* beta = _beta + ksize * range.start;

        for( dy = range.start; dy < range.end; dy++, beta += ksize )
        {
            int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;

            for(int k = 0; k < ksize; k++ )
            {
                int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
                for( k1 = std::max(k1, k); k1 < ksize; k1++ )
                {
                    if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
                    {
                        if( k1 > k )
                            memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
                        break;
                    }
                }
                if( k1 == ksize )
                    k0 = std::min(k0, k); // remember the first row that needs to be computed
                srows[k] = src.template ptr<T>(sy);
                prev_sy[k] = sy;
            }

            if( k0 < ksize )
                hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
                         ssize.width, dsize.width, cn, xmin, xmax );
            vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
        }
    }
#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
# pragma GCC diagnostic pop
#endif

private:
    Mat src;
    Mat dst;
    const int* xofs, *yofs;
    const AT* alpha, *_beta;
    Size ssize, dsize;
    const int ksize, xmin, xmax;

    resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
                            const int* xofs, const void* _alpha,
                            const int* yofs, const void* _beta,
                            int xmin, int xmax, int ksize )
{
    typedef typename HResize::alpha_type AT;

    const AT* beta = (const AT*)_beta;
    Size ssize = src.size(), dsize = dst.size();
    int cn = src.channels();
    ssize.width *= cn;
    dsize.width *= cn;
    xmin *= cn;
    xmax *= cn;
    // image resize is a separable operation: rows are interpolated first
    // (hresize), then the row buffers are blended vertically (vresize).

    Range range(0, dsize.height);
    resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
        ssize, dsize, ksize, xmin, xmax);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
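/* Exposition only: a scalar model of the separable scheme driven above. Each
   output row blends ksize horizontally-resized source rows, and the invoker
   caches those row buffers so neighbouring dy values can reuse them:

       // for each dy:                                   (parallel per range)
       //     rows[k] = hresize( src row clip(yofs[dy]-ksize/2+1+k) ), k in [0,ksize)
       //     dst row dy = sum_k beta[dy*ksize + k] * rows[k]          (vresize)
*/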
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    ResizeAreaFastNoVec(int, int) { }
    ResizeAreaFastNoVec(int, int, int, int) { }
    int operator() (const T*, T*, int) const
    { return 0; }
};

#if CV_NEON
class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const uchar* S, uchar* D, int w) const
    {
        int dx = 0;
        const uchar* S0 = S, * S1 = S0 + step;

        uint16x8_t v_2 = vdupq_n_u16(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
            {
                uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);

                uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
                v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
                v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);

                uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
                v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
                v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);

                vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);

                uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
                uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
                uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
                uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));

                uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
                                           vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
                uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
                                           vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
                uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);

                vst1_u8(D, vmovn_u16(v_dst));
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
};
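/* Exposition only: ResizeAreaFastVec_SIMD_8u computes the exact 2x2 box filter
   in integers; each output element is the rounded mean of four neighbours:

       // D[x] = (uchar)((S0[2*x] + S0[2*x+1] + S1[2*x] + S1[2*x+1] + 2) >> 2)
*/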
class ResizeAreaFastVec_SIMD_16u
{
public:
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const ushort * S, ushort * D, int w) const
    {
        int dx = 0;
        const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);

        uint32x4_t v_2 = vdupq_n_u32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);

                uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
                v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);

                uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
                v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);

                vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
                uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
                                             vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
                vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
};
class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const short * S, short * D, int w) const
    {
        int dx = 0;
        const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);

        int32x4_t v_2 = vdupq_n_s32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);

                int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
                v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
                v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);

                int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
                v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
                v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);

                vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
                int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
                                            vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
                vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
};
1966 struct ResizeAreaFastVec_SIMD_32f
1968 ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
1969 cn(_cn), step(_step)
1971 fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
1974 int operator() (const float * S, float * D, int w) const
1979 const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
1982 float32x4_t v_025 = vdupq_n_f32(0.25f);
1986 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
1988 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
1990 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
1991 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
1993 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
1998 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
2000 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
2001 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
2003 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
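// Note on the float path above: the integer kernels compute (sum + 2) >> 2,
// but the ">> 2" trick only applies to integers, so here the four-pixel sum
// is simply multiplied by v_025 (0.25f); no rounding bias is needed because
// the scaling is exact in floating point.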
class ResizeAreaFastVec_SIMD_8u
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    int operator() (const uchar* S, uchar* D, int w) const
        const uchar* S0 = S;
        const uchar* S1 = S0 + step;
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi16(2);
        __m128i masklow = _mm_set1_epi16(0x00ff);
        for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
            __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
            s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
            s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
        for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
            __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
            __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
            __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
            __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
            __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
            s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
            s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
            s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
            s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
            s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
            s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
            _mm_storel_epi64((__m128i*)(D+3), s0);
        int v[] = { 0, 0, -1, -1 };
        __m128i mask = _mm_loadu_si128((const __m128i*)v);
        for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
            __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
            __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
            __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
            __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
            __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
            s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
            __m128i res0 = _mm_srli_epi16(s0, 2);
            s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
            s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
            s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
            __m128i res1 = _mm_srli_epi16(s0, 2);
            s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
                                               _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
            _mm_storel_epi64((__m128i*)(D), s0);
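// The masklow trick in the cn == 1 loop above: for single-channel data,
// (r >> 8) + (r & 0x00ff) adds each even byte to its odd neighbour inside
// every 16-bit lane, which is exactly the horizontal pairwise sum the 2x2
// kernel needs. A scalar sketch of one lane (illustrative only):
//
//     ushort pair = (ushort)(row[2*i] + row[2*i + 1]);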
class ResizeAreaFastVec_SIMD_16u
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step)
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    int operator() (const ushort* S, ushort* D, int w) const
        const ushort* S0 = (const ushort*)S;
        const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
        __m128i masklow = _mm_set1_epi32(0x0000ffff);
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi32(2);
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
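// _mm_packus_epi32 is an SSE4.1 instruction, but this path only assumes
// SSE2, so it is emulated: the shift-left/arithmetic-shift-right pair
// sign-extends the low 16 bits of each lane (the results already fit in
// 16 bits after the ">> 2"), and the signed pack then reproduces those
// 16 bits exactly, without triggering signed saturation.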
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
            __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
            s0 = _mm_srli_epi32(s0, 2);
            s0 = _mm_packus_epi32(s0, zero);
            _mm_storel_epi64((__m128i*)D, s0);
        for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
            __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
            __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
            __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
            __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
            __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
            s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
            s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
            __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
            __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
            __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
            __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
            __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
            s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
            s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
#undef _mm_packus_epi32
class ResizeAreaFastVec_SIMD_16s
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    int operator() (const short* S, short* D, int w) const
        const short* S0 = (const short*)S;
        const short* S1 = (const short*)((const uchar*)(S) + step);
        __m128i masklow = _mm_set1_epi32(0x0000ffff);
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi32(2);
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
                                       _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
            __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
                                       _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
            s0 = _mm_srai_epi32(s0, 2);
            s0 = _mm_packs_epi32(s0, zero);
            _mm_storel_epi64((__m128i*)D, s0);
        for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
            __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
            __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
            __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
            __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
            __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
            s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
            s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
            __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
            __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
            __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
            __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
            __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
            s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
            s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
struct ResizeAreaFastVec_SIMD_32f
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
    int operator() (const float * S, float * D, int w) const
        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        __m128 v_025 = _mm_set1_ps(0.25f);
        const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
                   v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
            __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
                                       _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
            __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
                                       _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
            _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
            __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
            _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
template<typename T, typename SIMDVecOp>
struct ResizeAreaFastVec
    ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
    int operator() (const T* S, T* D, int w) const
        const T* nextS = (const T*)((const uchar*)S + step);
        int dx = vecOp(S, D, w);
        for( ; dx < w; ++dx )
            D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
        for( ; dx < w; dx += 3 )
            D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
            D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
            D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
        for( ; dx < w; dx += 4 )
            D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
            D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
            D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
            D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
    int scale_x, scale_y;
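// ResizeAreaFastVec first lets the SIMD functor consume as many output
// pixels as it can (vecOp returns how many it wrote), and the per-channel
// scalar loops above then finish the remaining tail; the cn == 3 and cn == 4
// variants only differ in the stride between corresponding channel samples.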
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
    public ParallelLoopBody
    resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
                           int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
        ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
        scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
    virtual void operator() (const Range& range) const
        Size ssize = src.size(), dsize = dst.size();
        int cn = src.channels();
        int area = scale_x*scale_y;
        float scale = 1.f/(area);
        int dwidth1 = (ssize.width/scale_x)*cn;
        VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
        for( dy = range.start; dy < range.end; dy++ )
            T* D = (T*)(dst.data + dst.step*dy);
            int sy0 = dy*scale_y;
            int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
            if( sy0 >= ssize.height )
                for( dx = 0; dx < dsize.width; dx++ )
            dx = vop(src.template ptr<T>(sy0), D, w);
            for( ; dx < w; dx++ )
                const T* S = src.template ptr<T>(sy0) + xofs[dx];
#if CV_ENABLE_UNROLLED
                for( ; k <= area - 4; k += 4 )
                    sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
                for( ; k < area; k++ )
                D[dx] = saturate_cast<T>(sum * scale);
            for( ; dx < dsize.width; dx++ )
                int count = 0, sx0 = xofs[dx];
                if( sx0 >= ssize.width )
                for( int sy = 0; sy < scale_y; sy++ )
                    if( sy0 + sy >= ssize.height )
                    const T* S = src.template ptr<T>(sy0 + sy) + sx0;
                    for( int sx = 0; sx < scale_x*cn; sx += cn )
                        if( sx0 + sx >= ssize.width )
                D[dx] = saturate_cast<T>((float)sum/count);
    int scale_x, scale_y;
    const int *ofs, *xofs;
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
                             int scale_x, int scale_y )
    Range range(0, dst.rows);
    resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
                                                 scale_y, ofs, xofs);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
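// The last argument of parallel_for_ is the nstripes hint, so the rows are
// split into roughly dst.total()/65536 stripes, i.e. about one stripe per
// 64K output pixels, each handled by one call of the invoker's operator().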
struct DecimateAlpha
template<typename T, typename WT> class ResizeArea_Invoker :
    public ParallelLoopBody
    ResizeArea_Invoker( const Mat& _src, Mat& _dst,
                        const DecimateAlpha* _xtab, int _xtab_size,
                        const DecimateAlpha* _ytab, int _ytab_size,
                        const int* _tabofs )
        xtab_size0 = _xtab_size;
        ytab_size = _ytab_size;
    virtual void operator() (const Range& range) const
        Size dsize = dst->size();
        int cn = dst->channels();
        AutoBuffer<WT> _buffer(dsize.width*2);
        const DecimateAlpha* xtab = xtab0;
        int xtab_size = xtab_size0;
        WT *buf = _buffer, *sum = buf + dsize.width;
        int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
        for( dx = 0; dx < dsize.width; dx++ )
        for( j = j_start; j < j_end; j++ )
            WT beta = ytab[j].alpha;
            int dy = ytab[j].di;
            int sy = ytab[j].si;
            const T* S = src->template ptr<T>(sy);
            for( dx = 0; dx < dsize.width; dx++ )
            for( k = 0; k < xtab_size; k++ )
                int dxn = xtab[k].di;
                WT alpha = xtab[k].alpha;
                buf[dxn] += S[xtab[k].si]*alpha;
            for( k = 0; k < xtab_size; k++ )
                int sxn = xtab[k].si;
                int dxn = xtab[k].di;
                WT alpha = xtab[k].alpha;
                WT t0 = buf[dxn] + S[sxn]*alpha;
                WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                buf[dxn] = t0; buf[dxn+1] = t1;
            for( k = 0; k < xtab_size; k++ )
                int sxn = xtab[k].si;
                int dxn = xtab[k].di;
                WT alpha = xtab[k].alpha;
                WT t0 = buf[dxn] + S[sxn]*alpha;
                WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
                buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
            for( k = 0; k < xtab_size; k++ )
                int sxn = xtab[k].si;
                int dxn = xtab[k].di;
                WT alpha = xtab[k].alpha;
                WT t0 = buf[dxn] + S[sxn]*alpha;
                WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
                buf[dxn] = t0; buf[dxn+1] = t1;
                t0 = buf[dxn+2] + S[sxn+2]*alpha;
                t1 = buf[dxn+3] + S[sxn+3]*alpha;
                buf[dxn+2] = t0; buf[dxn+3] = t1;
            for( k = 0; k < xtab_size; k++ )
                int sxn = xtab[k].si;
                int dxn = xtab[k].di;
                WT alpha = xtab[k].alpha;
                for( int c = 0; c < cn; c++ )
                    buf[dxn + c] += S[sxn + c]*alpha;
            T* D = dst->template ptr<T>(prev_dy);
            for( dx = 0; dx < dsize.width; dx++ )
                D[dx] = saturate_cast<T>(sum[dx]);
                sum[dx] = beta*buf[dx];
            for( dx = 0; dx < dsize.width; dx++ )
                sum[dx] += beta*buf[dx];
        T* D = dst->template ptr<T>(prev_dy);
        for( dx = 0; dx < dsize.width; dx++ )
            D[dx] = saturate_cast<T>(sum[dx]);
    const DecimateAlpha* xtab0;
    const DecimateAlpha* ytab;
    int xtab_size0, ytab_size;
template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst,
                         const DecimateAlpha* xtab, int xtab_size,
                         const DecimateAlpha* ytab, int ytab_size,
    parallel_for_(Range(0, dst.rows),
                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
                  dst.total()/((double)(1 << 16)));
typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
                            const int* xofs, const void* alpha,
                            const int* yofs, const void* beta,
                            int xmin, int xmax, int ksize );
typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
                                    const int* ofs, const int *xofs,
                                    int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                const DecimateAlpha* xtab, int xtab_size,
                                const DecimateAlpha* ytab, int ytab_size,
static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
    for(int dx = 0; dx < dsize; dx++ )
        double fsx1 = dx * scale;
        double fsx2 = fsx1 + scale;
        double cellWidth = std::min(scale, ssize - fsx1);
        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
        sx2 = std::min(sx2, ssize - 1);
        sx1 = std::min(sx1, sx2);
        if( sx1 - fsx1 > 1e-3 )
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = (sx1 - 1) * cn;
            tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
        for(int sx = sx1; sx < sx2; sx++ )
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = sx * cn;
            tab[k++].alpha = float(1.0 / cellWidth);
        if( fsx2 - sx2 > 1e-3 )
            assert( k < ssize*2 );
            tab[k].di = dx * cn;
            tab[k].si = sx2 * cn;
            tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
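// Worked example (illustrative only): ssize = 5, dsize = 2, scale = 2.5.
// For dx = 0: fsx1 = 0, fsx2 = 2.5, cellWidth = 2.5, sx1 = 0, sx2 = 2.
// The whole-pixel loop emits weight 1/2.5 = 0.4 for sx = 0 and sx = 1, and
// the fractional tail emits 0.5/2.5 = 0.2 for sx = 2, so
//     dst[0] = 0.4*src[0] + 0.4*src[1] + 0.2*src[2]
// and the weights of every destination pixel sum to 1.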
#define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }

#define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
    ippiResize = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));

#define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
    if (mode == (int)ippCubic) { *ok = false; return; } \
    ippiResize = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
    getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
    getSrcOffsetFunc = (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;

#define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
    ippiResize = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
    CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
    specBuf.allocate(specSize);\
    pSpec = (uchar*)specBuf;\
    AutoBuffer<uchar> buf(initSize);\
    uchar* pInit = (uchar*)buf;\
    CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));

#define SET_IPP_RESIZE_PTR(TYPE, CN) \
    if (mode == (int)ippLinear) { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
    else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
    else { *ok = false; return; } \
    getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
    getSrcOffsetFunc = (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;

#if IPP_VERSION_X100 >= 710
class IPPresizeInvoker :
    public ParallelLoopBody
    IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
        ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
        inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
        ippiResize(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
        IppiSize srcSize, dstSize;
        int type = src.type(), specSize = 0, initSize = 0;
        srcSize.width = src.cols;
        srcSize.height = src.rows;
        dstSize.width = dst.cols;
        dstSize.height = dst.rows;
#if IPP_DISABLE_BLOCK // disabled since it breaks tests for CascadeClassifier
        case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break;
        case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break;
        case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break;
        case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
        case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
        case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
        case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
        case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
        case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
        case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
        case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
        case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
        case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
        case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
        case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
        default: { *ok = false; return; } break;
    virtual void operator() (const Range& range) const
        int cn = src.channels();
        int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
        int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols);
        int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
        IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
        IppiSize dstSize = { dstwidth, dstheight - dsty };
        int bufsize = 0, itemSize = (int)src.elemSize1();
        CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
        CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
        const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
        Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
        AutoBuffer<uchar> buf(bufsize + 64);
        uchar* bufptr = alignPtr((uchar*)buf, 32);
        if( CV_INSTRUMENT_FUN_IPP(ippiResize, pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr) < 0 )
        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
    AutoBuffer<uchar> specBuf;
    ippiResizeFunc ippiResize;
    ippiResizeGetBufferSize getBufferSizeFunc;
    ippiResizeGetSrcOffset getSrcOffsetFunc;
    const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
                                      float * const alpha_tab, int * const ofs_tab)
    for ( ; dx < dsize; dx++)
        double fsx1 = dx * scale;
        double fsx2 = fsx1 + scale;
        double cellWidth = std::min(scale, ssize - fsx1);
        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
        sx2 = std::min(sx2, ssize - 1);
        sx1 = std::min(sx1, sx2);
        if (sx1 - fsx1 > 1e-3)
            map_tab[k] = sx1 - 1;
            alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
        for (int sx = sx1; sx < sx2; sx++)
            alpha_tab[k++] = float(1.0 / cellWidth);
        if (fsx2 - sx2 > 1e-3)
            alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
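// This builds the same decimation weights as computeResizeAreaTab above,
// but splits them into the flat map/alpha/ofs arrays that the OpenCL kernel
// consumes instead of an array of DecimateAlpha structs.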
static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
                        double fx, double fy, int interpolation)
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
    float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
    int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
    bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
    // when scale_x and scale_y are both equal to 2,
    // INTER_AREA (fast) produces the same result as INTER_LINEAR
    if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
        /*interpolation = INTER_AREA*/(void)0; // the INTER_AREA branch is slower here, so keep INTER_LINEAR
          (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
          (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )

    UMat src = _src.getUMat();
    _dst.create(dsize, type);
    UMat dst = _dst.getUMat();

    Size ssize = src.size();
    size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
    ocl::Image2D srcImage;

    // See if this could be done with a sampler. We stick with integer
    // datatypes because the observed error is low.
    bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
                       ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
                       ocl::Image2D::isFormatSupported(depth, cn, true) &&
        int wdepth = std::max(depth, CV_32S);
        cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
                                        "-D convertToDT=%s -D cn=%d",
                                        depth, ocl::typeToStr(type), ocl::typeToStr(depth),
                                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
        k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
            // Convert the input into an OpenCL image type, using normalized channel data types
            // and aliasing the UMat.
            srcImage = ocl::Image2D(src, true, true);
            k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
                   (float)inv_fx, (float)inv_fy);
    if (interpolation == INTER_LINEAR && !useSampler)
        // the integer path is slower because of its CPU-side preprocessing, so it is disabled
        if (depth == CV_8U && ((void)0, 0))
            AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
            int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
            short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
            for (int dx = 0; dx < dsize.width; dx++)
                fxx = (float)((dx+0.5)*inv_fx - 0.5);
                if (sx >= ssize.width-1)
                    fxx = 0, sx = ssize.width-1;
                ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
                ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
            for (int dy = 0; dy < dsize.height; dy++)
                fyy = (float)((dy+0.5)*inv_fy - 0.5);
                ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
                ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
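            // The ialpha/ibeta tables hold fixed-point coefficients scaled by
            // INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS). One
            // horizontal and one vertical coefficient are multiplied per tap,
            // so the accumulated product carries 2*INTER_RESIZE_COEF_BITS
            // fractional bits, which the final cast shifts away (cf.
            // FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2> in the
            // linear_tab further below).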
            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
            Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                     format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
                            "-D INTER_RESIZE_COEF_BITS=%d",
                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
                            cn, INTER_RESIZE_COEF_BITS));
            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
                   ocl::KernelArg::PtrReadOnly(coeffs));
            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                     format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
                            "-D INTER_RESIZE_COEF_BITS=%d",
                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
                            cn, INTER_RESIZE_COEF_BITS));
            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
                   (float)inv_fx, (float)inv_fy);
    else if (interpolation == INTER_NEAREST)
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
                        ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
               (float)inv_fx, (float)inv_fy);
    else if (interpolation == INTER_AREA)
        int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
        int wtype = CV_MAKE_TYPE(wdepth, cn);
        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
                                    ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
        UMat alphaOcl, tabofsOcl, mapOcl;
            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
                                               iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
            k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
            k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
            int xytab_size = (ssize.width + ssize.height) << 1;
            int tabofs_size = dsize.height + dsize.width + 2;
            AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
            AutoBuffer<float> _xyalpha_tab(xytab_size);
            int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
            float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
            int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
            ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
            ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
            // upload the precomputed tables to the GPU
            Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
            Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
            Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
            k.args(srcarg, dstarg);
            k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
                   ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
        return k.run(2, globalsize, NULL, false);
    return k.run(2, globalsize, 0, false);
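    // Both k.run() calls launch a 2D NDRange covering the destination
    // (globalsize = {dst.cols, dst.rows}) with an implementation-chosen local
    // size; assuming the usual ocl::Kernel::run signature, the trailing
    // 'false' is the sync flag, so the kernel is enqueued without waiting
    // for completion.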
#if IPP_VERSION_X100 >= 710
static bool ipp_resize_mt(Mat & src, Mat & dst,
                          double inv_scale_x, double inv_scale_y, int interpolation)
    CV_INSTRUMENT_REGION_IPP()

    if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
    else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)

    Range range(0, src.rows);
    IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));

//==================================================================================================

void resize(int src_type,
            const uchar * src_data, size_t src_step, int src_width, int src_height,
            uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
            double inv_scale_x, double inv_scale_y, int interpolation)
    CV_INSTRUMENT_REGION()

    CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
    if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
        inv_scale_x = static_cast<double>(dst_width) / src_width;
        inv_scale_y = static_cast<double>(dst_height) / src_height;

    CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);

    static ResizeFunc linear_tab[] =
        HResizeLinear<uchar, int, short,
            INTER_RESIZE_COEF_SCALE,
            HResizeLinearVec_8u32s>,
        VResizeLinear<uchar, int, short,
            FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
            VResizeLinearVec_32s8u> >,
        HResizeLinear<ushort, float, float, 1,
            HResizeLinearVec_16u32f>,
        VResizeLinear<ushort, float, float, Cast<float, ushort>,
            VResizeLinearVec_32f16u> >,
        HResizeLinear<short, float, float, 1,
            HResizeLinearVec_16s32f>,
        VResizeLinear<short, float, float, Cast<float, short>,
            VResizeLinearVec_32f16s> >,
        HResizeLinear<float, float, float, 1,
            HResizeLinearVec_32f>,
        VResizeLinear<float, float, float, Cast<float, float>,
            VResizeLinearVec_32f> >,
        HResizeLinear<double, double, float, 1,
        VResizeLinear<double, double, float, Cast<double, double>,

    static ResizeFunc cubic_tab[] =
        HResizeCubic<uchar, int, short>,
        VResizeCubic<uchar, int, short,
            FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
            VResizeCubicVec_32s8u> >,
        HResizeCubic<ushort, float, float>,
        VResizeCubic<ushort, float, float, Cast<float, ushort>,
            VResizeCubicVec_32f16u> >,
        HResizeCubic<short, float, float>,
        VResizeCubic<short, float, float, Cast<float, short>,
            VResizeCubicVec_32f16s> >,
        HResizeCubic<float, float, float>,
        VResizeCubic<float, float, float, Cast<float, float>,
            VResizeCubicVec_32f> >,
        HResizeCubic<double, double, float>,
        VResizeCubic<double, double, float, Cast<double, double>,

    static ResizeFunc lanczos4_tab[] =
        resizeGeneric_<HResizeLanczos4<uchar, int, short>,
            VResizeLanczos4<uchar, int, short,
            FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
        resizeGeneric_<HResizeLanczos4<ushort, float, float>,
            VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
            VResizeLanczos4Vec_32f16u> >,
        resizeGeneric_<HResizeLanczos4<short, float, float>,
            VResizeLanczos4<short, float, float, Cast<float, short>,
            VResizeLanczos4Vec_32f16s> >,
        resizeGeneric_<HResizeLanczos4<float, float, float>,
            VResizeLanczos4<float, float, float, Cast<float, float>,
            VResizeLanczos4Vec_32f> >,
        resizeGeneric_<HResizeLanczos4<double, double, float>,
            VResizeLanczos4<double, double, float, Cast<double, double>,

    static ResizeAreaFastFunc areafast_tab[] =
        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
        resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
        resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
        resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,

    static ResizeAreaFunc area_tab[] =
        resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
        resizeArea_<short, float>, 0, resizeArea_<float, float>,
        resizeArea_<double, double>, 0

    int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
    double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;

    int iscale_x = saturate_cast<int>(scale_x);
    int iscale_y = saturate_cast<int>(scale_y);

    bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
            std::abs(scale_y - iscale_y) < DBL_EPSILON;

    Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
                      saturate_cast<int>(src_height*inv_scale_y));
    CV_Assert( dsize.area() > 0 );

    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
    Mat dst(dsize, src_type, dst_data, dst_step);

    if (interpolation == INTER_LINEAR && src_height >= 2 && src_width >= 2)
        mode = INTER_LINEAR;
    else if (interpolation == INTER_CUBIC && src_height >= 4 && src_width >= 4)

    const double IPP_RESIZE_EPS = 1e-10;
    double ex = fabs((double)dsize.width / src_width - inv_scale_x) / inv_scale_x;
    double ey = fabs((double)dsize.height / src_height - inv_scale_y) / inv_scale_y;

    CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
        (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
        !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) &&
        mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
        (depth == CV_64F && mode == INTER_LINEAR)),
        ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))

    if( interpolation == INTER_NEAREST )
        resizeNN( src, dst, inv_scale_x, inv_scale_y );

    int k, sx, sy, dx, dy;
    // when scale_x and scale_y are both equal to 2,
    // INTER_AREA (fast) produces the same result as INTER_LINEAR
    if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
        interpolation = INTER_AREA;
    // true "area" interpolation is only implemented for the case (scale_x >= 1 && scale_y >= 1),
    // i.e. for shrinking; in other cases it is emulated using a variant of bilinear interpolation
    if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
            int area = iscale_x*iscale_y;
            size_t srcstep = src_step / src.elemSize1();
            AutoBuffer<int> _ofs(area + dsize.width*cn);
            int* xofs = ofs + area;
            ResizeAreaFastFunc func = areafast_tab[depth];
            CV_Assert( func != 0 );

            for( sy = 0, k = 0; sy < iscale_y; sy++ )
                for( sx = 0; sx < iscale_x; sx++ )
                    ofs[k++] = (int)(sy*srcstep + sx*cn);

            for( dx = 0; dx < dsize.width; dx++ )
                for( k = 0; k < cn; k++ )
                    xofs[j + k] = sx + k;

            func( src, dst, ofs, xofs, iscale_x, iscale_y );

        ResizeAreaFunc func = area_tab[depth];
        CV_Assert( func != 0 && cn <= 4 );

        AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
        DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2;

        int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
        int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);

        AutoBuffer<int> _tabofs(dsize.height + 1);
        int* tabofs = _tabofs;
        for( k = 0, dy = 0; k < ytab_size; k++ )
            if( k == 0 || ytab[k].di != ytab[k-1].di )
                assert( ytab[k].di == dy );
        tabofs[dy] = ytab_size;

        func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
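        // Summary of the true-area path: computeResizeAreaTab builds the
        // horizontal (xtab) and vertical (ytab) weight tables, tabofs records
        // where each destination row's ytab entries begin, and
        // ResizeArea_Invoker then performs a weighted horizontal pass into
        // 'buf' followed by a weighted vertical accumulation into 'sum' for
        // every output row.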
    int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
    bool area_mode = interpolation == INTER_AREA;
    bool fixpt = depth == CV_8U;

    int ksize=0, ksize2;
    if( interpolation == INTER_CUBIC )
        ksize = 4, func = cubic_tab[depth];
    else if( interpolation == INTER_LANCZOS4 )
        ksize = 8, func = lanczos4_tab[depth];
    else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
        ksize = 2, func = linear_tab[depth];
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );

    CV_Assert( func != 0 );

    AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
    int* xofs = (int*)(uchar*)_buffer;
    int* yofs = xofs + width;
    float* alpha = (float*)(yofs + dsize.height);
    short* ialpha = (short*)alpha;
    float* beta = alpha + width*ksize;
    short* ibeta = ialpha + width*ksize;
    float cbuf[MAX_ESIZE];

    for( dx = 0; dx < dsize.width; dx++ )
            fx = (float)((dx+0.5)*scale_x - 0.5);
            sx = cvFloor(dx*scale_x);
            fx = (float)((dx+1) - (sx+1)*inv_scale_x);
            fx = fx <= 0 ? 0.f : fx - cvFloor(fx);

        if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))

        if( sx + ksize2 >= src_width )
            xmax = std::min( xmax, dx );
            if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
                fx = 0, sx = src_width-1;

        for( k = 0, sx *= cn; k < cn; k++ )
            xofs[dx*cn + k] = sx + k;

        if( interpolation == INTER_CUBIC )
            interpolateCubic( fx, cbuf );
        else if( interpolation == INTER_LANCZOS4 )
            interpolateLanczos4( fx, cbuf );

            for( k = 0; k < ksize; k++ )
                ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
            for( ; k < cn*ksize; k++ )
                ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];

            for( k = 0; k < ksize; k++ )
                alpha[dx*cn*ksize + k] = cbuf[k];
            for( ; k < cn*ksize; k++ )
                alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];

    for( dy = 0; dy < dsize.height; dy++ )
            fy = (float)((dy+0.5)*scale_y - 0.5);
            sy = cvFloor(dy*scale_y);
            fy = (float)((dy+1) - (sy+1)*inv_scale_y);
            fy = fy <= 0 ? 0.f : fy - cvFloor(fy);

        if( interpolation == INTER_CUBIC )
            interpolateCubic( fy, cbuf );
        else if( interpolation == INTER_LANCZOS4 )
            interpolateLanczos4( fy, cbuf );

            for( k = 0; k < ksize; k++ )
                ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);

            for( k = 0; k < ksize; k++ )
                beta[dy*ksize + k] = cbuf[k];

    func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
          fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );

//==================================================================================================

void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
                 double inv_scale_x, double inv_scale_y, int interpolation )
    CV_INSTRUMENT_REGION()

    Size ssize = _src.size();

    CV_Assert( ssize.width > 0 && ssize.height > 0 );
    CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
    if( dsize.area() == 0 )
        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
                     saturate_cast<int>(ssize.height*inv_scale_y));
        CV_Assert( dsize.area() > 0 );

        inv_scale_x = (double)dsize.width/ssize.width;
        inv_scale_y = (double)dsize.height/ssize.height;

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))

    Mat src = _src.getMat();
    _dst.create(dsize, src.type());
    Mat dst = _dst.getMat();
    // Source and destination are of the same size; use a simple copy.
    hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
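// Typical use of the public wrapper above (an illustrative sketch; the input
// file name is hypothetical):
//
//     cv::Mat src = cv::imread("input.png");
//     cv::Mat half;
//     cv::resize(src, half, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
//
// Passing an empty Size() lets dsize be derived from the scale factors,
// exactly as in the dsize.area() == 0 branch above.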
/****************************************************************************************\
*                       General warping (affine, perspective, remap)                    *
\****************************************************************************************/

template<typename T>
static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                          int borderType, const Scalar& _borderValue )
    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
                    saturate_cast<T>(_borderValue[1]),
                    saturate_cast<T>(_borderValue[2]),
                    saturate_cast<T>(_borderValue[3]));

    unsigned width1 = ssize.width, height1 = ssize.height;

    if( _dst.isContinuous() && _xy.isContinuous() )
        dsize.width *= dsize.height;

    for( dy = 0; dy < dsize.height; dy++ )
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);

        for( dx = 0; dx < dsize.width; dx++ )
            int sx = XY[dx*2], sy = XY[dx*2+1];
            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                D[dx] = S0[sy*sstep + sx];

            if( borderType == BORDER_REPLICATE )
                sx = clip(sx, 0, ssize.width);
                sy = clip(sy, 0, ssize.height);
                D[dx] = S0[sy*sstep + sx];
            else if( borderType == BORDER_CONSTANT )
            else if( borderType != BORDER_TRANSPARENT )
                sx = borderInterpolate(sx, ssize.width, borderType);
                sy = borderInterpolate(sy, ssize.height, borderType);
                D[dx] = S0[sy*sstep + sx];

        for( dx = 0; dx < dsize.width; dx++, D += cn )
            int sx = XY[dx*2], sy = XY[dx*2+1], k;

            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                S = S0 + sy*sstep + sx*3;
                D[0] = S[0], D[1] = S[1], D[2] = S[2];
                S = S0 + sy*sstep + sx*4;
                D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
                S = S0 + sy*sstep + sx*cn;
                for( k = 0; k < cn; k++ )
            else if( borderType != BORDER_TRANSPARENT )
                if( borderType == BORDER_REPLICATE )
                    sx = clip(sx, 0, ssize.width);
                    sy = clip(sy, 0, ssize.height);
                    S = S0 + sy*sstep + sx*cn;
                else if( borderType == BORDER_CONSTANT )
                    sx = borderInterpolate(sx, ssize.width, borderType);
                    sy = borderInterpolate(sy, ssize.height, borderType);
                    S = S0 + sy*sstep + sx*cn;
                for( k = 0; k < cn; k++ )
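// remapNearest is reached through the public cv::remap API, e.g.
// (an illustrative sketch; the map contents are hypothetical):
//
//     cv::Mat mapx, mapy;   // CV_32FC1 coordinate maps, same size as dst
//     cv::remap(src, dst, mapx, mapy, cv::INTER_NEAREST,
//               cv::BORDER_CONSTANT, cv::Scalar::all(0));
//
// by which point the float maps have been converted to the packed short
// (x, y) pairs held in the _xy matrix used here.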
    int operator()( const Mat&, void*, const short*, const ushort*,
                    const void*, int ) const { return 0; }

    int operator()( const Mat& _src, void* _dst, const short* XY,
                    const ushort* FXY, const void* _wtab, int width ) const
        int cn = _src.channels(), x = 0, sstep = (int)_src.step;

        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||

        const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
        const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
        uchar* D = (uchar*)_dst;
        __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
        __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
        __m128i z = _mm_setzero_si128();
        int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];

            for( ; x <= width - 8; x += 8 )
                __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
                __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
                __m128i v0, v1, v2, v3, a0, a1, b0, b1;

                xy0 = _mm_madd_epi16( xy0, xy2ofs );
                xy1 = _mm_madd_epi16( xy1, xy2ofs );
                _mm_store_si128( (__m128i*)iofs0, xy0 );
                _mm_store_si128( (__m128i*)iofs1, xy1 );

                i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
                i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
                v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
                i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
                i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
                v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
                v0 = _mm_unpacklo_epi8(v0, z);
                v1 = _mm_unpacklo_epi8(v1, z);

                a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
                                        _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
                a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
                                        _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
                b0 = _mm_unpacklo_epi64(a0, a1);
                b1 = _mm_unpackhi_epi64(a0, a1);
                v0 = _mm_madd_epi16(v0, b0);
                v1 = _mm_madd_epi16(v1, b1);
                v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);

                i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
                i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
                v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
                i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
                i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
                v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
                v2 = _mm_unpacklo_epi8(v2, z);
                v3 = _mm_unpacklo_epi8(v3, z);

                a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
                                        _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
                a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
                                        _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
                b0 = _mm_unpacklo_epi64(a0, a1);
                b1 = _mm_unpackhi_epi64(a0, a1);
                v2 = _mm_madd_epi16(v2, b0);
                v3 = _mm_madd_epi16(v3, b1);
                v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);

                v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
                v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
                v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
                _mm_storel_epi64( (__m128i*)(D + x), v0 );

            for( ; x <= width - 5; x += 4, D += 12 )
                __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
                __m128i u0, v0, u1, v1;

                xy0 = _mm_madd_epi16( xy0, xy2ofs );
                _mm_store_si128( (__m128i*)iofs0, xy0 );
                const __m128i *w0, *w1;
                w0 = (const __m128i*)(wtab + FXY[x]*16);
                w1 = (const __m128i*)(wtab + FXY[x+1]*16);

                u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
                v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
                u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
                v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
                u0 = _mm_unpacklo_epi8(u0, z);
                v0 = _mm_unpacklo_epi8(v0, z);
                u1 = _mm_unpacklo_epi8(u1, z);
                v1 = _mm_unpacklo_epi8(v1, z);
                u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
                u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
                u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
                u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
                u0 = _mm_slli_si128(u0, 4);
                u0 = _mm_packs_epi32(u0, u1);
                u0 = _mm_packus_epi16(u0, u0);
                _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));

                w0 = (const __m128i*)(wtab + FXY[x+2]*16);
                w1 = (const __m128i*)(wtab + FXY[x+3]*16);

                u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
                v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
                u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
                v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
                u0 = _mm_unpacklo_epi8(u0, z);
                v0 = _mm_unpacklo_epi8(v0, z);
                u1 = _mm_unpacklo_epi8(u1, z);
                v1 = _mm_unpacklo_epi8(v1, z);
                u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
                u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
                u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
                u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
                u0 = _mm_slli_si128(u0, 4);
                u0 = _mm_packs_epi32(u0, u1);
                u0 = _mm_packus_epi16(u0, u0);
                _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));

            for( ; x <= width - 4; x += 4, D += 16 )
                __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
                __m128i u0, v0, u1, v1;

                xy0 = _mm_madd_epi16( xy0, xy2ofs );
                _mm_store_si128( (__m128i*)iofs0, xy0 );
                const __m128i *w0, *w1;
                w0 = (const __m128i*)(wtab + FXY[x]*16);
                w1 = (const __m128i*)(wtab + FXY[x+1]*16);

                u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
                v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
                u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
                v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
                u0 = _mm_unpacklo_epi8(u0, z);
                v0 = _mm_unpacklo_epi8(v0, z);
                u1 = _mm_unpacklo_epi8(u1, z);
                v1 = _mm_unpacklo_epi8(v1, z);
                u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
                u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
                u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
                u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
                u0 = _mm_packs_epi32(u0, u1);
                u0 = _mm_packus_epi16(u0, u0);
                _mm_storel_epi64((__m128i*)D, u0);

                w0 = (const __m128i*)(wtab + FXY[x+2]*16);
                w1 = (const __m128i*)(wtab + FXY[x+3]*16);

                u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
                v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
                u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
                                       _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
                v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
                                       _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
                u0 = _mm_unpacklo_epi8(u0, z);
                v0 = _mm_unpacklo_epi8(v0, z);
                u1 = _mm_unpacklo_epi8(u1, z);
                v1 = _mm_unpacklo_epi8(v1, z);
                u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
                u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
                u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
                u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
                u0 = _mm_packs_epi32(u0, u1);
                u0 = _mm_packus_epi16(u0, u0);
                _mm_storel_epi64((__m128i*)(D + 8), u0);

typedef RemapNoVec RemapVec_8u;
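// Weight-table layout used by remapBilinear below: for each quantized
// fractional position (fx, fy), wtab holds the four bilinear weights
//     w[0] = (1-fx)*(1-fy), w[1] = fx*(1-fy),
//     w[2] = (1-fx)*fy,     w[3] = fx*fy,
// stored either as floats or as shorts scaled by INTER_REMAP_COEF_SCALE,
// so that for the four source taps S00 (top-left), S01 (top-right),
// S10 (bottom-left) and S11 (bottom-right),
//     dst = w[0]*S00 + w[1]*S01 + w[2]*S10 + w[3]*S11.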
3845 template<class CastOp, class VecOp, typename AT>
3846 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
3847 const Mat& _fxy, const void* _wtab,
3848 int borderType, const Scalar& _borderValue )
3850 typedef typename CastOp::rtype T;
3851 typedef typename CastOp::type1 WT;
3852 Size ssize = _src.size(), dsize = _dst.size();
3853 int k, cn = _src.channels();
3854 const AT* wtab = (const AT*)_wtab;
3855 const T* S0 = _src.ptr<T>();
3856 size_t sstep = _src.step/sizeof(S0[0]);
3862 for( k = 0; k < cn; k++ )
3863 cval[k] = saturate_cast<T>(_borderValue[k & 3]);
3865 unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
3866 CV_Assert( ssize.area() > 0 );
3868 if( _src.type() == CV_8UC3 )
3869 width1 = std::max(ssize.width-2, 0);
3872 for( dy = 0; dy < dsize.height; dy++ )
3874 T* D = _dst.ptr<T>(dy);
3875 const short* XY = _xy.ptr<short>(dy);
3876 const ushort* FXY = _fxy.ptr<ushort>(dy);
3878 bool prevInlier = false;
3880 for( dx = 0; dx <= dsize.width; dx++ )
3882 bool curInlier = dx < dsize.width ?
3883 (unsigned)XY[dx*2] < width1 &&
3884 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
3885 if( curInlier == prevInlier )
3891 prevInlier = curInlier;
3895 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
3901 for( ; dx < X1; dx++, D++ )
3903 int sx = XY[dx*2], sy = XY[dx*2+1];
3904 const AT* w = wtab + FXY[dx]*4;
3905 const T* S = S0 + sy*sstep + sx;
3906 *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
3910 for( ; dx < X1; dx++, D += 2 )
3912 int sx = XY[dx*2], sy = XY[dx*2+1];
3913 const AT* w = wtab + FXY[dx]*4;
3914 const T* S = S0 + sy*sstep + sx*2;
3915 WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
3916 WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
3917 D[0] = castOp(t0); D[1] = castOp(t1);
3920 for( ; dx < X1; dx++, D += 3 )
3922 int sx = XY[dx*2], sy = XY[dx*2+1];
3923 const AT* w = wtab + FXY[dx]*4;
3924 const T* S = S0 + sy*sstep + sx*3;
3925 WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
3926 WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
3927 WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
3928 D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
3931 for( ; dx < X1; dx++, D += 4 )
3933 int sx = XY[dx*2], sy = XY[dx*2+1];
3934 const AT* w = wtab + FXY[dx]*4;
3935 const T* S = S0 + sy*sstep + sx*4;
3936 WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
3937 WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
3938 D[0] = castOp(t0); D[1] = castOp(t1);
3939 t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
3940 t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
3941 D[2] = castOp(t0); D[3] = castOp(t1);
3944 for( ; dx < X1; dx++, D += cn )
3946 int sx = XY[dx*2], sy = XY[dx*2+1];
3947 const AT* w = wtab + FXY[dx]*4;
3948 const T* S = S0 + sy*sstep + sx*cn;
3949 for( k = 0; k < cn; k++ )
3951 WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
3958 if( borderType == BORDER_TRANSPARENT && cn != 3 )
3966 for( ; dx < X1; dx++, D++ )
3968 int sx = XY[dx*2], sy = XY[dx*2+1];
3969 if( borderType == BORDER_CONSTANT &&
3970 (sx >= ssize.width || sx+1 < 0 ||
3971 sy >= ssize.height || sy+1 < 0) )
3977 int sx0, sx1, sy0, sy1;
3979 const AT* w = wtab + FXY[dx]*4;
3980 if( borderType == BORDER_REPLICATE )
3982 sx0 = clip(sx, 0, ssize.width);
3983 sx1 = clip(sx+1, 0, ssize.width);
3984 sy0 = clip(sy, 0, ssize.height);
3985 sy1 = clip(sy+1, 0, ssize.height);
3986 v0 = S0[sy0*sstep + sx0];
3987 v1 = S0[sy0*sstep + sx1];
3988 v2 = S0[sy1*sstep + sx0];
3989 v3 = S0[sy1*sstep + sx1];
3993 sx0 = borderInterpolate(sx, ssize.width, borderType);
3994 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
3995 sy0 = borderInterpolate(sy, ssize.height, borderType);
3996 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
3997 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
3998 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
3999 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
4000 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
4002 D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
4006 for( ; dx < X1; dx++, D += cn )
4008 int sx = XY[dx*2], sy = XY[dx*2+1];
4009 if( borderType == BORDER_CONSTANT &&
4010 (sx >= ssize.width || sx+1 < 0 ||
4011 sy >= ssize.height || sy+1 < 0) )
4013 for( k = 0; k < cn; k++ )
4018 int sx0, sx1, sy0, sy1;
4019 const T *v0, *v1, *v2, *v3;
4020 const AT* w = wtab + FXY[dx]*4;
4021 if( borderType == BORDER_REPLICATE )
4023 sx0 = clip(sx, 0, ssize.width);
4024 sx1 = clip(sx+1, 0, ssize.width);
4025 sy0 = clip(sy, 0, ssize.height);
4026 sy1 = clip(sy+1, 0, ssize.height);
4027 v0 = S0 + sy0*sstep + sx0*cn;
4028 v1 = S0 + sy0*sstep + sx1*cn;
4029 v2 = S0 + sy1*sstep + sx0*cn;
4030 v3 = S0 + sy1*sstep + sx1*cn;
4032 else if( borderType == BORDER_TRANSPARENT &&
4033 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
4034 (unsigned)sy >= (unsigned)(ssize.height-1)))
4038 sx0 = borderInterpolate(sx, ssize.width, borderType);
4039 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
4040 sy0 = borderInterpolate(sy, ssize.height, borderType);
4041 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
4042 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
4043 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
4044 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
4045 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
4047 for( k = 0; k < cn; k++ )
4048 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
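/*
    remapBicubic: 4x4-tap cubic remap. In the border branch below the sum is
    formed as

        sum = cval*ONE + sum_i (S_i - cval)*w_i

    which works because the 16 weights add up to ONE by construction (the
    fixed-point scale for 8-bit data, 1 for float); taps that fall outside a
    BORDER_CONSTANT image simply keep the border value's contribution.
*/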
4057 template<class CastOp, typename AT, int ONE>
4058 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
4059 const Mat& _fxy, const void* _wtab,
4060 int borderType, const Scalar& _borderValue )
4062 typedef typename CastOp::rtype T;
4063 typedef typename CastOp::type1 WT;
4064 Size ssize = _src.size(), dsize = _dst.size();
4065 int cn = _src.channels();
4066 const AT* wtab = (const AT*)_wtab;
4067 const T* S0 = _src.ptr<T>();
4068 size_t sstep = _src.step/sizeof(S0[0]);
4069 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
4070 saturate_cast<T>(_borderValue[1]),
4071 saturate_cast<T>(_borderValue[2]),
4072 saturate_cast<T>(_borderValue[3]));
4075 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
4077 unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
4079 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
4081 dsize.width *= dsize.height;
4085 for( dy = 0; dy < dsize.height; dy++ )
4087 T* D = _dst.ptr<T>(dy);
4088 const short* XY = _xy.ptr<short>(dy);
4089 const ushort* FXY = _fxy.ptr<ushort>(dy);
4091 for( dx = 0; dx < dsize.width; dx++, D += cn )
4093 int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
4094 const AT* w = wtab + FXY[dx]*16;
4096 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
4098 const T* S = S0 + sy*sstep + sx*cn;
4099 for( k = 0; k < cn; k++ )
4101 WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
4103 sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
4105 sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
4107 sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
4115 if( borderType == BORDER_TRANSPARENT &&
4116 ((unsigned)(sx+1) >= (unsigned)ssize.width ||
4117 (unsigned)(sy+1) >= (unsigned)ssize.height) )
4120 if( borderType1 == BORDER_CONSTANT &&
4121 (sx >= ssize.width || sx+4 <= 0 ||
4122 sy >= ssize.height || sy+4 <= 0))
4124 for( k = 0; k < cn; k++ )
4129 for( i = 0; i < 4; i++ )
4131 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
4132 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
4135 for( k = 0; k < cn; k++, S0++, w -= 16 )
4137 WT cv = cval[k], sum = cv*ONE;
4138 for( i = 0; i < 4; i++, w += 4 )
4141 const T* S = S0 + yi*sstep;
4145 sum += (S[x[0]] - cv)*w[0];
4147 sum += (S[x[1]] - cv)*w[1];
4149 sum += (S[x[2]] - cv)*w[2];
4151 sum += (S[x[3]] - cv)*w[3];
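/*
    remapLanczos4: 8x8-tap remap using the Lanczos window with a = 4,

        L(t) = sinc(t)*sinc(t/4) for |t| < 4, 0 otherwise,

    tabulated at INTER_TAB_SIZE^2 fractional positions with 64 weights each
    (hence wtab + FXY[dx]*64). Border handling follows remapBicubic above.
*/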
4162 template<class CastOp, typename AT, int ONE>
4163 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
4164 const Mat& _fxy, const void* _wtab,
4165 int borderType, const Scalar& _borderValue )
4167 typedef typename CastOp::rtype T;
4168 typedef typename CastOp::type1 WT;
4169 Size ssize = _src.size(), dsize = _dst.size();
4170 int cn = _src.channels();
4171 const AT* wtab = (const AT*)_wtab;
4172 const T* S0 = _src.ptr<T>();
4173 size_t sstep = _src.step/sizeof(S0[0]);
4174 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
4175 saturate_cast<T>(_borderValue[1]),
4176 saturate_cast<T>(_borderValue[2]),
4177 saturate_cast<T>(_borderValue[3]));
4180 int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
4182 unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
4184 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
4186 dsize.width *= dsize.height;
4190 for( dy = 0; dy < dsize.height; dy++ )
4192 T* D = _dst.ptr<T>(dy);
4193 const short* XY = _xy.ptr<short>(dy);
4194 const ushort* FXY = _fxy.ptr<ushort>(dy);
4196 for( dx = 0; dx < dsize.width; dx++, D += cn )
4198 int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
4199 const AT* w = wtab + FXY[dx]*64;
4200 const T* S = S0 + sy*sstep + sx*cn;
4202 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
4204 for( k = 0; k < cn; k++ )
4207 for( int r = 0; r < 8; r++, S += sstep, w += 8 )
4208 sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
4209 S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
4218 if( borderType == BORDER_TRANSPARENT &&
4219 ((unsigned)(sx+3) >= (unsigned)ssize.width ||
4220 (unsigned)(sy+3) >= (unsigned)ssize.height) )
4223 if( borderType1 == BORDER_CONSTANT &&
4224 (sx >= ssize.width || sx+8 <= 0 ||
4225 sy >= ssize.height || sy+8 <= 0))
4227 for( k = 0; k < cn; k++ )
4232 for( i = 0; i < 8; i++ )
4234 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
4235 y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
4238 for( k = 0; k < cn; k++, S0++, w -= 64 )
4240 WT cv = cval[k], sum = cv*ONE;
4241 for( i = 0; i < 8; i++, w += 8 )
4244 const T* S1 = S0 + yi*sstep;
4248 sum += (S1[x[0]] - cv)*w[0];
4250 sum += (S1[x[1]] - cv)*w[1];
4252 sum += (S1[x[2]] - cv)*w[2];
4254 sum += (S1[x[3]] - cv)*w[3];
4256 sum += (S1[x[4]] - cv)*w[4];
4258 sum += (S1[x[5]] - cv)*w[5];
4260 sum += (S1[x[6]] - cv)*w[6];
4262 sum += (S1[x[7]] - cv)*w[7];
4273 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
4274 int borderType, const Scalar& _borderValue );
4276 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
4277 const Mat& _fxy, const void* _wtab,
4278 int borderType, const Scalar& _borderValue);
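/*
    RemapInvoker walks the destination in tiles so the per-tile coordinate
    buffers stay cache-resident: buf_size = 1 << 14 elements split into
    brows0 x bcols0 blocks, which for a destination of 128x128 or larger works
    out to 128x128 tiles. For each tile the incoming maps are converted to the
    packed CV_16SC2 (+ CV_16UC1 fractions) layout and handed to the
    nearest-neighbor or interpolating kernel.
*/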
4280 class RemapInvoker :
4281 public ParallelLoopBody
4284 RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
4285 const Mat *_m2, int _borderType, const Scalar &_borderValue,
4286 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
4287 ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
4288 borderType(_borderType), borderValue(_borderValue),
4289 planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
4293 virtual void operator() (const Range& range) const
4296 const int buf_size = 1 << 14;
4297 int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
4298 int bcols0 = std::min(buf_size/brows0, dst->cols);
4299 brows0 = std::min(buf_size/bcols0, dst->rows);
4301 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
4304 Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
4306 _bufa.create(brows0, bcols0, CV_16UC1);
4308 for( y = range.start; y < range.end; y += brows0 )
4310 for( x = 0; x < dst->cols; x += bcols0 )
4312 int brows = std::min(brows0, range.end - y);
4313 int bcols = std::min(bcols0, dst->cols - x);
4314 Mat dpart(*dst, Rect(x, y, bcols, brows));
4315 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
4319 if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
4320 bufxy = (*m1)(Rect(x, y, bcols, brows));
4321 else if( map_depth != CV_32F )
4323 for( y1 = 0; y1 < brows; y1++ )
4325 short* XY = bufxy.ptr<short>(y1);
4326 const short* sXY = m1->ptr<short>(y+y1) + x*2;
4327 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4329 for( x1 = 0; x1 < bcols; x1++ )
4331 int a = sA[x1] & (INTER_TAB_SIZE2-1);
4332 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
4333 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
4337 else if( !planar_input )
4338 (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
4341 for( y1 = 0; y1 < brows; y1++ )
4343 short* XY = bufxy.ptr<short>(y1);
4344 const float* sX = m1->ptr<float>(y+y1) + x;
4345 const float* sY = m2->ptr<float>(y+y1) + x;
4351 for( ; x1 <= bcols - 8; x1 += 8 )
4353 __m128 fx0 = _mm_loadu_ps(sX + x1);
4354 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4355 __m128 fy0 = _mm_loadu_ps(sY + x1);
4356 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4357 __m128i ix0 = _mm_cvtps_epi32(fx0);
4358 __m128i ix1 = _mm_cvtps_epi32(fx1);
4359 __m128i iy0 = _mm_cvtps_epi32(fy0);
4360 __m128i iy1 = _mm_cvtps_epi32(fy1);
4361 ix0 = _mm_packs_epi32(ix0, ix1);
4362 iy0 = _mm_packs_epi32(iy0, iy1);
4363 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4364 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4365 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4366 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4371 for( ; x1 < bcols; x1++ )
4373 XY[x1*2] = saturate_cast<short>(sX[x1]);
4374 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
4378 nnfunc( *src, dpart, bufxy, borderType, borderValue );
4382 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
4383 for( y1 = 0; y1 < brows; y1++ )
4385 short* XY = bufxy.ptr<short>(y1);
4386 ushort* A = bufa.ptr<ushort>(y1);
4388 if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
4390 bufxy = (*m1)(Rect(x, y, bcols, brows));
4392 const ushort* sA = m2->ptr<ushort>(y+y1) + x;
4396 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
4397 for ( ; x1 <= bcols - 8; x1 += 8)
4398 vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
4400 __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
4401 for ( ; x1 <= bcols - 8; x1 += 8)
4402 _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
4405 for( ; x1 < bcols; x1++ )
4406 A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
4408 else if( planar_input )
4410 const float* sX = m1->ptr<float>(y+y1) + x;
4411 const float* sY = m2->ptr<float>(y+y1) + x;
4417 __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
4418 __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
4419 for( ; x1 <= bcols - 8; x1 += 8 )
4421 __m128 fx0 = _mm_loadu_ps(sX + x1);
4422 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
4423 __m128 fy0 = _mm_loadu_ps(sY + x1);
4424 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
4425 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
4426 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
4427 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
4428 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
4429 __m128i mx0 = _mm_and_si128(ix0, mask);
4430 __m128i mx1 = _mm_and_si128(ix1, mask);
4431 __m128i my0 = _mm_and_si128(iy0, mask);
4432 __m128i my1 = _mm_and_si128(iy1, mask);
4433 mx0 = _mm_packs_epi32(mx0, mx1);
4434 my0 = _mm_packs_epi32(my0, my1);
4435 my0 = _mm_slli_epi16(my0, INTER_BITS);
4436 mx0 = _mm_or_si128(mx0, my0);
4437 _mm_storeu_si128((__m128i*)(A + x1), mx0);
4438 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
4439 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
4440 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
4441 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
4442 ix0 = _mm_packs_epi32(ix0, ix1);
4443 iy0 = _mm_packs_epi32(iy0, iy1);
4444 ix1 = _mm_unpacklo_epi16(ix0, iy0);
4445 iy1 = _mm_unpackhi_epi16(ix0, iy0);
4446 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
4447 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
4451 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
4452 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4454 for( ; x1 <= bcols - 4; x1 += 4 )
4456 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
4457 v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
4458 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4459 vandq_s32(v_sy, v_scale2));
4460 vst1_u16(A + x1, vqmovun_s32(v_v));
4462 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4463 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4464 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4468 for( ; x1 < bcols; x1++ )
4470 int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
4471 int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
4472 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4473 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4474 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4480 const float* sXY = m1->ptr<float>(y+y1) + x*2;
4484 float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
4485 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
4487 for( ; x1 <= bcols - 4; x1 += 4 )
4489 float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
4490 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
4491 int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
4492 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
4493 vandq_s32(v_sy, v_scale2));
4494 vst1_u16(A + x1, vqmovun_s32(v_v));
4496 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
4497 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
4498 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
4502 for( x1 = 0; x1 < bcols; x1++ )
4504 int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
4505 int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
4506 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
4507 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
4508 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
4513 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
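/*
    ocl_remap: OpenCL backend for remap(). The kernel name encodes the map
    layout: "remap_32FC2" for a single two-channel float map, "remap_16SC2"
    (plus "_16UC1" when a fractional-part map is supplied) for fixed-point
    maps, and "remap_2_32FC1" for two planar float maps; interpolation and
    border mode are baked in through -D build options.
*/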
4532 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
4533 int interpolation, int borderType, const Scalar& borderValue)
4535 const ocl::Device & dev = ocl::Device::getDefault();
4536 int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
4537 rowsPerWI = dev.isIntel() ? 4 : 1;
4539 if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
4540 || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
4543 UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
4545 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
4546 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
4548 if (map1.type() != CV_16SC2)
4549 std::swap(map1, map2);
4552 CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4554 _dst.create(map1.size(), type);
4555 UMat dst = _dst.getUMat();
4557 String kernelName = "remap";
4558 if (map1.type() == CV_32FC2 && map2.empty())
4559 kernelName += "_32FC2";
4560 else if (map1.type() == CV_16SC2)
4562 kernelName += "_16SC2";
4564 kernelName += "_16UC1";
4566 else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
4567 kernelName += "_2_32FC1";
4569 CV_Error(Error::StsBadArg, "Unsupported map types");
4571 static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
4572 static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
4573 "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
4574 String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
4575 interMap[interpolation], borderMap[borderType],
4576 ocl::typeToStr(type), rowsPerWI);
4578 if (interpolation != INTER_NEAREST)
4581 int wdepth = std::max(CV_32F, depth);
4582 buildOptions = buildOptions
4583 + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
4584 " -D convertToWT2=%s -D WT2=%s",
4585 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
4586 ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
4587 ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
4588 ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
4589 ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
4591 int scalarcn = cn == 3 ? 4 : cn;
4592 int sctype = CV_MAKETYPE(depth, scalarcn);
4593 buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
4594 ocl::typeToStr(type), ocl::typeToStr(depth),
4595 cn, ocl::typeToStr(sctype), depth);
4597 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
4599 Mat scalar(1, 1, sctype, borderValue);
4600 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
4601 map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
4602 scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
4605 k.args(srcarg, dstarg, map1arg, scalararg);
4607 k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
4609 size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
4610 return k.run(2, globalThreads, NULL, false);
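/*
    A minimal usage sketch (illustrative only, not compiled into the library;
    the function name is hypothetical): passing UMat-backed arguments lets the
    CV_OCL_RUN dispatch in cv::remap() try this OpenCL path first.
*/
#if 0
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
static void example_ocl_remap(const cv::UMat& src)
{
    cv::UMat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1), dst;
    // ... fill mapx/mapy with source coordinates here ...
    cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR,
              cv::BORDER_CONSTANT, cv::Scalar::all(0));
}
#endif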
4613 static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
4614 Point2f center, double maxRadius, int flags)
UMat src_with_border; // must stay alive until the remap() below: 'src' is re-pointed at this buffer's data
4618 UMat mapx, mapy, r, cp_sp;
4619 UMat src = _src.getUMat();
4620 _dst.create(src.size(), src.type());
4621 Size dsize = src.size();
4622 r.create(Size(1, dsize.width), CV_32F);
4623 cp_sp.create(Size(1, dsize.height), CV_32FC2);
4625 mapx.create(dsize, CV_32F);
4626 mapy.create(dsize, CV_32F);
4627 size_t w = dsize.width;
4628 size_t h = dsize.height;
4629 String buildOptions;
4630 unsigned mem_size = 32;
4631 if (flags & CV_WARP_INVERSE_MAP)
4633 buildOptions = "-D InverseMap";
4637 buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
4640 ocl::Program p(ocl::imgproc::linearPolar_oclsrc, buildOptions, retval);
4641 ocl::Kernel k("linearPolar", p);
4642 ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
4643 ocl::KernelArg ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
4644 ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
4646 if (!(flags & CV_WARP_INVERSE_MAP))
4651 ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
4652 float PI2_height = (float) CV_2PI / dsize.height;
4653 float maxRadius_width = (float) maxRadius / dsize.width;
4654 computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, maxRadius_width, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
4655 size_t max_dim = max(h, w);
4656 computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
4657 k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
4661 const int ANGLE_BORDER = 1;
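// The rows of the polar image are periodic in angle, so wrap one row of
// padding onto each side: interpolation across the 0/2*pi seam then samples
// the opposite edge instead of uninitialized data.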
4663 cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
4664 src = src_with_border;
4665 Size ssize = src_with_border.size();
4666 ssize.height -= 2 * ANGLE_BORDER;
4667 float ascale = ssize.height / ((float)CV_2PI);
4668 float pscale = ssize.width / ((float) maxRadius);
4670 k.args(ocl_mapx, ocl_mapy, ascale, pscale, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
4674 size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
4675 size_t localThreads[2] = { mem_size , mem_size };
4676 k.run(2, globalThreads, localThreads, false);
4677 remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
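/*
    ocl_logPolar: same structure as ocl_linearPolar above, with the radius
    warped logarithmically. A polar pixel (phi, rho) corresponds to the
    Cartesian point

        r = exp(rho / M), theta = phi * 2*pi / dsize.height,
        x = center.x + r*cos(theta), y = center.y + r*sin(theta).
*/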
4680 static bool ocl_logPolar(InputArray _src, OutputArray _dst,
4681 Point2f center, double M, int flags)
if (M <= 0)
    CV_Error(CV_StsOutOfRange, "M should be > 0");
UMat src_with_border; // must stay alive until the remap() below: 'src' is re-pointed at this buffer's data
4687 UMat mapx, mapy, r, cp_sp;
4688 UMat src = _src.getUMat();
4689 _dst.create(src.size(), src.type());
4690 Size dsize = src.size();
4691 r.create(Size(1, dsize.width), CV_32F);
4692 cp_sp.create(Size(1, dsize.height), CV_32FC2);
4694 mapx.create(dsize, CV_32F);
4695 mapy.create(dsize, CV_32F);
4696 size_t w = dsize.width;
4697 size_t h = dsize.height;
4698 String buildOptions;
4699 unsigned mem_size = 32;
4700 if (flags & CV_WARP_INVERSE_MAP)
4702 buildOptions = "-D InverseMap";
4706 buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
4709 ocl::Program p(ocl::imgproc::logPolar_oclsrc, buildOptions, retval);
4712 ocl::Kernel k("logPolar", p);
4713 ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
4714 ocl::KernelArg ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
4715 ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
4717 if (!(flags & CV_WARP_INVERSE_MAP))
4722 ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
4723 float PI2_height = (float) CV_2PI / dsize.height;
4725 computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, (float)M, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
4726 size_t max_dim = max(h, w);
4727 computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
4728 k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
4732 const int ANGLE_BORDER = 1;
4734 cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
4735 src = src_with_border;
4736 Size ssize = src_with_border.size();
4737 ssize.height -= 2 * ANGLE_BORDER;
4738 float ascale = ssize.height / ((float)CV_2PI);
4741 k.args(ocl_mapx, ocl_mapy, ascale, (float)M, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
4745 size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
4746 size_t localThreads[2] = { mem_size , mem_size };
4747 k.run(2, globalThreads, localThreads, false);
4748 remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
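/*
    A minimal usage sketch (illustrative only, not compiled; the function name
    is hypothetical) of the public wrappers that reach these kernels:
*/
#if 0
#include <algorithm>
#include <opencv2/imgproc.hpp>
static void example_polar(const cv::Mat& src)
{
    cv::Mat lin, lg;
    cv::Point2f c(src.cols * 0.5f, src.rows * 0.5f);
    double maxR = std::min(src.cols, src.rows) * 0.5;
    cv::linearPolar(src, lin, c, maxR, cv::INTER_LINEAR);
    cv::logPolar(src, lg, c, 40.0, cv::INTER_LINEAR + cv::WARP_FILL_OUTLIERS);
}
#endif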
4753 #if defined HAVE_IPP && IPP_DISABLE_BLOCK
4755 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
4756 const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
4757 void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
4759 class IPPRemapInvoker :
4760 public ParallelLoopBody
4763 IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
4764 int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
4765 ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
4766 ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
4771 virtual void operator() (const Range & range) const
4773 IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
4774 Mat dstRoi = dst.rowRange(range);
4775 IppiSize dstRoiSize = ippiSize(dstRoi.size());
4776 int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4778 if (borderType == BORDER_CONSTANT &&
4779 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
4785 if (CV_INSTRUMENT_FUN_PTR_CALL_IPP(ippFunc,(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
4786 map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
4787 dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation)) < 0)
4791 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4796 Mat & src, & dst, & map1, & map2;
4798 int ippInterpolation, borderType;
4807 void cv::remap( InputArray _src, OutputArray _dst,
4808 InputArray _map1, InputArray _map2,
4809 int interpolation, int borderType, const Scalar& borderValue )
4811 CV_INSTRUMENT_REGION()
4813 static RemapNNFunc nn_tab[] =
4815 remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
4816 remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
4819 static RemapFunc linear_tab[] =
4821 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
4822 remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
4823 remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
4824 remapBilinear<Cast<float, float>, RemapNoVec, float>,
4825 remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
4828 static RemapFunc cubic_tab[] =
4830 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4831 remapBicubic<Cast<float, ushort>, float, 1>,
4832 remapBicubic<Cast<float, short>, float, 1>, 0,
4833 remapBicubic<Cast<float, float>, float, 1>,
4834 remapBicubic<Cast<double, double>, float, 1>, 0
4837 static RemapFunc lanczos4_tab[] =
4839 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
4840 remapLanczos4<Cast<float, ushort>, float, 1>,
4841 remapLanczos4<Cast<float, short>, float, 1>, 0,
4842 remapLanczos4<Cast<float, float>, float, 1>,
4843 remapLanczos4<Cast<double, double>, float, 1>, 0
4846 CV_Assert( _map1.size().area() > 0 );
4847 CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
4849 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4850 ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
4852 Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
4853 _dst.create( map1.size(), src.type() );
4854 Mat dst = _dst.getMat();
4855 CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );
if( dst.data == src.data )
    src = src.clone();  // remap cannot run in place
4860 if( interpolation == INTER_AREA )
4861 interpolation = INTER_LINEAR;
4863 int type = src.type(), depth = CV_MAT_DEPTH(type);
4865 #if defined HAVE_IPP && IPP_DISABLE_BLOCK
4868 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
4869 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
4870 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
4872 int ippInterpolation =
4873 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
4874 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
4877 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
4878 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
4879 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
4880 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
4881 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
4882 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
4883 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
4884 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
4885 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
4890 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
4891 borderType, borderValue, &ok);
4892 Range range(0, dst.rows);
4893 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
4897 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
4900 setIppErrorStatus();
4906 RemapNNFunc nnfunc = 0;
4907 RemapFunc ifunc = 0;
4908 const void* ctab = 0;
4909 bool fixpt = depth == CV_8U;
4910 bool planar_input = false;
4912 if( interpolation == INTER_NEAREST )
4914 nnfunc = nn_tab[depth];
4915 CV_Assert( nnfunc != 0 );
4919 if( interpolation == INTER_LINEAR )
4920 ifunc = linear_tab[depth];
4921 else if( interpolation == INTER_CUBIC )
4922 ifunc = cubic_tab[depth];
4923 else if( interpolation == INTER_LANCZOS4 )
4924 ifunc = lanczos4_tab[depth];
4926 CV_Error( CV_StsBadArg, "Unknown interpolation method" );
4927 CV_Assert( ifunc != 0 );
4928 ctab = initInterTab2D( interpolation, fixpt );
4931 const Mat *m1 = &map1, *m2 = &map2;
4933 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
4934 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
4936 if( map1.type() != CV_16SC2 )
4941 CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
4942 (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
4943 planar_input = map1.channels() == 1;
4946 RemapInvoker invoker(src, dst, m1, m2,
4947 borderType, borderValue, planar_input, nnfunc, ifunc,
4949 parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
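/*
    A minimal sketch (illustrative only, not compiled; the function name is
    hypothetical): converting float maps once with convertMaps() and remapping
    with the packed CV_16SC2/CV_16UC1 pair skips the per-call float to
    fixed-point conversion performed by RemapInvoker above.
*/
#if 0
#include <opencv2/imgproc.hpp>
static void example_precomputed_remap(const cv::Mat& src,
                                      const cv::Mat& mapx, const cv::Mat& mapy) // CV_32FC1 each
{
    cv::Mat map1, map2, dst;
    cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
    cv::remap(src, dst, map1, map2, cv::INTER_LINEAR);
}
#endif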
4953 void cv::convertMaps( InputArray _map1, InputArray _map2,
4954 OutputArray _dstmap1, OutputArray _dstmap2,
4955 int dstm1type, bool nninterpolate )
4957 CV_INSTRUMENT_REGION()
4959 Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
4960 Size size = map1.size();
4961 const Mat *m1 = &map1, *m2 = &map2;
4962 int m1type = m1->type(), m2type = m2->type();
4964 CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
4965 (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
4966 (m1type == CV_32FC1 && m2type == CV_32FC1) ||
4967 (m1type == CV_32FC2 && m2->empty()) );
4969 if( m2type == CV_16SC2 )
4971 std::swap( m1, m2 );
4972 std::swap( m1type, m2type );
4975 if( dstm1type <= 0 )
4976 dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
4977 CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
4978 _dstmap1.create( size, dstm1type );
4979 dstmap1 = _dstmap1.getMat();
4981 if( !nninterpolate && dstm1type != CV_32FC2 )
4983 _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
4984 dstmap2 = _dstmap2.getMat();
4989 if( m1type == dstm1type || (nninterpolate &&
4990 ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
4991 (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
4993 m1->convertTo( dstmap1, dstmap1.type() );
4994 if( !dstmap2.empty() && dstmap2.type() == m2->type() )
4995 m2->copyTo( dstmap2 );
4999 if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
5001 Mat vdata[] = { *m1, *m2 };
5002 merge( vdata, 2, dstmap1 );
5006 if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
Mat mv[] = { dstmap1, dstmap2 };
split( *m1, mv );
5013 if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
5014 dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
5016 size.width *= size.height;
5021 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
5024 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5027 const float scale = 1.f/INTER_TAB_SIZE;
5029 for( y = 0; y < size.height; y++ )
5031 const float* src1f = m1->ptr<float>(y);
5032 const float* src2f = m2->ptr<float>(y);
5033 const short* src1 = (const short*)src1f;
5034 const ushort* src2 = (const ushort*)src2f;
5036 float* dst1f = dstmap1.ptr<float>(y);
5037 float* dst2f = dstmap2.ptr<float>(y);
5038 short* dst1 = (short*)dst1f;
5039 ushort* dst2 = (ushort*)dst2f;
5042 if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
5047 for( ; x <= size.width - 8; x += 8 )
5050 v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
5051 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
5052 v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
5053 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
5055 vst2q_s16(dst1 + (x << 1), v_dst);
5060 for( ; x <= size.width - 16; x += 16 )
5062 __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
5063 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
5064 __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
5065 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
5067 __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
5068 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
5069 __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
5070 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
5072 _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
5074 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
5075 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
5076 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
5077 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
5081 for( ; x < size.width; x++ )
5083 dst1[x*2] = saturate_cast<short>(src1f[x]);
5084 dst1[x*2+1] = saturate_cast<short>(src2f[x]);
5090 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
5091 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5093 for( ; x <= size.width - 8; x += 8 )
5095 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
5096 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
5097 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
5098 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
5101 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
5102 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
5103 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
5104 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
5106 vst2q_s16(dst1 + (x << 1), v_dst);
5108 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
5109 vandq_s32(v_ix0, v_mask)));
5110 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
5111 vandq_s32(v_ix1, v_mask)));
5112 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
5117 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
5118 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
5120 for( ; x <= size.width - 16; x += 16 )
5122 __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
5123 __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
5124 __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
5125 __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
5127 __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
5128 _mm_srai_epi32(v_ix1, INTER_BITS));
5129 __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
5130 _mm_srai_epi32(v_iy1, INTER_BITS));
5131 __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
5132 _mm_and_si128(v_ix0, v_its1));
5133 __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
5134 _mm_and_si128(v_ix1, v_its1));
5135 _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
5137 v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
5138 v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
5139 v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
5140 v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
5142 __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
5143 _mm_srai_epi32(v_ix1, INTER_BITS));
5144 __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
5145 _mm_srai_epi32(v_iy1, INTER_BITS));
5146 v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
5147 _mm_and_si128(v_ix0, v_its1));
5148 v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
5149 _mm_and_si128(v_ix1, v_its1));
5150 _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
5152 _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
5154 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
5155 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
5156 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
5157 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
5161 for( ; x < size.width; x++ )
5163 int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
5164 int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
5165 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
5166 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
5167 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
5171 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
5176 for( ; x <= (size.width << 1) - 8; x += 8 )
5177 vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
5178 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
5180 for( ; x <= (size.width << 1) - 8; x += 8 )
5182 _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
5183 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
5186 for( ; x < size.width; x++ )
5188 dst1[x*2] = saturate_cast<short>(src1f[x*2]);
5189 dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
5195 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
5196 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5198 for( ; x <= size.width - 8; x += 8 )
5200 float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
5201 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
5202 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
5203 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
5204 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
5207 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
5208 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
5209 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
5210 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
5212 vst2q_s16(dst1 + (x << 1), v_dst);
5214 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
5215 vandq_s32(v_ix0, v_mask)));
5216 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
5217 vandq_s32(v_ix1, v_mask)));
5218 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
5223 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
5224 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
5225 __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
5227 for( ; x <= size.width - 4; x += 4 )
5229 __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
5230 __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
5232 __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
5233 _mm_srai_epi32(v_src1, INTER_BITS));
5234 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
5236 // x0 y0 x1 y1 . . .
5237 v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
5238 _mm_and_si128(v_src1, v_its1));
5239 __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
5240 _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
5241 _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
5245 for( ; x < size.width; x++ )
5247 int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
5248 int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
5249 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
5250 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
5251 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
5255 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
5258 uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
5259 uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
5260 float32x4_t v_scale = vdupq_n_f32(scale);
5262 for( ; x <= size.width - 8; x += 8)
5264 uint32x4_t v_fxy1, v_fxy2;
5267 uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
5268 v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
5269 v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
5272 v_fxy1 = v_fxy2 = v_zero;
5274 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
5275 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
5276 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
5277 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
5278 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
5279 vst1q_f32(dst1f + x, v_dst1);
5280 vst1q_f32(dst2f + x, v_dst2);
5282 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
5283 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
5284 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
5285 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
5286 vst1q_f32(dst1f + x + 4, v_dst1);
5287 vst1q_f32(dst2f + x + 4, v_dst2);
5290 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
5291 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
5292 __m128 v_scale = _mm_set1_ps(scale);
5294 for( ; x <= size.width - 16; x += 16)
5296 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
5297 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
5298 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
5299 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
5301 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
5303 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
5304 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
5305 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
5306 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5307 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
5308 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5309 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
5310 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
5311 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5312 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
5313 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5315 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); // low half of the second batch; the high half follows below
5317 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
5318 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5319 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
5320 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5321 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
5322 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
5323 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
5324 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
5325 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
5328 for( ; x < size.width; x++ )
5330 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
5331 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
5332 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
5335 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
5338 int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
5339 int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
5340 float32x4_t v_scale = vdupq_n_f32(scale);
5342 for( ; x <= size.width - 8; x += 8)
5344 int32x4_t v_fxy1, v_fxy2;
5347 int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
5348 v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
5349 v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
5352 v_fxy1 = v_fxy2 = v_zero;
5354 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
5355 float32x4x2_t v_dst;
5356 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
5357 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
5358 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
5359 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
5360 vst2q_f32(dst1f + (x << 1), v_dst);
5362 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
5363 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
5364 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
5365 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
5366 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi16(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);

for ( ; x <= size.width - 4; x += 4)
{
    __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); // 4 interleaved (x, y) int16 pairs
    __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2) : v_zero;
    __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);      // fractional x, 16-bit lanes
    __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); // fractional y, 16-bit lanes
    v_fxy = _mm_unpacklo_epi16(v_fxy1, v_fxy2);         // fx0 fy0 fx1 fy1 fx2 fy2 fx3 fy3

    __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy, v_zero)), v_scale);
    _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add));

    v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy, v_zero)), v_scale);
    _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add));
}
5390 for( ; x < size.width; x++ )
5392 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
5393 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
5394 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
5398 CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
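/*
    The packed representation produced above: dstmap1 (CV_16SC2) holds the
    integer parts of the coordinates and dstmap2 (CV_16UC1) packs the
    fractional parts as (iy & 31)*32 + (ix & 31). A hypothetical decode, for
    reference only:
*/
#if 0
static cv::Point2f decodeMapEntry(short ix, short iy, ushort fxy)
{
    const float s = 1.f / INTER_TAB_SIZE; // 1/32
    return cv::Point2f(ix + (fxy & (INTER_TAB_SIZE - 1)) * s,
                       iy + (fxy >> INTER_BITS) * s);
}
#endif

/*
    WarpAffineInvoker evaluates the affine map in fixed point: adelta[x] and
    bdelta[x] hold round(M[0]*x*AB_SCALE) and round(M[3]*x*AB_SCALE)
    (precomputed by warpAffine), with AB_SCALE = 1 << AB_BITS and AB_BITS >= 10.
    Per row only X0 = (M[1]*y + M[2])*AB_SCALE and Y0 = (M[4]*y + M[5])*AB_SCALE
    (plus rounding) are recomputed, so each pixel costs one add and one shift;
    for interpolating modes the low INTER_BITS of X and Y form the weight-table
    index stored in A[].
*/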
5406 class WarpAffineInvoker :
5407 public ParallelLoopBody
5410 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
5411 const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
5412 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
5413 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
5418 virtual void operator() (const Range& range) const
5420 const int BLOCK_SZ = 64;
5421 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5422 const int AB_BITS = MAX(10, (int)INTER_BITS);
5423 const int AB_SCALE = 1 << AB_BITS;
const int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2;
int x, y, x1, y1;
5426 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
5429 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5432 int bh0 = std::min(BLOCK_SZ/2, dst.rows);
5433 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
5434 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
5436 for( y = range.start; y < range.end; y += bh0 )
5438 for( x = 0; x < dst.cols; x += bw0 )
5440 int bw = std::min( bw0, dst.cols - x);
5441 int bh = std::min( bh0, range.end - y);
5443 Mat _XY(bh, bw, CV_16SC2, XY), matA;
5444 Mat dpart(dst, Rect(x, y, bw, bh));
5446 for( y1 = 0; y1 < bh; y1++ )
5448 short* xy = XY + y1*bw*2;
5449 int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
5450 int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
5452 if( interpolation == INTER_NEAREST )
5456 int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
5457 for( ; x1 <= bw - 8; x1 += 8 )
5460 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
5461 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
5462 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
5463 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
5465 vst2q_s16(xy + (x1 << 1), v_dst);
5470 __m128i v_X0 = _mm_set1_epi32(X0);
5471 __m128i v_Y0 = _mm_set1_epi32(Y0);
5472 for ( ; x1 <= bw - 16; x1 += 16)
5474 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
5475 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
5476 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
5477 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
5479 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
5480 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
5481 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
5482 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
5484 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
5486 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
5487 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
5488 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
5489 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
5493 for( ; x1 < bw; x1++ )
5495 int X = (X0 + adelta[x+x1]) >> AB_BITS;
5496 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
5497 xy[x1*2] = saturate_cast<short>(X);
5498 xy[x1*2+1] = saturate_cast<short>(Y);
5503 short* alpha = A + y1*bw;
5508 __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
5509 __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
5510 for( ; x1 <= bw - 8; x1 += 8 )
5512 __m128i tx0, tx1, ty0, ty1;
5513 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
5514 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
5515 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
5516 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
5518 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
5519 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
5520 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
5521 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
5523 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
5524 _mm_and_si128(tx1, fxy_mask));
5525 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
5526 _mm_and_si128(ty1, fxy_mask));
5527 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
5528 _mm_srai_epi32(tx1, INTER_BITS));
5529 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
5530 _mm_srai_epi32(ty1, INTER_BITS));
5531 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
5533 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
5534 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
5535 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
5539 int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
5540 for( ; x1 <= bw - 8; x1 += 8 )
5542 int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
5543 int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
5544 int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
5545 int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
5548 v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
5549 v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
5551 vst2q_s16(xy + (x1 << 1), v_xy);
5553 int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
5554 vandq_s32(v_X0, v_mask)));
5555 int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
5556 vandq_s32(v_X1, v_mask)));
5557 vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
5560 for( ; x1 < bw; x1++ )
5562 int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
5563 int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
5564 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
5565 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
5566 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
5567 (X & (INTER_TAB_SIZE-1)));
5572 if( interpolation == INTER_NEAREST )
5573 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
5576 Mat _matA(bh, bw, CV_16U, A);
5577 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
5586 int interpolation, borderType;
5588 int *adelta, *bdelta;
5593 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
5594 class IPPWarpAffineInvoker :
5595 public ParallelLoopBody
5598 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
5599 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
5600 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
5601 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
5606 virtual void operator() (const Range& range) const
5608 IppiSize srcsize = { src.cols, src.rows };
5609 IppiRect srcroi = { 0, 0, src.cols, src.rows };
5610 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
5611 int cnn = src.channels();
5612 if( borderType == BORDER_CONSTANT )
5614 IppiSize setSize = { dst.cols, range.end - range.start };
5615 void *dataPointer = dst.ptr(range.start);
5616 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
// Aug 2013: IPP 7.1 and 8.0 sometimes return ippStsCoeffErr from this function
5624 IppStatus status = CV_INSTRUMENT_FUN_PTR_CALL_IPP(func,( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
5625 (int)dst.step[0], dstroi, coeffs, mode ));
5630 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5637 double (&coeffs)[2][3];
5640 ippiWarpAffineBackFunc func;
5642 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
5648 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
5650 static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
5651 Size dsize, int flags, int borderType, const Scalar& borderValue,
5654 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
5655 const ocl::Device & dev = ocl::Device::getDefault();
5656 int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5658 int interpolation = flags & INTER_MAX;
5659 if( interpolation == INTER_AREA )
5660 interpolation = INTER_LINEAR;
5662 if ( !dev.isIntel() || !(type == CV_8UC1) ||
5663 !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
5664 !(borderType == cv::BORDER_CONSTANT &&
5665 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)))
5668 const char * const warp_op[2] = { "Affine", "Perspective" };
5669 const char * const interpolationMap[3] = { "nearest", "linear", "cubic" };
5670 ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
5671 String kernelName = format("warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
5673 bool is32f = (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
5674 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
5675 int sctype = CV_MAKETYPE(wdepth, cn);
5678 String opts = format("-D ST=%s", ocl::typeToStr(sctype));
5680 k.create(kernelName.c_str(), program, opts);
5684 float borderBuf[] = { 0, 0, 0, 0 };
5685 scalarToRawData(borderValue, borderBuf, sctype);
5687 UMat src = _src.getUMat(), M0;
5688 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5689 UMat dst = _dst.getUMat();
5692 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
5693 Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
5694 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
5695 M1.convertTo(matM, matM.type());
5697 if( !(flags & WARP_INVERSE_MAP) )
5699 if (op_type == OCL_OP_PERSPECTIVE)
5703 float D = M[0]*M[4] - M[1]*M[3];
5704 D = D != 0 ? 1.f/D : 0;
5705 float A11 = M[4]*D, A22=M[0]*D;
5706 M[0] = A11; M[1] *= -D;
5707 M[3] *= -D; M[4] = A22;
5708 float b1 = -M[0]*M[2] - M[1]*M[5];
5709 float b2 = -M[3]*M[2] - M[4]*M[5];
5710 M[2] = b1; M[5] = b2;
5713 matM.convertTo(M0, CV_32F);
5715 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
5716 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
5718 size_t globalThreads[2];
5719 globalThreads[0] = (size_t)(dst.cols / 4);
5720 globalThreads[1] = (size_t)dst.rows;
5722 return k.run(2, globalThreads, NULL, false);
5725 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
5726 Size dsize, int flags, int borderType, const Scalar& borderValue,
5729 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
5730 const ocl::Device & dev = ocl::Device::getDefault();
5732 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5733 const bool doubleSupport = dev.doubleFPConfig() > 0;
5735 int interpolation = flags & INTER_MAX;
5736 if( interpolation == INTER_AREA )
5737 interpolation = INTER_LINEAR;
5738 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
5740 if ( !(borderType == cv::BORDER_CONSTANT &&
5741 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
5742 (!doubleSupport && depth == CV_64F) || cn > 4)
5745 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
5746 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
5747 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
5748 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
5750 int scalarcn = cn == 3 ? 4 : cn;
5751 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
5752 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
5753 int sctype = CV_MAKETYPE(wdepth, scalarcn);
5757 if (interpolation == INTER_NEAREST)
5759 opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
5760 ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
5761 ocl::typeToStr(CV_MAT_DEPTH(type)),
5762 ocl::typeToStr(sctype), cn, rowsPerWI);
5767 opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
5768 " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
5769 interpolationMap[interpolation], ocl::typeToStr(type),
5770 ocl::typeToStr(CV_MAT_DEPTH(type)),
5771 ocl::typeToStr(sctype),
5772 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
5773 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
5774 ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
5775 doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
5778 k.create(kernelName, program, opts);
5782 double borderBuf[] = { 0, 0, 0, 0 };
5783 scalarToRawData(borderValue, borderBuf, sctype);
5785 UMat src = _src.getUMat(), M0;
5786 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5787 UMat dst = _dst.getUMat();
5790 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
5791 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
5792 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
5793 M1.rows == matRows && M1.cols == 3 );
5794 M1.convertTo(matM, matM.type());
5796 if( !(flags & WARP_INVERSE_MAP) )
5798 if (op_type == OCL_OP_PERSPECTIVE)
5802 double D = M[0]*M[4] - M[1]*M[3];
5803 D = D != 0 ? 1./D : 0;
5804 double A11 = M[4]*D, A22=M[0]*D;
5805 M[0] = A11; M[1] *= -D;
5806 M[3] *= -D; M[4] = A22;
5807 double b1 = -M[0]*M[2] - M[1]*M[5];
5808 double b2 = -M[3]*M[2] - M[4]*M[5];
5809 M[2] = b1; M[5] = b2;
5812 matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
5814 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
5815 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
5817 size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
5818 return k.run(2, globalThreads, NULL, false);
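// A minimal sketch (illustrative only, excluded from compilation) of how the
// OpenCL path above is reached: passing cv::UMat through the transparent API
// lets CV_OCL_RUN dispatch to ocl_warpTransform / ocl_warpTransform_cols4
// when a suitable device is present. The image size and matrix values are
// assumptions for the example.
#if 0
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
static void example_ocl_warp()
{
    cv::UMat usrc(480, 640, CV_8UC1, cv::Scalar::all(0)), udst;
    cv::Mat M = (cv::Mat_<double>(2, 3) << 1, 0, 10,   // shift 10 px right
                                           0, 1, 20);  // shift 20 px down
    // With a UMat destination the OpenCL kernels are used when available;
    // otherwise execution falls back to the CPU paths below.
    cv::warpAffine(usrc, udst, M, usrc.size(), cv::INTER_LINEAR,
                   cv::BORDER_CONSTANT, cv::Scalar());
}
#endif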
5825 void warpAffine(int src_type,
5826 const uchar * src_data, size_t src_step, int src_width, int src_height,
5827 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
5828 const double M[6], int interpolation, int borderType, const double borderValue[4])
5830 CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
5832 Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
5833 Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
5836 AutoBuffer<int> _abdelta(dst.cols*2);
5837 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
5838 const int AB_BITS = MAX(10, (int)INTER_BITS);
5839 const int AB_SCALE = 1 << AB_BITS;
5841 for( x = 0; x < dst.cols; x++ )
5843 adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
5844 bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
5847 Range range(0, dst.rows);
5848 WarpAffineInvoker invoker(src, dst, interpolation, borderType,
5849 Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
5851 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
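// Worked sketch (illustrative, excluded from compilation) of the fixed-point
// scheme used by the delta tables above: with AB_BITS = 10 the entry
// adelta[x] stores M[0]*x in Q10, and shifting by AB_BITS - INTER_BITS
// leaves INTER_BITS fractional bits for the interpolation tables. The
// concrete values below are assumptions for the example.
#if 0
#include <cassert>
static void example_fixed_point()
{
    const int AB_BITS = 10, AB_SCALE = 1 << AB_BITS;
    const int INTER_BITS_ = 5;                    // value used by imgproc
    double m0 = 0.5;                              // assumed M[0]
    int adelta3 = (int)(m0 * 3 * AB_SCALE);       // 1536, i.e. 1.5 in Q10
    int X = adelta3 >> (AB_BITS - INTER_BITS_);   // 48, i.e. 1.5 in Q5
    assert(X == 48);
}
#endif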
5858 void cv::warpAffine( InputArray _src, OutputArray _dst,
5859 InputArray _M0, Size dsize,
5860 int flags, int borderType, const Scalar& borderValue )
5862 CV_INSTRUMENT_REGION()
5864 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
5865 _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
5866 ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
5867 borderValue, OCL_OP_AFFINE))
5869 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
5870 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
5871 borderValue, OCL_OP_AFFINE))
5873 Mat src = _src.getMat(), M0 = _M0.getMat();
5874 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
5875 Mat dst = _dst.getMat();
5876 CV_Assert( src.cols > 0 && src.rows > 0 );
5877 if( dst.data == src.data )
5881 Mat matM(2, 3, CV_64F, M);
5882 int interpolation = flags & INTER_MAX;
5883 if( interpolation == INTER_AREA )
5884 interpolation = INTER_LINEAR;
5886 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
5887 M0.convertTo(matM, matM.type());
5889 if( !(flags & WARP_INVERSE_MAP) )
5891 double D = M[0]*M[4] - M[1]*M[3];
5892 D = D != 0 ? 1./D : 0;
5893 double A11 = M[4]*D, A22=M[0]*D;
5894 M[0] = A11; M[1] *= -D;
5895 M[3] *= -D; M[4] = A22;
5896 double b1 = -M[0]*M[2] - M[1]*M[5];
5897 double b2 = -M[3]*M[2] - M[4]*M[5];
5898 M[2] = b1; M[5] = b2;
5901 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
5904 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5905 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
5906 ( cn == 1 || cn == 3 || cn == 4 ) &&
5907 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
5908 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
5910 ippiWarpAffineBackFunc ippFunc = 0;
5911 if ((flags & WARP_INVERSE_MAP) != 0)
5914 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
5915 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
5916 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
5917 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
5918 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
5919 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
5920 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
5921 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
5922 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
5928 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
5929 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
5930 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
5931 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
5932 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
5933 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
5934 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
5935 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
5936 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
5940 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
5941 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
5942 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
5944 CV_Assert(mode && ippFunc);
5946 double coeffs[2][3];
5947 for( int i = 0; i < 2; i++ )
5948 for( int j = 0; j < 3; j++ )
5949 coeffs[i][j] = matM.at<double>(i, j);
5952 Range range(0, dst.rows);
5953 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
5954 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
5957 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5960 setIppErrorStatus();
5965 hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
5966 M, interpolation, borderType, borderValue.val);
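// Usage sketch for cv::warpAffine above (illustrative, excluded from
// compilation); the translation values are assumptions.
#if 0
#include <opencv2/imgproc.hpp>
static void example_warpAffine(const cv::Mat& src)
{
    cv::Mat M = (cv::Mat_<double>(2, 3) << 1, 0, 15,
                                           0, 1, -7);
    cv::Mat dst;
    cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR,
                   cv::BORDER_CONSTANT, cv::Scalar::all(0));
    // Pass cv::WARP_INVERSE_MAP in flags when M already maps dst -> src,
    // so the in-place inversion performed above is skipped.
}
#endif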
5973 class WarpPerspectiveInvoker :
5974 public ParallelLoopBody
5977 WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, const double *_M, int _interpolation,
5978 int _borderType, const Scalar &_borderValue) :
5979 ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
5980 borderType(_borderType), borderValue(_borderValue)
5984 virtual void operator() (const Range& range) const
5986 const int BLOCK_SZ = 32;
5987 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
5988 int x, y, x1, y1, width = dst.cols, height = dst.rows;
5990 int bh0 = std::min(BLOCK_SZ/2, height);
5991 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
5992 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
5995 bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
5996 __m128d v_M0 = _mm_set1_pd(M[0]);
5997 __m128d v_M3 = _mm_set1_pd(M[3]);
5998 __m128d v_M6 = _mm_set1_pd(M[6]);
5999 __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
6000 __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
6001 __m128d v_2 = _mm_set1_pd(2),
6002 v_zero = _mm_setzero_pd(),
6003 v_1 = _mm_set1_pd(1),
6004 v_its = _mm_set1_pd(INTER_TAB_SIZE);
6005 __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
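// The SSE4.1 path below evaluates X = (X0 + M[0]*x)/W and Y = (Y0 + M[3]*x)/W
// for pairs of pixels in double precision, clamps the results to
// [INT_MIN, INT_MAX], and packs them into 16-bit map entries; W == 0 yields 0
// through the cmpeq/andnot masking of the reciprocal.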
6008 for( y = range.start; y < range.end; y += bh0 )
6010 for( x = 0; x < width; x += bw0 )
6012 int bw = std::min( bw0, width - x);
6013 int bh = std::min( bh0, range.end - y); // height of the current block
6015 Mat _XY(bh, bw, CV_16SC2, XY), matA;
6016 Mat dpart(dst, Rect(x, y, bw, bh));
6018 for( y1 = 0; y1 < bh; y1++ )
6020 short* xy = XY + y1*bw*2;
6021 double X0 = M[0]*x + M[1]*(y + y1) + M[2];
6022 double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
6023 double W0 = M[6]*x + M[7]*(y + y1) + M[8];
6025 if( interpolation == INTER_NEAREST )
6032 __m128d v_X0d = _mm_set1_pd(X0);
6033 __m128d v_Y0d = _mm_set1_pd(Y0);
6034 __m128d v_W0 = _mm_set1_pd(W0);
6035 __m128d v_x1 = _mm_set_pd(1, 0);
6037 for( ; x1 <= bw - 16; x1 += 16 )
6042 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6043 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6044 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6045 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6046 v_x1 = _mm_add_pd(v_x1, v_2);
6048 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6049 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6050 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6051 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6052 v_x1 = _mm_add_pd(v_x1, v_2);
6054 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6055 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6056 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6057 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6063 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6064 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6065 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6066 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6067 v_x1 = _mm_add_pd(v_x1, v_2);
6069 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6070 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6071 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6072 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6073 v_x1 = _mm_add_pd(v_x1, v_2);
6075 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6076 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6077 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6078 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6084 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6085 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6086 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6087 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6088 v_x1 = _mm_add_pd(v_x1, v_2);
6090 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6091 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6092 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6093 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6094 v_x1 = _mm_add_pd(v_x1, v_2);
6096 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6097 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6098 v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6099 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6105 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6106 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6107 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6108 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6109 v_x1 = _mm_add_pd(v_x1, v_2);
6111 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6112 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
6113 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6114 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6115 v_x1 = _mm_add_pd(v_x1, v_2);
6117 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6118 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6119 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6120 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6124 v_X0 = _mm_packs_epi32(v_X0, v_X1);
6125 v_X1 = _mm_packs_epi32(v_X2, v_X3);
6126 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
6127 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
6129 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
6131 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
6132 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
6133 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
6134 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
6139 for( ; x1 < bw; x1++ )
6141 double W = W0 + M[6]*x1;
6143 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
6144 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
6145 int X = saturate_cast<int>(fX);
6146 int Y = saturate_cast<int>(fY);
6148 xy[x1*2] = saturate_cast<short>(X);
6149 xy[x1*2+1] = saturate_cast<short>(Y);
6154 short* alpha = A + y1*bw;
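// Non-NEAREST path: the integer parts of X and Y go into the XY map while the
// INTER_BITS fractional bits of each are packed into the alpha table as
// (Y_frac*INTER_TAB_SIZE + X_frac), which indexes the interpolation weights
// inside remap.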
6160 __m128d v_X0d = _mm_set1_pd(X0);
6161 __m128d v_Y0d = _mm_set1_pd(Y0);
6162 __m128d v_W0 = _mm_set1_pd(W0);
6163 __m128d v_x1 = _mm_set_pd(1, 0);
6165 for( ; x1 <= bw - 16; x1 += 16 )
6170 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6171 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6172 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6173 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6174 v_x1 = _mm_add_pd(v_x1, v_2);
6176 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6177 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6178 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6179 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6180 v_x1 = _mm_add_pd(v_x1, v_2);
6182 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6183 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6184 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6185 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6191 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6192 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6193 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6194 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6195 v_x1 = _mm_add_pd(v_x1, v_2);
6197 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6198 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6199 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6200 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6201 v_x1 = _mm_add_pd(v_x1, v_2);
6203 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6204 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6205 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6206 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6212 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6213 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6214 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6215 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6216 v_x1 = _mm_add_pd(v_x1, v_2);
6218 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6219 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6220 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6221 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6222 v_x1 = _mm_add_pd(v_x1, v_2);
6224 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6225 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6226 v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6227 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6233 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6234 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6235 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6236 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6237 v_x1 = _mm_add_pd(v_x1, v_2);
6239 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
6240 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
6241 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
6242 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
6243 v_x1 = _mm_add_pd(v_x1, v_2);
6245 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
6246 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
6247 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
6248 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
6252 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
6253 _mm_and_si128(v_X0, v_itsi1));
6254 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
6255 _mm_and_si128(v_X1, v_itsi1));
6256 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
6258 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
6259 _mm_and_si128(v_X2, v_itsi1));
6260 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
6261 _mm_and_si128(v_X3, v_itsi1));
6262 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
6265 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
6266 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
6267 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
6268 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
6270 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
6272 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
6273 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
6274 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
6275 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
6280 for( ; x1 < bw; x1++ )
6282 double W = W0 + M[6]*x1;
6283 W = W ? INTER_TAB_SIZE/W : 0;
6284 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
6285 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
6286 int X = saturate_cast<int>(fX);
6287 int Y = saturate_cast<int>(fY);
6289 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
6290 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
6291 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
6292 (X & (INTER_TAB_SIZE-1)));
6297 if( interpolation == INTER_NEAREST )
6298 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
6301 Mat _matA(bh, bw, CV_16U, A);
6302 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
6312 int interpolation, borderType;
6316 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
6317 class IPPWarpPerspectiveInvoker :
6318 public ParallelLoopBody
6321 IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
6322 int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
6323 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
6324 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
6329 virtual void operator() (const Range& range) const
6331 IppiSize srcsize = {src.cols, src.rows};
6332 IppiRect srcroi = {0, 0, src.cols, src.rows};
6333 IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
6334 int cnn = src.channels();
6336 if( borderType == BORDER_CONSTANT )
6338 IppiSize setSize = {dst.cols, range.end - range.start};
6339 void *dataPointer = dst.ptr(range.start);
6340 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
6347 IppStatus status = CV_INSTRUMENT_FUN_PTR_CALL_IPP(func,(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode));
6348 if (status != ippStsNoErr)
6352 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6359 double (&coeffs)[3][3];
6361 const Scalar borderValue;
6362 ippiWarpPerspectiveFunc func;
6365 const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
6371 void warpPerspective(int src_type,
6372 const uchar * src_data, size_t src_step, int src_width, int src_height,
6373 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
6374 const double M[9], int interpolation, int borderType, const double borderValue[4])
6376 CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
6377 Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
6378 Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
6380 Range range(0, dst.rows);
6381 WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
6382 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
6388 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
6389 Size dsize, int flags, int borderType, const Scalar& borderValue )
6391 CV_INSTRUMENT_REGION()
6393 CV_Assert( _src.total() > 0 );
6395 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
6396 _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
6397 ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
6398 OCL_OP_PERSPECTIVE))
6400 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
6401 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
6402 OCL_OP_PERSPECTIVE))
6404 Mat src = _src.getMat(), M0 = _M0.getMat();
6405 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
6406 Mat dst = _dst.getMat();
6408 if( dst.data == src.data )
6412 Mat matM(3, 3, CV_64F, M);
6413 int interpolation = flags & INTER_MAX;
6414 if( interpolation == INTER_AREA )
6415 interpolation = INTER_LINEAR;
6417 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
6418 M0.convertTo(matM, matM.type());
6420 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
6423 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
6424 if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
6425 (cn == 1 || cn == 3 || cn == 4) &&
6426 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
6427 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
6429 ippiWarpPerspectiveFunc ippFunc = 0;
6430 if ((flags & WARP_INVERSE_MAP) != 0)
6432 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
6433 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
6434 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
6435 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
6436 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
6437 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
6438 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
6439 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
6440 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
6444 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
6445 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
6446 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
6447 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
6448 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
6449 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
6450 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
6451 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
6452 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
6455 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
6456 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
6457 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
6458 CV_Assert(mode && ippFunc);
6460 double coeffs[3][3];
6461 for( int i = 0; i < 3; i++ )
6462 for( int j = 0; j < 3; j++ )
6463 coeffs[i][j] = matM.at<double>(i, j);
6466 Range range(0, dst.rows);
6467 IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
6468 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
6471 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6474 setIppErrorStatus();
6479 if( !(flags & WARP_INVERSE_MAP) )
6482 hal::warpPerspective(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
6483 matM.ptr<double>(), interpolation, borderType, borderValue.val);
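// Usage sketch for cv::warpPerspective above (illustrative, excluded from
// compilation): map a quadrilateral onto an axis-aligned square. The corner
// coordinates are assumptions.
#if 0
#include <opencv2/imgproc.hpp>
static void example_warpPerspective(const cv::Mat& src)
{
    cv::Point2f from[4] = { {50, 50}, {300, 40}, {310, 280}, {40, 300} };
    cv::Point2f to[4]   = { {0, 0},   {320, 0},  {320, 320}, {0, 320} };
    cv::Mat H = cv::getPerspectiveTransform(from, to);  // 3x3, CV_64F
    cv::Mat dst;
    cv::warpPerspective(src, dst, H, cv::Size(320, 320));
}
#endif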
6487 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
6489 CV_INSTRUMENT_REGION()
6492 double alpha = cos(angle)*scale;
6493 double beta = sin(angle)*scale;
6495 Mat M(2, 3, CV_64F);
6496 double* m = M.ptr<double>();
6500 m[2] = (1-alpha)*center.x - beta*center.y;
6503 m[5] = beta*center.x + (1-alpha)*center.y;
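// Usage sketch for cv::getRotationMatrix2D above (illustrative, excluded
// from compilation): rotate 30 degrees about the image center at unit scale.
#if 0
#include <opencv2/imgproc.hpp>
static void example_rotation(const cv::Mat& src)
{
    cv::Point2f c(src.cols * 0.5f, src.rows * 0.5f);
    cv::Mat R = cv::getRotationMatrix2D(c, 30.0, 1.0);  // 2x3, CV_64F
    cv::Mat dst;
    cv::warpAffine(src, dst, R, src.size());
}
#endif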
6508 /* Calculates coefficients of perspective transformation
6509 * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
6511 * c00*xi + c01*yi + c02
6512 * ui = ---------------------
6513 * c20*xi + c21*yi + c22
6515 * c10*xi + c11*yi + c12
6516 * vi = ---------------------
6517 * c20*xi + c21*yi + c22
6519 * Coefficients are calculated by solving linear system:
6520 * / x0 y0 1 0 0 0 -x0*u0 -y0*u0 \ /c00\ /u0\
6521 * | x1 y1 1 0 0 0 -x1*u1 -y1*u1 | |c01| |u1|
6522 * | x2 y2 1 0 0 0 -x2*u2 -y2*u2 | |c02| |u2|
6523 * | x3 y3 1 0 0 0 -x3*u3 -y3*u3 |.|c10|=|u3|,
6524 * | 0 0 0 x0 y0 1 -x0*v0 -y0*v0 | |c11| |v0|
6525 * | 0 0 0 x1 y1 1 -x1*v1 -y1*v1 | |c12| |v1|
6526 * | 0 0 0 x2 y2 1 -x2*v2 -y2*v2 | |c20| |v2|
6527 * \ 0 0 0 x3 y3 1 -x3*v3 -y3*v3 / \c21/ \v3/
6530 * cij - matrix coefficients, c22 = 1
6532 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
6534 CV_INSTRUMENT_REGION()
6536 Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
6537 double a[8][8], b[8];
6538 Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);
6540 for( int i = 0; i < 4; ++i )
6542 a[i][0] = a[i+4][3] = src[i].x;
6543 a[i][1] = a[i+4][4] = src[i].y;
6544 a[i][2] = a[i+4][5] = 1;
6545 a[i][3] = a[i][4] = a[i][5] =
6546 a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
6547 a[i][6] = -src[i].x*dst[i].x;
6548 a[i][7] = -src[i].y*dst[i].x;
6549 a[i+4][6] = -src[i].x*dst[i].y;
6550 a[i+4][7] = -src[i].y*dst[i].y;
6555 solve( A, B, X, DECOMP_SVD );
6556 M.ptr<double>()[8] = 1.;
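// Sketch checking the system solved above (illustrative, excluded from
// compilation): with c22 fixed to 1, the returned matrix maps each source
// corner onto its destination counterpart up to round-off. The point values
// are assumptions.
#if 0
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
static void example_perspective_solve()
{
    cv::Point2f s[4] = { {0, 0},  {100, 0}, {100, 100}, {0, 100} };
    cv::Point2f d[4] = { {10, 5}, {90, 10}, {95, 95},   {5, 90} };
    cv::Mat H = cv::getPerspectiveTransform(s, d);
    std::vector<cv::Point2f> in(s, s + 4), out;
    cv::perspectiveTransform(in, out, H);  // out[i] ~ d[i]
}
#endif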
6561 /* Calculates coefficients of affine transformation
6562 * which maps (xi,yi) to (ui,vi), (i=1,2,3):
6564 * ui = c00*xi + c01*yi + c02
6566 * vi = c10*xi + c11*yi + c12
6568 * Coefficients are calculated by solving linear system:
6569 * / x0 y0 1 0 0 0 \ /c00\ /u0\
6570 * | x1 y1 1 0 0 0 | |c01| |u1|
6571 * | x2 y2 1 0 0 0 | |c02| |u2|
6572 * | 0 0 0 x0 y0 1 | |c10| |v0|
6573 * | 0 0 0 x1 y1 1 | |c11| |v1|
6574 * \ 0 0 0 x2 y2 1 / |c12| |v2|
6577 * cij - matrix coefficients
6580 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
6582 Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
6583 double a[6*6], b[6];
6584 Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);
6586 for( int i = 0; i < 3; i++ )
6590 a[j] = a[k+3] = src[i].x;
6591 a[j+1] = a[k+4] = src[i].y;
6592 a[j+2] = a[k+5] = 1;
6593 a[j+3] = a[j+4] = a[j+5] = 0;
6594 a[k] = a[k+1] = a[k+2] = 0;
6596 b[i*2+1] = dst[i].y;
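// Usage sketch for cv::getAffineTransform above (illustrative, excluded from
// compilation): three point pairs fully determine the 2x3 matrix. The
// coordinates are assumptions.
#if 0
#include <opencv2/imgproc.hpp>
static void example_affine_solve()
{
    cv::Point2f s[3] = { {0, 0},   {100, 0},  {0, 100} };
    cv::Point2f d[3] = { {10, 10}, {105, 20}, {5, 115} };
    cv::Mat A = cv::getAffineTransform(s, d);  // 2x3, CV_64F
    (void)A;
}
#endif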
6603 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
6605 Mat matM = _matM.getMat();
6606 CV_Assert(matM.rows == 2 && matM.cols == 3);
6607 __iM.create(2, 3, matM.type());
6608 Mat _iM = __iM.getMat();
6610 if( matM.type() == CV_32F )
6612 const float* M = matM.ptr<float>();
6613 float* iM = _iM.ptr<float>();
6614 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
6616 double D = M[0]*M[step+1] - M[1]*M[step];
6617 D = D != 0 ? 1./D : 0;
6618 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
6619 double b1 = -A11*M[2] - A12*M[step+2];
6620 double b2 = -A21*M[2] - A22*M[step+2];
6622 iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
6623 iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
6625 else if( matM.type() == CV_64F )
6627 const double* M = matM.ptr<double>();
6628 double* iM = _iM.ptr<double>();
6629 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));
6631 double D = M[0]*M[step+1] - M[1]*M[step];
6632 D = D != 0 ? 1./D : 0;
6633 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
6634 double b1 = -A11*M[2] - A12*M[step+2];
6635 double b2 = -A21*M[2] - A22*M[step+2];
6637 iM[0] = A11; iM[1] = A12; iM[2] = b1;
6638 iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
6641 CV_Error( CV_StsUnsupportedFormat, "" );
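// Usage sketch for cv::invertAffineTransform above (illustrative, excluded
// from compilation); the matrix values are assumptions.
#if 0
#include <opencv2/imgproc.hpp>
static void example_invert_affine()
{
    cv::Mat M = (cv::Mat_<double>(2, 3) << 0.9, -0.2, 12,
                                           0.2,  0.9, -5);
    cv::Mat iM;
    cv::invertAffineTransform(M, iM);
    // Warping with M and then with iM returns pixels to their original
    // positions, up to interpolation error.
}
#endif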
6644 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
6646 Mat src = _src.getMat(), dst = _dst.getMat();
6647 CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
6648 return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
6651 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
6653 Mat src = _src.getMat(), dst = _dst.getMat();
6654 CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
6655 return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
6659 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
6661 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6662 CV_Assert( src.type() == dst.type() );
6663 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
6664 (double)dst.rows/src.rows, method );
6669 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
6670 int flags, CvScalar fillval )
6672 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6673 cv::Mat matrix = cv::cvarrToMat(marr);
6674 CV_Assert( src.type() == dst.type() );
6675 cv::warpAffine( src, dst, matrix, dst.size(), flags,
6676 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6681 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
6682 int flags, CvScalar fillval )
6684 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6685 cv::Mat matrix = cv::cvarrToMat(marr);
6686 CV_Assert( src.type() == dst.type() );
6687 cv::warpPerspective( src, dst, matrix, dst.size(), flags,
6688 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6693 cvRemap( const CvArr* srcarr, CvArr* dstarr,
6694 const CvArr* _mapx, const CvArr* _mapy,
6695 int flags, CvScalar fillval )
6697 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
6698 cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
6699 CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
6700 cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
6701 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
6703 CV_Assert( dst0.data == dst.data );
6708 cv2DRotationMatrix( CvPoint2D32f center, double angle,
6709 double scale, CvMat* matrix )
6711 cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
6712 CV_Assert( M.size() == M0.size() );
6713 M.convertTo(M0, M0.type());
6719 cvGetPerspectiveTransform( const CvPoint2D32f* src,
6720 const CvPoint2D32f* dst,
6723 cv::Mat M0 = cv::cvarrToMat(matrix),
6724 M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
6725 CV_Assert( M.size() == M0.size() );
6726 M.convertTo(M0, M0.type());
6732 cvGetAffineTransform( const CvPoint2D32f* src,
6733 const CvPoint2D32f* dst,
6736 cv::Mat M0 = cv::cvarrToMat(matrix),
6737 M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
6738 CV_Assert( M.size() == M0.size() );
6739 M.convertTo(M0, M0.type());
6745 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
6747 cv::Mat map1 = cv::cvarrToMat(arr1), map2;
6748 cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;
6751 map2 = cv::cvarrToMat(arr2);
6754 dstmap2 = cv::cvarrToMat(dstarr2);
6755 if( dstmap2.type() == CV_16SC1 )
6756 dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
6759 cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
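// Usage sketch for map conversion (illustrative, excluded from compilation):
// turn a pair of CV_32FC1 maps into the packed fixed-point CV_16SC2 +
// CV_16UC1 representation that remap processes faster. The inputs are
// assumed to come from a prior map-building step.
#if 0
#include <opencv2/imgproc.hpp>
static void example_convertMaps(const cv::Mat& mapx, const cv::Mat& mapy)
{
    cv::Mat map1, map2;
    cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
}
#endif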
6762 /****************************************************************************************\
6763 * Log-Polar Transform *
6764 \****************************************************************************************/
6766 /* This is now done via remap; a more correct implementation would use
6767 some super-sampling technique outside of the "fovea" circle. */
6769 cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
6770 CvPoint2D32f center, double M, int flags )
6772 Mat src_with_border; // keep this variable at function scope: it owns the image data referenced below
6774 cv::Ptr<CvMat> mapx, mapy;
6776 CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
6777 CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
6780 if( !CV_ARE_TYPES_EQ( src, dst ))
6781 CV_Error( CV_StsUnmatchedFormats, "" );
6784 CV_Error( CV_StsOutOfRange, "M should be >0" );
6786 dsize = cvGetMatSize(dst);
6788 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
6789 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
6791 if( !(flags & CV_WARP_INVERSE_MAP) )
6794 cv::AutoBuffer<double> _exp_tab(dsize.width);
6795 double* exp_tab = _exp_tab;
6797 for( rho = 0; rho < dsize.width; rho++ )
6798 exp_tab[rho] = std::exp(rho/M) - 1.0;
6800 for( phi = 0; phi < dsize.height; phi++ )
6802 double cp = cos(phi*2*CV_PI/dsize.height);
6803 double sp = sin(phi*2*CV_PI/dsize.height);
6804 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
6805 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
6807 for( rho = 0; rho < dsize.width; rho++ )
6809 double r = exp_tab[rho];
6810 double x = r*cp + center.x;
6811 double y = r*sp + center.y;
6820 const int ANGLE_BORDER = 1;
6821 Mat src_ = cv::cvarrToMat(src);
6822 cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
6823 srcstub = src_with_border; src = &srcstub;
6824 CvSize ssize = cvGetMatSize(src);
6825 ssize.height -= 2*ANGLE_BORDER;
6828 CvMat bufx, bufy, bufp, bufa;
6829 double ascale = ssize.height/(2*CV_PI);
6830 cv::AutoBuffer<float> _buf(4*dsize.width);
6833 bufx = cvMat( 1, dsize.width, CV_32F, buf );
6834 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
6835 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
6836 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
6838 for( x = 0; x < dsize.width; x++ )
6839 bufx.data.fl[x] = (float)x - center.x;
6841 for( y = 0; y < dsize.height; y++ )
6843 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
6844 float* my = (float*)(mapy->data.ptr + y*mapy->step);
6846 for( x = 0; x < dsize.width; x++ )
6847 bufy.data.fl[x] = (float)y - center.y;
6850 cvCartToPolar( &bufx, &bufy, &bufp, &bufa );
6852 for( x = 0; x < dsize.width; x++ )
6853 bufp.data.fl[x] += 1.f;
6855 cvLog( &bufp, &bufp );
6857 for( x = 0; x < dsize.width; x++ )
6859 double rho = bufp.data.fl[x]*M;
6860 double phi = bufa.data.fl[x]*ascale;
6863 my[x] = (float)phi + ANGLE_BORDER;
6866 for( x = 0; x < dsize.width; x++ )
6868 double xx = bufx.data.fl[x];
6869 double yy = bufy.data.fl[x];
6871 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
6872 double a = atan2(yy,xx);
6884 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
6887 void cv::logPolar( InputArray _src, OutputArray _dst,
6888 Point2f center, double M, int flags )
6890 CV_INSTRUMENT_REGION()
6892 CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
6893 ocl_logPolar(_src, _dst, center, M, flags));
6894 Mat src_with_border; // keep this variable at function scope: it owns the image data referenced below
6898 Mat srcstub, src = _src.getMat();
6899 _dst.create(src.size(), src.type());
6900 Size dsize = src.size();
6903 CV_Error(CV_StsOutOfRange, "M should be >0");
6906 mapx.create(dsize, CV_32F);
6907 mapy.create(dsize, CV_32F);
6909 if (!(flags & CV_WARP_INVERSE_MAP))
6912 cv::AutoBuffer<double> _exp_tab(dsize.width);
6913 double* exp_tab = _exp_tab;
6915 for (rho = 0; rho < dsize.width; rho++)
6916 exp_tab[rho] = std::exp(rho / M) - 1.0;
6918 for (phi = 0; phi < dsize.height; phi++)
6920 double cp = cos(phi * 2 * CV_PI / dsize.height);
6921 double sp = sin(phi * 2 * CV_PI / dsize.height);
6922 float* mx = (float*)(mapx.data + phi*mapx.step);
6923 float* my = (float*)(mapy.data + phi*mapy.step);
6925 for (rho = 0; rho < dsize.width; rho++)
6927 double r = exp_tab[rho];
6928 double x = r*cp + center.x;
6929 double y = r*sp + center.y;
6938 const int ANGLE_BORDER = 1;
6939 cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
6940 srcstub = src_with_border; src = srcstub;
6941 Size ssize = src.size();
6942 ssize.height -= 2 * ANGLE_BORDER;
6945 Mat bufx, bufy, bufp, bufa;
6946 double ascale = ssize.height / (2 * CV_PI);
6948 bufx = Mat(1, dsize.width, CV_32F);
6949 bufy = Mat(1, dsize.width, CV_32F);
6950 bufp = Mat(1, dsize.width, CV_32F);
6951 bufa = Mat(1, dsize.width, CV_32F);
6953 for (x = 0; x < dsize.width; x++)
6954 bufx.at<float>(0, x) = (float)x - center.x;
6956 for (y = 0; y < dsize.height; y++)
6958 float* mx = (float*)(mapx.data + y*mapx.step);
6959 float* my = (float*)(mapy.data + y*mapy.step);
6961 for (x = 0; x < dsize.width; x++)
6962 bufy.at<float>(0, x) = (float)y - center.y;
6965 cartToPolar(bufx, bufy, bufp, bufa);
6967 for (x = 0; x < dsize.width; x++)
6968 bufp.at<float>(0, x) += 1.f;
6972 for (x = 0; x < dsize.width; x++)
6974 double rho = bufp.at<float>(0, x) * M;
6975 double phi = bufa.at<float>(0, x) * ascale;
6978 my[x] = (float)phi + ANGLE_BORDER;
6981 for (x = 0; x < dsize.width; x++)
6983 double xx = bufx.at<float>(0, x);
6984 double yy = bufy.at<float>(0, x);
6985 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
6986 double a = atan2(yy, xx);
6997 remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
6998 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
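// Usage sketch for cv::logPolar above (illustrative, excluded from
// compilation); the magnitude scale M = 40 is an assumption.
#if 0
#include <opencv2/imgproc.hpp>
static void example_logPolar(const cv::Mat& src)
{
    cv::Mat dst, back;
    cv::Point2f c(src.cols * 0.5f, src.rows * 0.5f);
    cv::logPolar(src, dst, c, 40.0,
                 cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
    // Adding WARP_INVERSE_MAP applies the inverse mapping, recovering an
    // approximation of the original image.
    cv::logPolar(dst, back, c, 40.0,
                 cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS | cv::WARP_INVERSE_MAP);
}
#endif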
7001 /****************************************************************************************
7002 Linear-Polar Transform
7003 J.L. Blanco, Apr 2009
7004 ****************************************************************************************/
7006 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
7007 CvPoint2D32f center, double maxRadius, int flags )
7009 Mat src_with_border; // keep this variable at function scope: it owns the image data referenced below
7011 cv::Ptr<CvMat> mapx, mapy;
7013 CvMat srcstub, *src = (CvMat*)srcarr;
7014 CvMat dststub, *dst = (CvMat*)dstarr;
7017 src = cvGetMat( srcarr, &srcstub,0,0 );
7018 dst = cvGetMat( dstarr, &dststub,0,0 );
7020 if( !CV_ARE_TYPES_EQ( src, dst ))
7021 CV_Error( CV_StsUnmatchedFormats, "" );
7023 dsize = cvGetMatSize(dst);
7025 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
7026 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
7028 if( !(flags & CV_WARP_INVERSE_MAP) )
7032 for( phi = 0; phi < dsize.height; phi++ )
7034 double cp = cos(phi*2*CV_PI/dsize.height);
7035 double sp = sin(phi*2*CV_PI/dsize.height);
7036 float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
7037 float* my = (float*)(mapy->data.ptr + phi*mapy->step);
7039 for( rho = 0; rho < dsize.width; rho++ )
7041 double r = maxRadius*rho/dsize.width;
7042 double x = r*cp + center.x;
7043 double y = r*sp + center.y;
7052 const int ANGLE_BORDER = 1;
7053 Mat src_ = cv::cvarrToMat(src);
7054 cv::copyMakeBorder(src_, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
7055 srcstub = src_with_border; src = &srcstub;
7056 CvSize ssize = cvGetMatSize(src);
7057 ssize.height -= 2*ANGLE_BORDER;
7060 CvMat bufx, bufy, bufp, bufa;
7061 const double ascale = ssize.height/(2*CV_PI);
7062 const double pscale = ssize.width/maxRadius;
7064 cv::AutoBuffer<float> _buf(4*dsize.width);
7067 bufx = cvMat( 1, dsize.width, CV_32F, buf );
7068 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
7069 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
7070 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );
7072 for( x = 0; x < dsize.width; x++ )
7073 bufx.data.fl[x] = (float)x - center.x;
7075 for( y = 0; y < dsize.height; y++ )
7077 float* mx = (float*)(mapx->data.ptr + y*mapx->step);
7078 float* my = (float*)(mapy->data.ptr + y*mapy->step);
7080 for( x = 0; x < dsize.width; x++ )
7081 bufy.data.fl[x] = (float)y - center.y;
7083 cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );
7085 for( x = 0; x < dsize.width; x++ )
7087 double rho = bufp.data.fl[x]*pscale;
7088 double phi = bufa.data.fl[x]*ascale;
7090 my[x] = (float)phi + ANGLE_BORDER;
7095 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
7098 void cv::linearPolar( InputArray _src, OutputArray _dst,
7099 Point2f center, double maxRadius, int flags )
7101 CV_INSTRUMENT_REGION()
7103 CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
7104 ocl_linearPolar(_src, _dst, center, maxRadius, flags));
7105 Mat src_with_border; // keep this variable at function scope: it owns the image data referenced below
7108 Mat srcstub, src = _src.getMat();
7109 _dst.create(src.size(), src.type());
7110 Size dsize = src.size();
7113 mapx.create(dsize, CV_32F);
7114 mapy.create(dsize, CV_32F);
7116 if (!(flags & CV_WARP_INVERSE_MAP))
7120 for (phi = 0; phi < dsize.height; phi++)
7122 double cp = cos(phi * 2 * CV_PI / dsize.height);
7123 double sp = sin(phi * 2 * CV_PI / dsize.height);
7124 float* mx = (float*)(mapx.data + phi*mapx.step);
7125 float* my = (float*)(mapy.data + phi*mapy.step);
7127 for (rho = 0; rho < dsize.width; rho++)
7129 double r = maxRadius*rho / dsize.width;
7130 double x = r*cp + center.x;
7131 double y = r*sp + center.y;
7140 const int ANGLE_BORDER = 1;
7142 cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
7143 src = src_with_border;
7144 Size ssize = src_with_border.size();
7145 ssize.height -= 2 * ANGLE_BORDER;
7148 Mat bufx, bufy, bufp, bufa;
7149 const double ascale = ssize.height / (2 * CV_PI);
7150 const double pscale = ssize.width / maxRadius;
7154 bufx = Mat(1, dsize.width, CV_32F);
7155 bufy = Mat(1, dsize.width, CV_32F);
7156 bufp = Mat(1, dsize.width, CV_32F);
7157 bufa = Mat(1, dsize.width, CV_32F);
7159 for (x = 0; x < dsize.width; x++)
7160 bufx.at<float>(0, x) = (float)x - center.x;
7162 for (y = 0; y < dsize.height; y++)
7164 float* mx = (float*)(mapx.data + y*mapx.step);
7165 float* my = (float*)(mapy.data + y*mapy.step);
7167 for (x = 0; x < dsize.width; x++)
7168 bufy.at<float>(0, x) = (float)y - center.y;
7170 cartToPolar(bufx, bufy, bufp, bufa, 0);
7172 for (x = 0; x < dsize.width; x++)
7174 double rho = bufp.at<float>(0, x) * pscale;
7175 double phi = bufa.at<float>(0, x) * ascale;
7177 my[x] = (float)phi + ANGLE_BORDER;
7182 remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
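// Usage sketch for cv::linearPolar above (illustrative, excluded from
// compilation); choosing maxRadius as half the smaller image dimension is an
// assumption.
#if 0
#include <algorithm>
#include <opencv2/imgproc.hpp>
static void example_linearPolar(const cv::Mat& src)
{
    cv::Mat dst;
    cv::Point2f c(src.cols * 0.5f, src.rows * 0.5f);
    double maxRadius = 0.5 * std::min(src.cols, src.rows);
    cv::linearPolar(src, dst, c, maxRadius,
                    cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
}
#endif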