/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{
template<typename T, int shift> struct FixPtCast
{
    typedef int type1;
    typedef T rtype;

    rtype operator ()(type1 arg) const { return (T)((arg + (1 << (shift-1))) >> shift); }
};

template<typename T, int shift> struct FltCast
{
    typedef T type1;
    typedef T rtype;

    rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
};

template<typename T1, typename T2> struct NoVec
{
    int operator()(T1**, T2*, int, int) const { return 0; }
};
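
// The Vec* functors below share one contract with NoVec: operator() processes
// as many leading elements of the current row as the SIMD width allows and
// returns the number of elements handled; the scalar tail loops in
// pyrDown_/pyrUp_ then resume from that index.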

#if CV_SSE2

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i delta = _mm_set1_epi16(128);

        for( ; x <= width - 16; x += 16 )
        {
            __m128i r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 4)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 4)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 4)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 4)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 4)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row0 + x + 12)));
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row1 + x + 12)));
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row2 + x + 12)));
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row3 + x + 12)));
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
                                 _mm_load_si128((const __m128i*)(row4 + x + 12)));
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
            t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
            _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
            r0 = _mm_add_epi16(r0, r4);
            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
            r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
            r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
            *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
        }

        return x;
    }
};

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
        for( ; x <= width - 8; x += 8 )
        {
            __m128 r0, r1, r2, r3, r4, t0, t1;
            r0 = _mm_load_ps(row0 + x);
            r1 = _mm_load_ps(row1 + x);
            r2 = _mm_load_ps(row2 + x);
            r3 = _mm_load_ps(row3 + x);
            r4 = _mm_load_ps(row4 + x);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            r0 = _mm_load_ps(row0 + x + 4);
            r1 = _mm_load_ps(row1 + x + 4);
            r2 = _mm_load_ps(row2 + x + 4);
            r3 = _mm_load_ps(row3 + x + 4);
            r4 = _mm_load_ps(row4 + x + 4);
            r0 = _mm_add_ps(r0, r4);
            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
            t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));

            t0 = _mm_mul_ps(t0, _scale);
            t1 = _mm_mul_ps(t1, _scale);

            _mm_storeu_ps(dst + x, t0);
            _mm_storeu_ps(dst + x + 4, t1);
        }

        return x;
    }
};

typedef NoVec<int, ushort> PyrDownVec_32s16u;
typedef NoVec<int, short> PyrDownVec_32s16s;

typedef NoVec<float, float> PyrUpVec_32f;

#elif CV_NEON

struct PyrDownVec_32s8u
{
    int operator()(int** src, uchar* dst, int, int width) const
    {
        int x = 0;
        const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
                           *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
                           *row4 = (unsigned int*)src[4];
        uint16x8_t v_delta = vdupq_n_u16(128);

        for( ; x <= width - 16; x += 16 )
        {
            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
            uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
            uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));

            v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
            v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst0 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
            v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
            v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));

            v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
            v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
            uint16x8_t v_dst1 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));

            vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
                                          vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
        }

        return x;
    }
};

struct PyrDownVec_32s16u
{
    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8);

            v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8);

            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32s16s
{
    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;
        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        int32x4_t v_delta = vdupq_n_s32(128);

        for( ; x <= width - 8; x += 8 )
        {
            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);

            v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
            v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8);

            v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
            v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
        }

        return x;
    }
};

struct PyrDownVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        int x = 0;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);
            float32x4_t v_r3 = vld1q_f32(row3 + x);
            float32x4_t v_r4 = vld1q_f32(row4 + x);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);
            v_r3 = vld1q_f32(row3 + x + 4);
            v_r4 = vld1q_f32(row4 + x + 4);

            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
            vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
        }

        return x;
    }
};

struct PyrUpVec_32f
{
    int operator()(float** src, float* dst, int, int width) const
    {
        int x = 0;
        float ** dsts = (float **)dst;
        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dsts[0], *dst1 = dsts[1];
        float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_r0 = vld1q_f32(row0 + x);
            float32x4_t v_r1 = vld1q_f32(row1 + x);
            float32x4_t v_r2 = vld1q_f32(row2 + x);

            vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));

            v_r0 = vld1q_f32(row0 + x + 4);
            v_r1 = vld1q_f32(row1 + x + 4);
            v_r2 = vld1q_f32(row2 + x + 4);

            vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
            vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
        }

        return x;
    }
};

#else

typedef NoVec<int, uchar> PyrDownVec_32s8u;
typedef NoVec<int, ushort> PyrDownVec_32s16u;
typedef NoVec<int, short> PyrDownVec_32s16s;
typedef NoVec<float, float> PyrDownVec_32f;

typedef NoVec<float, float> PyrUpVec_32f;

#endif
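
// Both scalar implementations below are separable: each source row is first
// convolved horizontally with [1 4 6 4 1] (pyrDown_) or with the interleaved
// [1 6 1] / [4 4] taps (pyrUp_) into a small ring buffer, and the buffered
// rows are then combined vertically with the same weights. The Vec* functor
// selected above only accelerates that vertical pass.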

template<class CastOp, class VecOp> void
pyrDown_( const Mat& _src, Mat& _dst, int borderType )
{
    const int PD_SZ = 5;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    CV_Assert( !_src.empty() );
    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize(dsize.width*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
    AutoBuffer<int> _tabM(dsize.width*cn);
    int* tabM = _tabM;
    WT* rows[PD_SZ];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );
    int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);

    for( x = 0; x <= PD_SZ+1; x++ )
    {
        int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
        int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
        for( k = 0; k < cn; k++ )
        {
            tabL[x*cn + k] = sx0 + k;
            tabR[x*cn + k] = sx1 + k;
        }
    }

    ssize.width *= cn;
    dsize.width *= cn;
    width0 *= cn;

    for( x = 0; x < dsize.width; x++ )
        tabM[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < dsize.height; y++ )
    {
        T* dst = _dst.ptr<T>(y);
        WT *row0, *row1, *row2, *row3, *row4;

        // fill the ring buffer (horizontal convolution and decimation)
        for( ; sy <= y*2 + 2; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
            int _sy = borderInterpolate(sy, ssize.height, borderType);
            const T* src = _src.ptr<T>(_sy);
            int limit = cn;
            const int* tab = tabL;

            for( x = 0;; )
            {
                for( ; x < limit; x++ )
                {
                    row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
                        src[tab[x]] + src[tab[x+cn*4]];
                }

                if( x == dsize.width )
                    break;

                if( cn == 1 )
                {
                    for( ; x < width0; x++ )
                        row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
                            src[x*2 - 2] + src[x*2 + 2];
                }
                else if( cn == 3 )
                {
                    for( ; x < width0; x += 3 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
                        WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
                        WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
                        row[x] = t0; row[x+1] = t1; row[x+2] = t2;
                    }
                }
                else if( cn == 4 )
                {
                    for( ; x < width0; x += 4 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
                        WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
                        row[x] = t0; row[x+1] = t1;
                        t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
                        t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
                        row[x+2] = t0; row[x+3] = t1;
                    }
                }
                else
                {
                    for( ; x < width0; x++ )
                    {
                        int sx = tabM[x];
                        row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
                            src[sx - cn*2] + src[sx + cn*2];
                    }
                }

                limit = dsize.width;
                tab = tabR - x;
            }
        }

        // do vertical convolution and decimation and write the result to the destination image
        for( k = 0; k < PD_SZ; k++ )
            rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];

        x = vecOp(rows, dst, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
            dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
    }
}
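
// Note on the normalization used by pyrDown_ above: the 5x5 weights sum to
// (1+4+6+4+1)^2 = 16*16 = 256, so the accumulated value is divided by 256 at
// the end. FixPtCast<..., 8> adds 128 and shifts right by 8 on the integer
// paths; FltCast<..., 8> multiplies by 1/256 on the floating-point paths.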

template<class CastOp, class VecOp> void
pyrUp_( const Mat& _src, Mat& _dst, int)
{
    const int PU_SZ = 3;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    AutoBuffer<int> _dtab(ssize.width*cn);
    int* dtab = _dtab;
    WT* rows[PU_SZ];
    T* dsts[2];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
               std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
    int k, x, sy0 = -PU_SZ/2, sy = sy0;

    ssize.width *= cn;
    dsize.width *= cn;

    for( x = 0; x < ssize.width; x++ )
        dtab[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < ssize.height; y++ )
    {
        T* dst0 = _dst.ptr<T>(y*2);
        T* dst1 = _dst.ptr<T>(std::min(y*2+1, dsize.height-1));
        WT *row0, *row1, *row2;

        // fill the ring buffer (horizontal convolution and interpolation)
        for( ; sy <= y + 1; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
            int _sy = borderInterpolate(sy*2, dsize.height, BORDER_REFLECT_101)/2;
            const T* src = _src.ptr<T>(_sy);

            if( ssize.width == cn )
            {
                for( x = 0; x < cn; x++ )
                    row[x] = row[x + cn] = src[x]*8;
                continue;
            }

            for( x = 0; x < cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x]*6 + src[x + cn]*2;
                WT t1 = (src[x] + src[x + cn])*4;
                row[dx] = t0; row[dx + cn] = t1;
                dx = dtab[ssize.width - cn + x];
                int sx = ssize.width - cn + x;
                t0 = src[sx - cn] + src[sx]*7;
                t1 = src[sx]*8;
                row[dx] = t0; row[dx + cn] = t1;
            }

            for( x = cn; x < ssize.width - cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
                WT t1 = (src[x] + src[x+cn])*4;
                row[dx] = t0; row[dx + cn] = t1;
            }
        }

        // do vertical convolution and interpolation and write the result to the destination image
        for( k = 0; k < PU_SZ; k++ )
            rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
        dsts[0] = dst0; dsts[1] = dst1;

        x = vecOp(rows, (T*)dsts, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
        {
            T t1 = castOp((row1[x] + row2[x])*4);
            T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
            dst1[x] = t1; dst0[x] = t0;
        }
    }
}
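
// Note on pyrUp_'s normalization: each 1-D pass uses taps summing to 8 for
// even output positions (1+6+1) and 8 for odd ones (4+4), so every output
// pixel accumulates a total weight of 8*8 = 64. FixPtCast<..., 6> and
// FltCast<..., 6> therefore divide by 64 (2^6).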

typedef void (*PyrFunc)(const Mat&, Mat&, int);

#ifdef HAVE_OPENCL

static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (cn > 4 || (depth == CV_64F && !doubleSupport))
        return false;

    Size ssize = _src.size();
    Size dsize = _dsz.area() == 0 ? Size((ssize.width + 1) / 2, (ssize.height + 1) / 2) : _dsz;
    if (dsize.height < 2 || dsize.width < 2)
        return false;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );

    UMat src = _src.getUMat();
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 256;
    int kercn = 1;
    if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
        kercn = 4;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
            ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
            cn, kercn, float_depth, borderMap[borderType], local_size
            );
    ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));

    size_t localThreads[2] = { local_size/kercn, 1 };
    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
    return k.run(2, globalThreads, localThreads, false);
}

static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);

    if (channels > 4 || borderType != BORDER_DEFAULT)
        return false;

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    Size ssize = _src.size();
    if ((_dsz.area() != 0) && (_dsz != Size(ssize.width * 2, ssize.height * 2)))
        return false;

    UMat src = _src.getUMat();
    Size dsize = Size(ssize.width * 2, ssize.height * 2);
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 16;
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
            ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            ocl::typeToStr(depth), channels, local_size
            );
    size_t globalThreads[2] = { dst.cols, dst.rows };
    size_t localThreads[2] = { local_size, local_size };
    ocl::Kernel k;
    if (ocl::Device::getDefault().isIntel() && channels == 1)
    {
        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
    }
    else
        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);

    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
    return k.run(2, globalThreads, localThreads, false);
}

#endif

}

void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrDown(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::pyrDown(src, dst))
        return;
#endif

#if IPP_VERSION_X100 >= 801 && 0
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    int borderTypeNI = borderType & ~BORDER_ISOLATED;
    if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size((src.cols + 1)/2, (src.rows + 1)/2))
    {
        typedef IppStatus (CV_STDCALL * ippiPyrDown)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
        int type = src.type();
        CV_SUPPRESS_DEPRECATED_START
        ippiPyrDown pyrDownFunc = type == CV_8UC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C1R :
                                  type == CV_8UC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C3R :
                                  type == CV_32FC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C1R :
                                  type == CV_32FC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C3R : 0;
        CV_SUPPRESS_DEPRECATED_END

        if (pyrDownFunc)
        {
            int bufferSize;
            IppiSize srcRoi = { src.cols, src.rows };
            IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
            CV_SUPPRESS_DEPRECATED_START
            IppStatus ok = ippiPyrDownGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
            CV_SUPPRESS_DEPRECATED_END
            if (ok >= 0)
            {
                Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                ok = pyrDownFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                ippsFree(buffer);

                if (ok >= 0)
                    return;
            }
        }
    }
#endif

    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
    else if( depth == CV_16S )
        func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
    else if( depth == CV_16U )
        func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
    else if( depth == CV_32F )
        func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
    else if( depth == CV_64F )
        func = pyrDown_<FltCast<double, 8>, NoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}
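
// Illustrative usage sketch (hypothetical image name, not part of this file):
//     cv::Mat img = cv::imread("input.png"), half;
//     cv::pyrDown(img, half);   // default dsz is ((img.cols+1)/2, (img.rows+1)/2)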

void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType == BORDER_DEFAULT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrUp(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::pyrUp(src, dst))
        return;
#endif

#if IPP_VERSION_X100 >= 801 && 0
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    int borderTypeNI = borderType & ~BORDER_ISOLATED;
    if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
    {
        typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
        int type = src.type();
        CV_SUPPRESS_DEPRECATED_START
        ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
                              type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
                              type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
                              type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
        CV_SUPPRESS_DEPRECATED_END

        if (pyrUpFunc)
        {
            int bufferSize;
            IppiSize srcRoi = { src.cols, src.rows };
            IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
            CV_SUPPRESS_DEPRECATED_START
            IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
            CV_SUPPRESS_DEPRECATED_END
            if (ok >= 0)
            {
                Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                ippsFree(buffer);

                if (ok >= 0)
                    return;
            }
        }
    }
#endif

    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrUp_<FixPtCast<uchar, 6>, NoVec<int, uchar> >;
    else if( depth == CV_16S )
        func = pyrUp_<FixPtCast<short, 6>, NoVec<int, short> >;
    else if( depth == CV_16U )
        func = pyrUp_<FixPtCast<ushort, 6>, NoVec<int, ushort> >;
    else if( depth == CV_32F )
        func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
    else if( depth == CV_64F )
        func = pyrUp_<FltCast<double, 6>, NoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}
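
// pyrUp is not the exact inverse of pyrDown: the downsampling step discards
// high frequencies, so pyrDown followed by pyrUp yields a blurred image.
// Illustrative Laplacian-level sketch (hypothetical names):
//     cv::Mat down, up, lap;
//     cv::pyrDown(img, down);
//     cv::pyrUp(down, up, img.size());
//     cv::subtract(img, up, lap, cv::noArray(), CV_16S);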

void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    if (_src.dims() <= 2 && _dst.isUMatVector())
    {
        UMat src = _src.getUMat();
        _dst.create( maxlevel + 1, 1, 0 );
        _dst.getUMatRef(0) = src;
        for( int i = 1; i <= maxlevel; i++ )
            pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
        return;
    }

    Mat src = _src.getMat();
    _dst.create( maxlevel + 1, 1, 0 );
    _dst.getMatRef(0) = src;

    int i = 1;

#if IPP_VERSION_X100 >= 801 && 0
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    int borderTypeNI = borderType & ~BORDER_ISOLATED;
    if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated))
    {
        typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownInitAlloc)(void** ppState, IppiSize srcRoi, Ipp32f rate, void* pKernel, int kerSize, int mode);
        typedef IppStatus (CV_STDCALL * ippiPyramidLayerDown)(void* pSrc, int srcStep, IppiSize srcRoiSize, void* pDst, int dstStep, IppiSize dstRoiSize, void* pState);
        typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownFree)(void* pState);

        int type = src.type();
        int depth = src.depth();
        ippiPyramidLayerDownInitAlloc pyrInitAllocFunc = 0;
        ippiPyramidLayerDown pyrDownFunc = 0;
        ippiPyramidLayerDownFree pyrFreeFunc = 0;

        if (type == CV_8UC1)
        {
            pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C1R;
            pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C1R;
            pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C1R;
        }
        else if (type == CV_8UC3)
        {
            pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C3R;
            pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C3R;
            pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C3R;
        }
        else if (type == CV_32FC1)
        {
            pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C1R;
            pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C1R;
            pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C1R;
        }
        else if (type == CV_32FC3)
        {
            pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C3R;
            pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C3R;
            pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C3R;
        }

        if (pyrInitAllocFunc && pyrDownFunc && pyrFreeFunc)
        {
            float rate = 2.f;
            IppiSize srcRoi = { src.cols, src.rows };
            IppiPyramid *gPyr;
            IppStatus ok = ippiPyramidInitAlloc(&gPyr, maxlevel + 1, srcRoi, rate);

            Ipp16s iKernel[5] = { 1, 4, 6, 4, 1 };
            Ipp32f fKernel[5] = { 1.f, 4.f, 6.f, 4.f, 1.f };
            void* kernel = depth >= CV_32F ? (void*) fKernel : (void*) iKernel;

            if (ok >= 0) ok = pyrInitAllocFunc((void**) &(gPyr->pState), srcRoi, rate, kernel, 5, IPPI_INTER_LINEAR);
            if (ok >= 0)
            {
                gPyr->pImage[0] = src.data;
                gPyr->pStep[0] = (int) src.step;
                gPyr->pRoi[0] = srcRoi;
                for( ; i <= maxlevel; i++ )
                {
                    IppiSize dstRoi;
                    ok = ippiGetPyramidDownROI(gPyr->pRoi[i-1], &dstRoi, rate);
                    Mat& dst = _dst.getMatRef(i);
                    dst.create(Size(dstRoi.width, dstRoi.height), type);
                    gPyr->pImage[i] = dst.data;
                    gPyr->pStep[i] = (int) dst.step;
                    gPyr->pRoi[i] = dstRoi;

                    if (ok >= 0) ok = pyrDownFunc(gPyr->pImage[i-1], gPyr->pStep[i-1], gPyr->pRoi[i-1],
                                                  gPyr->pImage[i], gPyr->pStep[i], gPyr->pRoi[i], gPyr->pState);
                    if (ok < 0)
                        break;
                }

                pyrFreeFunc(gPyr->pState);
            }

            ippiPyramidFree(gPyr);

            if (ok >= 0)
                return;
        }
    }
#endif

    for( ; i <= maxlevel; i++ )
        pyrDown( _dst.getMatRef(i-1), _dst.getMatRef(i), Size(), borderType );
}
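
// Illustrative usage sketch (hypothetical names): pyr[0] is the source image
// itself and each further level is the pyrDown of the previous one.
//     std::vector<cv::Mat> pyr;
//     cv::buildPyramid(img, pyr, 3);   // produces 4 levels: pyr[0] .. pyr[3]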

CV_IMPL void cvPyrDown( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrDown( src, dst, dst.size() );
}

CV_IMPL void cvPyrUp( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrUp( src, dst, dst.size() );
}

CV_IMPL void
cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
{
    if( !_pyramid )
        CV_Error( CV_StsNullPtr, "" );

    if( *_pyramid )
        for( int i = 0; i <= extra_layers; i++ )
            cvReleaseMat( &(*_pyramid)[i] );

    cvFree( _pyramid );
}

CV_IMPL CvMat**
cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
                 const CvSize* layer_sizes, CvArr* bufarr,
                 int calc, int filter )
{
    const float eps = 0.1f;
    uchar* ptr = 0;

    CvMat stub, *src = cvGetMat( srcarr, &stub );

    if( extra_layers < 0 )
        CV_Error( CV_StsOutOfRange, "The number of extra layers must be non negative" );

    int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
    CvSize layer_size, size = cvGetMatSize(src);

    if( bufarr )
    {
        CvMat bstub, *buf;
        int bufsize = 0;

        buf = cvGetMat( bufarr, &bstub );
        bufsize = buf->rows*buf->cols*CV_ELEM_SIZE(buf->type);
        layer_size = size;
        for( i = 1; i <= extra_layers; i++ )
        {
            if( !layer_sizes )
            {
                layer_size.width = cvRound(layer_size.width*rate+eps);
                layer_size.height = cvRound(layer_size.height*rate+eps);
            }
            else
                layer_size = layer_sizes[i-1];
            layer_step = layer_size.width*elem_size;
            bufsize -= layer_step*layer_size.height;
        }

        if( bufsize < 0 )
            CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
        ptr = buf->data.ptr;
    }

    CvMat** pyramid = (CvMat**)cvAlloc( (extra_layers+1)*sizeof(pyramid[0]) );
    memset( pyramid, 0, (extra_layers+1)*sizeof(pyramid[0]) );

    pyramid[0] = cvCreateMatHeader( size.height, size.width, src->type );
    cvSetData( pyramid[0], src->data.ptr, src->step );

    layer_size = size;
    for( i = 1; i <= extra_layers; i++ )
    {
        if( !layer_sizes )
        {
            layer_size.width = cvRound(layer_size.width*rate + eps);
            layer_size.height = cvRound(layer_size.height*rate + eps);
        }
        else
            layer_size = layer_sizes[i];

        if( bufarr )
        {
            pyramid[i] = cvCreateMatHeader( layer_size.height, layer_size.width, src->type );
            layer_step = layer_size.width*elem_size;
            cvSetData( pyramid[i], ptr, layer_step );
            ptr += layer_step*layer_size.height;
        }
        else
            pyramid[i] = cvCreateMat( layer_size.height, layer_size.width, src->type );

        if( calc )
            cvPyrDown( pyramid[i-1], pyramid[i], filter );
            //cvResize( pyramid[i-1], pyramid[i], CV_INTER_LINEAR );
    }

    return pyramid;
}