1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
21 //   * Redistributions of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
24 //   * Redistributions in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
44 #include "precomp.hpp"
45 #include "opencl_kernels_imgproc.hpp"
47 #include "opencv2/core/openvx/ovx_defs.hpp"
52 template<typename T, int shift> struct FixPtCast
56 rtype operator ()(type1 arg) const { return (T)((arg + (1 << (shift-1))) >> shift); }
59 template<typename T, int shift> struct FltCast
63 rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
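// FixPtCast converts a fixed-point accumulator back to the storage type with
// rounding-to-nearest, e.g. FixPtCast<uchar, 8>()(v) == (uchar)((v + 128) >> 8);
// FltCast simply rescales a floating-point accumulator by 1/(1 << shift).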
66 template<typename T1, typename T2> struct PyrDownNoVec
68 int operator()(T1**, T2*, int, int) const { return 0; }
71 template<typename T1, typename T2> struct PyrUpNoVec
73 int operator()(T1**, T2**, int, int) const { return 0; }
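// The *NoVec functors are the portable fallbacks: they process no pixels and
// return 0, so the scalar tail loops in pyrDown_/pyrUp_ cover the whole row.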
78 struct PyrDownVec_32s8u
80 int operator()(int** src, uchar* dst, int, int width) const
82 if( !checkHardwareSupport(CV_CPU_SSE2) )
86 const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
87 __m128i delta = _mm_set1_epi16(128);
89 for( ; x <= width - 16; x += 16 )
91 __m128i r0, r1, r2, r3, r4, t0, t1;
92 r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
93 _mm_load_si128((const __m128i*)(row0 + x + 4)));
94 r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
95 _mm_load_si128((const __m128i*)(row1 + x + 4)));
96 r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
97 _mm_load_si128((const __m128i*)(row2 + x + 4)));
98 r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
99 _mm_load_si128((const __m128i*)(row3 + x + 4)));
100 r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
101 _mm_load_si128((const __m128i*)(row4 + x + 4)));
102 r0 = _mm_add_epi16(r0, r4);
103 r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
104 r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
105 t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
106 r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
107 _mm_load_si128((const __m128i*)(row0 + x + 12)));
108 r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
109 _mm_load_si128((const __m128i*)(row1 + x + 12)));
110 r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
111 _mm_load_si128((const __m128i*)(row2 + x + 12)));
112 r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
113 _mm_load_si128((const __m128i*)(row3 + x + 12)));
114 r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
115 _mm_load_si128((const __m128i*)(row4 + x + 12)));
116 r0 = _mm_add_epi16(r0, r4);
117 r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
118 r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
119 t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
120 t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
121 t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
122 _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
125 for( ; x <= width - 4; x += 4 )
127 __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
128 r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
129 r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
130 r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
131 r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
132 r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
133 r0 = _mm_add_epi16(r0, r4);
134 r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
135 r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
136 r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
137 r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
138 *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
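// Each iteration above evaluates the same expression as the scalar tail loop,
// i.e. per output pixel
//
//     dst[x] = castOp(row0[x] + 4*row1[x] + 6*row2[x] + 4*row3[x] + row4[x]);
//
// the vertical 1-4-6-4-1 pass of the 5x5 Gaussian, where for the 8-bit path
// castOp is FixPtCast<uchar, 8>, i.e. (v + 128) >> 8 (the delta/shift used here).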
145 struct PyrDownVec_32f
147 int operator()(float** src, float* dst, int, int width) const
149 if( !checkHardwareSupport(CV_CPU_SSE) )
153 const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
154 __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
155 for( ; x <= width - 8; x += 8 )
157 __m128 r0, r1, r2, r3, r4, t0, t1;
158 r0 = _mm_load_ps(row0 + x);
159 r1 = _mm_load_ps(row1 + x);
160 r2 = _mm_load_ps(row2 + x);
161 r3 = _mm_load_ps(row3 + x);
162 r4 = _mm_load_ps(row4 + x);
163 r0 = _mm_add_ps(r0, r4);
164 r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
165 r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
166 t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
168 r0 = _mm_load_ps(row0 + x + 4);
169 r1 = _mm_load_ps(row1 + x + 4);
170 r2 = _mm_load_ps(row2 + x + 4);
171 r3 = _mm_load_ps(row3 + x + 4);
172 r4 = _mm_load_ps(row4 + x + 4);
173 r0 = _mm_add_ps(r0, r4);
174 r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
175 r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
176 t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
178 t0 = _mm_mul_ps(t0, _scale);
179 t1 = _mm_mul_ps(t1, _scale);
181 _mm_storeu_ps(dst + x, t0);
182 _mm_storeu_ps(dst + x + 4, t1);
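// The float path applies the same 1-4-6-4-1 vertical weights and multiplies by
// 1/256 (the total weight of the separable 5x5 kernel), which matches the
// FltCast<float, 8> cast used by the scalar tail.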
191 struct PyrDownVec_32s16u
195 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
198 int operator()(int** src, ushort* dst, int, int width) const
205 const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
206 __m128i v_delta = _mm_set1_epi32(128);
208 for( ; x <= width - 8; x += 8 )
210 __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
211 v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
212 __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
213 v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
214 __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
215 v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
216 __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
217 v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
218 __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
219 v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
221 v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
222 v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
224 v_r10 = _mm_slli_epi32(v_r10, 2);
225 __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
227 v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
228 v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
229 v_r11 = _mm_slli_epi32(v_r11, 2);
230 __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
232 _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
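// SSE4.1 is required here because _mm_packus_epi32 (pack 32-bit integers to
// 16-bit with unsigned saturation) is an SSE4.1 instruction; the CV_16S variant
// below only needs SSE2, where _mm_packs_epi32 (signed saturation) is available.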
243 typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
247 struct PyrDownVec_32s16s
251 haveSSE = checkHardwareSupport(CV_CPU_SSE2);
254 int operator()(int** src, short* dst, int, int width) const
261 const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
262 __m128i v_delta = _mm_set1_epi32(128);
264 for( ; x <= width - 8; x += 8 )
266 __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
267 v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
268 __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
269 v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
270 __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
271 v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
272 __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
273 v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
274 __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
275 v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
277 v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
278 v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
280 v_r10 = _mm_slli_epi32(v_r10, 2);
281 __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
283 v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
284 v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
285 v_r11 = _mm_slli_epi32(v_r11, 2);
286 __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
288 _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
297 struct PyrUpVec_32s8u
299 int operator()(int** src, uchar** dst, int, int width) const
303 if (!checkHardwareSupport(CV_CPU_SSE2))
306 uchar *dst0 = dst[0], *dst1 = dst[1];
307 const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
308 __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
310 for( ; x <= width - 16; x += 16 )
312 __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
313 _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
314 __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
315 _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
316 __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
317 _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
319 __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
320 __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
321 __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
323 v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
324 _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
325 v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
326 _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
327 v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
328 _mm_loadu_si128((__m128i const *)(row2 + x + 12)));
330 v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
331 __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
332 __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
334 _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
335 _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
336 _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
337 _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
340 for( ; x <= width - 8; x += 8 )
342 __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
343 _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
344 __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
345 _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
346 __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
347 _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
349 __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
350 __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
351 __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
353 _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
354 _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
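// Per output pixel the two destination rows match the scalar tail of pyrUp_:
//
//     dst0[x] = (row0[x] + 6*row1[x] + row2[x] + 32) >> 6;   // even output row
//     dst1[x] = (4*(row1[x] + row2[x]) + 32) >> 6;           // odd output row
//
// The input rows were already horizontally filtered with weights summing to 8,
// so the combined normalization is 64.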
361 struct PyrUpVec_32s16s
363 int operator()(int** src, short** dst, int, int width) const
367 if (!checkHardwareSupport(CV_CPU_SSE2))
370 short *dst0 = dst[0], *dst1 = dst[1];
371 const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
372 __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
374 for( ; x <= width - 8; x += 8 )
376 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
377 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
378 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
379 __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
380 __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
381 __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
383 v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
384 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
385 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
386 v_2r1 = _mm_slli_epi32(v_r1, 1);
387 v_4r1 = _mm_slli_epi32(v_r1, 2);
388 __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
389 __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
391 _mm_storeu_si128((__m128i *)(dst0 + x),
392 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
393 _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
394 _mm_storeu_si128((__m128i *)(dst1 + x),
395 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
396 _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
399 for( ; x <= width - 4; x += 4 )
401 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
402 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
403 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
404 __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
406 __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
407 __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
409 _mm_storel_epi64((__m128i *)(dst0 + x),
410 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
411 _mm_storel_epi64((__m128i *)(dst1 + x),
412 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
421 struct PyrUpVec_32s16u
423 int operator()(int** src, ushort** dst, int, int width) const
427 if (!checkHardwareSupport(CV_CPU_SSE4_1))
430 ushort *dst0 = dst[0], *dst1 = dst[1];
431 const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
432 __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
434 for( ; x <= width - 8; x += 8 )
436 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
437 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
438 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
439 __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
440 __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
441 __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
443 v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
444 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
445 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
446 v_2r1 = _mm_slli_epi32(v_r1, 1);
447 v_4r1 = _mm_slli_epi32(v_r1, 2);
448 __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
449 __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
451 _mm_storeu_si128((__m128i *)(dst0 + x),
452 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
453 _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
454 _mm_storeu_si128((__m128i *)(dst1 + x),
455 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
456 _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
459 for( ; x <= width - 4; x += 4 )
461 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
462 v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
463 v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
464 __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
466 __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
467 __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
469 _mm_storel_epi64((__m128i *)(dst0 + x),
470 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
471 _mm_storel_epi64((__m128i *)(dst1 + x),
472 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
481 typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
487 int operator()(float** src, float** dst, int, int width) const
491 if (!checkHardwareSupport(CV_CPU_SSE2))
494 const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
495 float *dst0 = dst[0], *dst1 = dst[1];
496 __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
497 v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
499 for( ; x <= width - 8; x += 8 )
501 __m128 v_r0 = _mm_loadu_ps(row0 + x);
502 __m128 v_r1 = _mm_loadu_ps(row1 + x);
503 __m128 v_r2 = _mm_loadu_ps(row2 + x);
505 _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
506 _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
508 v_r0 = _mm_loadu_ps(row0 + x + 4);
509 v_r1 = _mm_loadu_ps(row1 + x + 4);
510 v_r2 = _mm_loadu_ps(row2 + x + 4);
512 _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
513 _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
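// Same weights as the integer PyrUp paths, normalized in floating point:
// dst0 receives (r0 + 6*r1 + r2) * (1/64) and dst1 receives (r1 + r2) * (4/64).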
522 struct PyrDownVec_32s8u
524 int operator()(int** src, uchar* dst, int, int width) const
527 const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
528 *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
529 *row4 = (unsigned int*)src[4];
530 uint16x8_t v_delta = vdupq_n_u16(128);
532 for( ; x <= width - 16; x += 16 )
534 uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
535 uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
536 uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
537 uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
538 uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));
540 v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
541 v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
542 uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
544 v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
545 v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
546 v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
547 v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
548 v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));
550 v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
551 v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
552 uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
554 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
555 vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
562 struct PyrDownVec_32s16u
564 int operator()(int** src, ushort* dst, int, int width) const
567 const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
568 int32x4_t v_delta = vdupq_n_s32(128);
570 for( ; x <= width - 8; x += 8 )
572 int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
573 int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
574 int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
575 int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
576 int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
578 v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
579 v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
581 v_r10 = vshlq_n_s32(v_r10, 2);
582 int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
584 v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
585 v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
586 v_r11 = vshlq_n_s32(v_r11, 2);
587 int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
589 vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
596 struct PyrDownVec_32s16s
598 int operator()(int** src, short* dst, int, int width) const
601 const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
602 int32x4_t v_delta = vdupq_n_s32(128);
604 for( ; x <= width - 8; x += 8 )
606 int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
607 int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
608 int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
609 int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
610 int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
612 v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
613 v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
614 v_r10 = vshlq_n_s32(v_r10, 2);
615 int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
617 v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
618 v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
619 v_r11 = vshlq_n_s32(v_r11, 2);
620 int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
622 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
629 struct PyrDownVec_32f
631 int operator()(float** src, float* dst, int, int width) const
634 const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
635 float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);
637 for( ; x <= width - 8; x += 8 )
639 float32x4_t v_r0 = vld1q_f32(row0 + x);
640 float32x4_t v_r1 = vld1q_f32(row1 + x);
641 float32x4_t v_r2 = vld1q_f32(row2 + x);
642 float32x4_t v_r3 = vld1q_f32(row3 + x);
643 float32x4_t v_r4 = vld1q_f32(row4 + x);
645 v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
646 v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
647 vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
649 v_r0 = vld1q_f32(row0 + x + 4);
650 v_r1 = vld1q_f32(row1 + x + 4);
651 v_r2 = vld1q_f32(row2 + x + 4);
652 v_r3 = vld1q_f32(row3 + x + 4);
653 v_r4 = vld1q_f32(row4 + x + 4);
655 v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
656 v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
657 vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
664 struct PyrUpVec_32s8u
666 int operator()(int** src, uchar** dst, int, int width) const
669 uchar *dst0 = dst[0], *dst1 = dst[1];
670 const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
671 uint16x8_t v_delta = vdupq_n_u16(32);
673 for( ; x <= width - 16; x += 16 )
675 uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
676 uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
677 uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
679 uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
680 uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
681 uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
683 v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
684 v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
685 v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
687 v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
688 uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
689 uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
691 vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
692 vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
693 vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
694 vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
697 for( ; x <= width - 8; x += 8 )
699 uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
700 uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
701 uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
703 uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
704 uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
705 uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
707 vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
708 vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
715 struct PyrUpVec_32s16u
717 int operator()(int** src, ushort** dst, int, int width) const
720 ushort *dst0 = dst[0], *dst1 = dst[1];
721 const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
722 uint32x4_t v_delta = vdupq_n_u32(32);
724 for( ; x <= width - 8; x += 8 )
726 uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
727 uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
728 uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
729 uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
731 v_r0 = vld1q_u32(row0 + x + 4);
732 v_r1 = vld1q_u32(row1 + x + 4);
733 v_r2 = vld1q_u32(row2 + x + 4);
734 v_2r1 = vshlq_n_u32(v_r1, 1);
735 v_4r1 = vshlq_n_u32(v_r1, 2);
736 uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
737 uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
739 vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
740 vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
741 vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
742 vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
745 for( ; x <= width - 4; x += 4 )
747 uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
748 uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
750 uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
751 uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
753 vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
754 vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
761 struct PyrUpVec_32s16s
763 int operator()(int** src, short** dst, int, int width) const
766 short *dst0 = dst[0], *dst1 = dst[1];
767 const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
768 int32x4_t v_delta = vdupq_n_s32(32);
770 for( ; x <= width - 8; x += 8 )
772 int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
773 int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
774 int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
775 int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
777 v_r0 = vld1q_s32(row0 + x + 4);
778 v_r1 = vld1q_s32(row1 + x + 4);
779 v_r2 = vld1q_s32(row2 + x + 4);
780 v_2r1 = vshlq_n_s32(v_r1, 1);
781 v_4r1 = vshlq_n_s32(v_r1, 2);
782 int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
783 int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
785 vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
786 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
787 vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
788 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
791 for( ; x <= width - 4; x += 4 )
793 int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
794 int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
796 int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
797 int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
799 vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
800 vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
809 int operator()(float** src, float** dst, int, int width) const
812 const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
813 float *dst0 = dst[0], *dst1 = dst[1];
814 float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);
816 for( ; x <= width - 8; x += 8 )
818 float32x4_t v_r0 = vld1q_f32(row0 + x);
819 float32x4_t v_r1 = vld1q_f32(row1 + x);
820 float32x4_t v_r2 = vld1q_f32(row2 + x);
822 vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
823 vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
825 v_r0 = vld1q_f32(row0 + x + 4);
826 v_r1 = vld1q_f32(row1 + x + 4);
827 v_r2 = vld1q_f32(row2 + x + 4);
829 vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
830 vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
839 typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
840 typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
841 typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
842 typedef PyrDownNoVec<float, float> PyrDownVec_32f;
844 typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
845 typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
846 typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
847 typedef PyrUpNoVec<float, float> PyrUpVec_32f;
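// When neither SSE nor NEON is available, all of the Vec functors above
// collapse to the NoVec templates and only the scalar loops below are used.

// pyrDown_ keeps a ring buffer of PD_SZ (5) horizontally filtered rows in buf.
// For each destination row it first convolves the required source rows with the
// 1-4-6-4-1 kernel and decimates by two horizontally, using tabL/tabM/tabR to
// resolve the border columns, and then combines five buffered rows with the
// same weights vertically (VecOp first, scalar tail after), so every output
// pixel is the source filtered with the separable 5x5 Gaussian (total weight
// 256) and downsampled by two in both directions.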
851 template<class CastOp, class VecOp> void
852 pyrDown_( const Mat& _src, Mat& _dst, int borderType )
855 typedef typename CastOp::type1 WT;
856 typedef typename CastOp::rtype T;
858 CV_Assert( !_src.empty() );
859 Size ssize = _src.size(), dsize = _dst.size();
860 int cn = _src.channels();
861 int bufstep = (int)alignSize(dsize.width*cn, 16);
862 AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
863 WT* buf = alignPtr((WT*)_buf, 16);
864 int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
865 AutoBuffer<int> _tabM(dsize.width*cn);
871 CV_Assert( ssize.width > 0 && ssize.height > 0 &&
872 std::abs(dsize.width*2 - ssize.width) <= 2 &&
873 std::abs(dsize.height*2 - ssize.height) <= 2 );
874 int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
876 for( x = 0; x <= PD_SZ+1; x++ )
878 int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
879 int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
880 for( k = 0; k < cn; k++ )
882 tabL[x*cn + k] = sx0 + k;
883 tabR[x*cn + k] = sx1 + k;
891 for( x = 0; x < dsize.width; x++ )
892 tabM[x] = (x/cn)*2*cn + x % cn;
894 for( int y = 0; y < dsize.height; y++ )
896 T* dst = _dst.ptr<T>(y);
897 WT *row0, *row1, *row2, *row3, *row4;
899 // fill the ring buffer (horizontal convolution and decimation)
900 for( ; sy <= y*2 + 2; sy++ )
902 WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
903 int _sy = borderInterpolate(sy, ssize.height, borderType);
904 const T* src = _src.ptr<T>(_sy);
906 const int* tab = tabL;
910 for( ; x < limit; x++ )
912 row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
913 src[tab[x]] + src[tab[x+cn*4]];
916 if( x == dsize.width )
921 for( ; x < width0; x++ )
922 row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
923 src[x*2 - 2] + src[x*2 + 2];
927 for( ; x < width0; x += 3 )
929 const T* s = src + x*2;
930 WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
931 WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
932 WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
933 row[x] = t0; row[x+1] = t1; row[x+2] = t2;
938 for( ; x < width0; x += 4 )
940 const T* s = src + x*2;
941 WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
942 WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
943 row[x] = t0; row[x+1] = t1;
944 t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
945 t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
946 row[x+2] = t0; row[x+3] = t1;
951 for( ; x < width0; x++ )
954 row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
955 src[sx - cn*2] + src[sx + cn*2];
964 // do vertical convolution and decimation and write the result to the destination image
965 for( k = 0; k < PD_SZ; k++ )
966 rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
967 row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];
969 x = vecOp(rows, dst, (int)_dst.step, dsize.width);
970 for( ; x < dsize.width; x++ )
971 dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
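// pyrUp_ mirrors this: a ring buffer of PU_SZ (3) horizontally interpolated
// rows produces two destination rows per source row. Even output samples use
// weights (1, 6, 1) and odd ones (4, 4), both horizontally and vertically, so
// each output pixel is normalized by 64 via the cast operator (shift = 6).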
976 template<class CastOp, class VecOp> void
977 pyrUp_( const Mat& _src, Mat& _dst, int)
980 typedef typename CastOp::type1 WT;
981 typedef typename CastOp::rtype T;
983 Size ssize = _src.size(), dsize = _dst.size();
984 int cn = _src.channels();
985 int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
986 AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
987 WT* buf = alignPtr((WT*)_buf, 16);
988 AutoBuffer<int> _dtab(ssize.width*cn);
995 CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
996 std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
997 int k, x, sy0 = -PU_SZ/2, sy = sy0;
1002 for( x = 0; x < ssize.width; x++ )
1003 dtab[x] = (x/cn)*2*cn + x % cn;
1005 for( int y = 0; y < ssize.height; y++ )
1007 T* dst0 = _dst.ptr<T>(y*2);
1008 T* dst1 = _dst.ptr<T>(std::min(y*2+1, dsize.height-1));
1009 WT *row0, *row1, *row2;
1011 // fill the ring buffer (horizontal convolution and upsampling)
1012 for( ; sy <= y + 1; sy++ )
1014 WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
1015 int _sy = borderInterpolate(sy*2, ssize.height*2, BORDER_REFLECT_101)/2;
1016 const T* src = _src.ptr<T>(_sy);
1018 if( ssize.width == cn )
1020 for( x = 0; x < cn; x++ )
1021 row[x] = row[x + cn] = src[x]*8;
1025 for( x = 0; x < cn; x++ )
1028 WT t0 = src[x]*6 + src[x + cn]*2;
1029 WT t1 = (src[x] + src[x + cn])*4;
1030 row[dx] = t0; row[dx + cn] = t1;
1031 dx = dtab[ssize.width - cn + x];
1032 int sx = ssize.width - cn + x;
1033 t0 = src[sx - cn] + src[sx]*7;
1035 row[dx] = t0; row[dx + cn] = t1;
1037 if (dsize.width > ssize.width*2)
1039 row[(_dst.cols-1) + x] = row[dx + cn];
1043 for( x = cn; x < ssize.width - cn; x++ )
1046 WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
1047 WT t1 = (src[x] + src[x+cn])*4;
1053 // do vertical convolution and upsampling and write the result to the destination image
1054 for( k = 0; k < PU_SZ; k++ )
1055 rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
1056 row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
1057 dsts[0] = dst0; dsts[1] = dst1;
1059 x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
1060 for( ; x < dsize.width; x++ )
1062 T t1 = castOp((row1[x] + row2[x])*4);
1063 T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
1064 dst1[x] = t1; dst0[x] = t0;
1068 if (dsize.height > ssize.height*2)
1070 T* dst0 = _dst.ptr<T>(ssize.height*2-2);
1071 T* dst2 = _dst.ptr<T>(ssize.height*2);
1073 for(x = 0; x < dsize.width ; x++ )
1080 typedef void (*PyrFunc)(const Mat&, Mat&, int);
1084 static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
1086 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1088 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
1089 if (cn > 4 || (depth == CV_64F && !doubleSupport))
1092 Size ssize = _src.size();
1093 Size dsize = _dsz.area() == 0 ? Size((ssize.width + 1) / 2, (ssize.height + 1) / 2) : _dsz;
1094 if (dsize.height < 2 || dsize.width < 2)
1097 CV_Assert( ssize.width > 0 && ssize.height > 0 &&
1098 std::abs(dsize.width*2 - ssize.width) <= 2 &&
1099 std::abs(dsize.height*2 - ssize.height) <= 2 );
1101 UMat src = _src.getUMat();
1102 _dst.create( dsize, src.type() );
1103 UMat dst = _dst.getUMat();
1105 int float_depth = depth == CV_64F ? CV_64F : CV_32F;
1106 const int local_size = 256;
1108 if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
1110 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
1111 "BORDER_REFLECT_101" };
1113 String buildOptions = format(
1114 "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
1115 "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
1116 ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
1117 ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
1118 ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
1119 doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
1120 cn, kercn, float_depth, borderMap[borderType], local_size
1122 ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
1126 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
1128 size_t localThreads[2] = { (size_t)local_size/kercn, 1 };
1129 size_t globalThreads[2] = { ((size_t)src.cols + (kercn-1))/kercn, ((size_t)dst.rows + 1) / 2 };
1130 return k.run(2, globalThreads, localThreads, false);
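// The OpenCL path bakes the element types, the convert macros, the border mode
// and LOCAL_SIZE into the kernel build options and runs the "pyrDown" kernel
// from pyr_down_oclsrc; Intel devices get a specialized configuration for the
// single-channel 8-bit case.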
1133 static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
1135 int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);
1137 if (channels > 4 || borderType != BORDER_DEFAULT)
1140 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
1141 if (depth == CV_64F && !doubleSupport)
1144 Size ssize = _src.size();
1145 if ((_dsz.area() != 0) && (_dsz != Size(ssize.width * 2, ssize.height * 2)))
1148 UMat src = _src.getUMat();
1149 Size dsize = Size(ssize.width * 2, ssize.height * 2);
1150 _dst.create( dsize, src.type() );
1151 UMat dst = _dst.getUMat();
1153 int float_depth = depth == CV_64F ? CV_64F : CV_32F;
1154 const int local_size = 16;
1156 String buildOptions = format(
1157 "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
1158 "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
1159 ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
1160 ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
1161 ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
1162 doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1163 ocl::typeToStr(depth), channels, local_size
1165 size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
1166 size_t localThreads[2] = { (size_t)local_size, (size_t)local_size };
1168 if (ocl::Device::getDefault().isIntel() && channels == 1)
1170 if (type == CV_8UC1 && src.cols % 2 == 0)
1172 buildOptions.clear();
1173 k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
1174 globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
1178 k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
1179 globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
1183 k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);
1188 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
1189 return k.run(2, globalThreads, localThreads, false);
1196 #if defined(HAVE_IPP)
1199 static bool ipp_pyrdown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
1201 CV_INSTRUMENT_REGION_IPP()
1203 #if IPP_VERSION_X100 >= 810 && !IPP_DISABLE_PYRAMIDS_DOWN
1204 Size dsz = _dsz.area() == 0 ? Size((_src.cols() + 1)/2, (_src.rows() + 1)/2) : _dsz;
1205 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1206 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1208 Mat src = _src.getMat();
1209 _dst.create( dsz, src.type() );
1210 Mat dst = _dst.getMat();
1211 int depth = src.depth();
1215 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1216 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1217 if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
1219 typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
1220 int type = src.type();
1221 CV_SUPPRESS_DEPRECATED_START
1222 ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
1223 type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
1224 type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
1225 type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
1226 CV_SUPPRESS_DEPRECATED_END
1231 IppiSize srcRoi = { src.cols, src.rows };
1232 IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
1233 CV_SUPPRESS_DEPRECATED_START
1234 IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
1235 CV_SUPPRESS_DEPRECATED_END
1238 Ipp8u* buffer = ippsMalloc_8u_L(bufferSize);
1239 ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
1244 CV_IMPL_ADD(CV_IMPL_IPP);
1252 CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_dsz); CV_UNUSED(borderType);
1262 static bool openvx_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
1264 using namespace ivx;
1266 Mat srcMat = _src.getMat();
1268 if (ovx::skipSmallImages<VX_KERNEL_HALFSCALE_GAUSSIAN>(srcMat.cols, srcMat.rows))
1271 CV_Assert(!srcMat.empty());
1273 Size ssize = _src.size();
1274 Size acceptableSize = Size((ssize.width + 1) / 2, (ssize.height + 1) / 2);
1276 // OpenVX limitations
1277 if((srcMat.type() != CV_8U) ||
1278 (borderType != BORDER_REPLICATE) ||
1279 (_dsz != acceptableSize && _dsz.area() != 0))
1282 // The only border mode which is supported by both cv::pyrDown() and OpenVX
1283 // and produces predictable results
1284 ivx::border_t borderMode;
1285 borderMode.mode = VX_BORDER_REPLICATE;
1287 _dst.create( acceptableSize, srcMat.type() );
1288 Mat dstMat = _dst.getMat();
1290 CV_Assert( ssize.width > 0 && ssize.height > 0 &&
1291 std::abs(acceptableSize.width*2 - ssize.width) <= 2 &&
1292 std::abs(acceptableSize.height*2 - ssize.height) <= 2 );
1296 Context context = ovx::getOpenVXContext();
1297 if(context.vendorID() == VX_ID_KHRONOS)
1299 // This implementation performs floor-like rounding
1300 // (OpenCV uses floor(x+0.5)-like rounding)
1301 // and ignores the border mode (losing a 1-pixel border)
1305 Image srcImg = Image::createFromHandle(context, Image::matTypeToFormat(srcMat.type()),
1306 Image::createAddressing(srcMat), (void*)srcMat.data);
1307 Image dstImg = Image::createFromHandle(context, Image::matTypeToFormat(dstMat.type()),
1308 Image::createAddressing(dstMat), (void*)dstMat.data);
1310 ivx::Scalar kernelSize = ivx::Scalar::create<VX_TYPE_INT32>(context, 5);
1311 Graph graph = Graph::create(context);
1312 ivx::Node halfNode = ivx::Node::create(graph, VX_KERNEL_HALFSCALE_GAUSSIAN, srcImg, dstImg, kernelSize);
1313 halfNode.setBorder(borderMode);
1317 #ifdef VX_VERSION_1_1
1318 // we should take the user memory back before release
1319 // (it is not done automatically, according to the standard)
1320 srcImg.swapHandle(); dstImg.swapHandle();
1323 catch (RuntimeError & e)
1325 VX_DbgThrow(e.what());
1327 catch (WrapperError & e)
1329 VX_DbgThrow(e.what());
1338 void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
1340 CV_INSTRUMENT_REGION()
1342 CV_Assert(borderType != BORDER_CONSTANT);
1344 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
1345 ocl_pyrDown(_src, _dst, _dsz, borderType))
1347 CV_OVX_RUN(_src.dims() <= 2,
1348 openvx_pyrDown(_src, _dst, _dsz, borderType))
1350 Mat src = _src.getMat();
1351 Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
1352 _dst.create( dsz, src.type() );
1353 Mat dst = _dst.getMat();
1354 int depth = src.depth();
1356 CALL_HAL(pyrDown, cv_hal_pyrdown, src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, depth, src.channels(), borderType);
1359 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1360 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1362 CV_IPP_RUN(borderTypeNI == BORDER_DEFAULT && (!_src.isSubmatrix() || isolated) && dsz == Size((_src.cols() + 1)/2, (_src.rows() + 1)/2),
1363 ipp_pyrdown( _src, _dst, _dsz, borderType));
1367 if( depth == CV_8U )
1368 func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
1369 else if( depth == CV_16S )
1370 func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
1371 else if( depth == CV_16U )
1372 func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
1373 else if( depth == CV_32F )
1374 func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
1375 else if( depth == CV_64F )
1376 func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
1378 CV_Error( CV_StsUnsupportedFormat, "" );
1380 func( src, dst, borderType );
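// A minimal usage sketch (the file name is only illustrative):
//
//     cv::Mat img = cv::imread("input.png");   // any 8U/16U/16S/32F/64F image
//     cv::Mat half;
//     cv::pyrDown(img, half);   // default size: ((cols + 1)/2, (rows + 1)/2)
//
// The source is smoothed with the 5x5 Gaussian above and every second row and
// column is kept.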
1384 #if defined(HAVE_IPP)
1387 static bool ipp_pyrup( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
1389 CV_INSTRUMENT_REGION_IPP()
1391 #if IPP_VERSION_X100 >= 810 && !IPP_DISABLE_PYRAMIDS_UP
1392 Size sz = _src.dims() <= 2 ? _src.size() : Size();
1393 Size dsz = _dsz.area() == 0 ? Size(_src.cols()*2, _src.rows()*2) : _dsz;
1395 Mat src = _src.getMat();
1396 _dst.create( dsz, src.type() );
1397 Mat dst = _dst.getMat();
1398 int depth = src.depth();
1401 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1402 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1403 if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
1405 typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
1406 int type = src.type();
1407 CV_SUPPRESS_DEPRECATED_START
1408 ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
1409 type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
1410 type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
1411 type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
1412 CV_SUPPRESS_DEPRECATED_END
1417 IppiSize srcRoi = { src.cols, src.rows };
1418 IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
1419 CV_SUPPRESS_DEPRECATED_START
1420 IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
1421 CV_SUPPRESS_DEPRECATED_END
1424 Ipp8u* buffer = ippsMalloc_8u_L(bufferSize);
1425 ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
1430 CV_IMPL_ADD(CV_IMPL_IPP);
1438 CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_dsz); CV_UNUSED(borderType);
1445 void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
1447 CV_INSTRUMENT_REGION()
1449 CV_Assert(borderType == BORDER_DEFAULT);
1451 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
1452 ocl_pyrUp(_src, _dst, _dsz, borderType))
1455 Mat src = _src.getMat();
1456 Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
1457 _dst.create( dsz, src.type() );
1458 Mat dst = _dst.getMat();
1459 int depth = src.depth();
1462 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1463 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1465 CV_IPP_RUN(borderTypeNI == BORDER_DEFAULT && (!_src.isSubmatrix() || isolated) && dsz == Size(_src.cols()*2, _src.rows()*2),
1466 ipp_pyrup( _src, _dst, _dsz, borderType));
1470 if( depth == CV_8U )
1471 func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
1472 else if( depth == CV_16S )
1473 func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
1474 else if( depth == CV_16U )
1475 func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
1476 else if( depth == CV_32F )
1477 func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
1478 else if( depth == CV_64F )
1479 func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
1481 CV_Error( CV_StsUnsupportedFormat, "" );
1483 func( src, dst, borderType );
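// cv::pyrUp(half, up) doubles each dimension by default; since the inserted
// rows and columns are interpolated with the same Gaussian weights, the result
// is a smoothed upsampling, not an exact inverse of pyrDown.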
1490 static bool ipp_buildpyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
1492 CV_INSTRUMENT_REGION_IPP()
1494 #if IPP_VERSION_X100 >= 810 && !IPP_DISABLE_PYRAMIDS_BUILD
1495 Mat src = _src.getMat();
1496 _dst.create( maxlevel + 1, 1, 0 );
1497 _dst.getMatRef(0) = src;
1502 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1503 int borderTypeNI = borderType & ~BORDER_ISOLATED;
1504 if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated))
1506 typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownInitAlloc)(void** ppState, IppiSize srcRoi, Ipp32f rate, void* pKernel, int kerSize, int mode);
1507 typedef IppStatus (CV_STDCALL * ippiPyramidLayerDown)(void* pSrc, int srcStep, IppiSize srcRoiSize, void* pDst, int dstStep, IppiSize dstRoiSize, void* pState);
1508 typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownFree)(void* pState);
1510 int type = src.type();
1511 int depth = src.depth();
1512 ippiPyramidLayerDownInitAlloc pyrInitAllocFunc = 0;
1513 ippiPyramidLayerDown pyrDownFunc = 0;
1514 ippiPyramidLayerDownFree pyrFreeFunc = 0;
1516 if (type == CV_8UC1)
1518 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C1R;
1519 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C1R;
1520 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C1R;
1522 else if (type == CV_8UC3)
1524 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C3R;
1525 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C3R;
1526 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C3R;
1528 else if (type == CV_32FC1)
1530 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C1R;
1531 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C1R;
1532 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C1R;
1534 else if (type == CV_32FC3)
1536 pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C3R;
1537 pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C3R;
1538 pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C3R;
1541 if (pyrInitAllocFunc && pyrDownFunc && pyrFreeFunc)
1544 IppiSize srcRoi = { src.cols, src.rows };
1546 IppStatus ok = ippiPyramidInitAlloc(&gPyr, maxlevel + 1, srcRoi, rate);
1548 Ipp16s iKernel[5] = { 1, 4, 6, 4, 1 };
1549 Ipp32f fKernel[5] = { 1.f, 4.f, 6.f, 4.f, 1.f };
1550 void* kernel = depth >= CV_32F ? (void*) fKernel : (void*) iKernel;
1552 if (ok >= 0) ok = pyrInitAllocFunc((void**) &(gPyr->pState), srcRoi, rate, kernel, 5, IPPI_INTER_LINEAR);
1555 gPyr->pImage[0] = src.data;
1556 gPyr->pStep[0] = (int) src.step;
1557 gPyr->pRoi[0] = srcRoi;
1558 for( ; i <= maxlevel; i++ )
1561 ok = ippiGetPyramidDownROI(gPyr->pRoi[i-1], &dstRoi, rate);
1562 Mat& dst = _dst.getMatRef(i);
1563 dst.create(Size(dstRoi.width, dstRoi.height), type);
1564 gPyr->pImage[i] = dst.data;
1565 gPyr->pStep[i] = (int) dst.step;
1566 gPyr->pRoi[i] = dstRoi;
1568 if (ok >= 0) ok = pyrDownFunc(gPyr->pImage[i-1], gPyr->pStep[i-1], gPyr->pRoi[i-1],
1569 gPyr->pImage[i], gPyr->pStep[i], gPyr->pRoi[i], gPyr->pState);
1573 pyrFreeFunc(gPyr->pState);
1578 CV_IMPL_ADD(CV_IMPL_IPP);
1581 pyrFreeFunc(gPyr->pState);
1585 ippiPyramidFree(gPyr);
1588 ippiPyramidFree(gPyr);
1595 CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(maxlevel); CV_UNUSED(borderType);
1602 void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
1604 CV_INSTRUMENT_REGION()
1606 CV_Assert(borderType != BORDER_CONSTANT);
1608 if (_src.dims() <= 2 && _dst.isUMatVector())
1610 UMat src = _src.getUMat();
1611 _dst.create( maxlevel + 1, 1, 0 );
1612 _dst.getUMatRef(0) = src;
1613 for( int i = 1; i <= maxlevel; i++ )
1614 pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
1618 Mat src = _src.getMat();
1619 _dst.create( maxlevel + 1, 1, 0 );
1620 _dst.getMatRef(0) = src;
1624 CV_IPP_RUN(((IPP_VERSION_X100 >= 810) && ((borderType & ~BORDER_ISOLATED) == BORDER_DEFAULT && (!_src.isSubmatrix() || ((borderType & BORDER_ISOLATED) != 0)))),
1625 ipp_buildpyramid( _src, _dst, maxlevel, borderType));
1627 for( ; i <= maxlevel; i++ )
1628 pyrDown( _dst.getMatRef(i-1), _dst.getMatRef(i), Size(), borderType );
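// A minimal usage sketch, given a cv::Mat img:
//
//     std::vector<cv::Mat> pyr;
//     cv::buildPyramid(img, pyr, 4);   // pyr[0] is the input image,
//                                      // pyr[1]..pyr[4] are successive pyrDown results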
1631 CV_IMPL void cvPyrDown( const void* srcarr, void* dstarr, int _filter )
1633 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
1635 CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
1636 cv::pyrDown( src, dst, dst.size() );
1639 CV_IMPL void cvPyrUp( const void* srcarr, void* dstarr, int _filter )
1641 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
1643 CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
1644 cv::pyrUp( src, dst, dst.size() );
1649 cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
1652 CV_Error( CV_StsNullPtr, "" );
1655 for( int i = 0; i <= extra_layers; i++ )
1656 cvReleaseMat( &(*_pyramid)[i] );
1663 cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
1664 const CvSize* layer_sizes, CvArr* bufarr,
1665 int calc, int filter )
1667 const float eps = 0.1f;
1670 CvMat stub, *src = cvGetMat( srcarr, &stub );
1672 if( extra_layers < 0 )
1673 CV_Error( CV_StsOutOfRange, "The number of extra layers must be non-negative" );
1675 int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
1676 CvSize layer_size, size = cvGetMatSize(src);
1683 buf = cvGetMat( bufarr, &bstub );
1684 bufsize = buf->rows*buf->cols*CV_ELEM_SIZE(buf->type);
1686 for( i = 1; i <= extra_layers; i++ )
1690 layer_size.width = cvRound(layer_size.width*rate+eps);
1691 layer_size.height = cvRound(layer_size.height*rate+eps);
1694 layer_size = layer_sizes[i-1];
1695 layer_step = layer_size.width*elem_size;
1696 bufsize -= layer_step*layer_size.height;
1700 CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
1701 ptr = buf->data.ptr;
1704 CvMat** pyramid = (CvMat**)cvAlloc( (extra_layers+1)*sizeof(pyramid[0]) );
1705 memset( pyramid, 0, (extra_layers+1)*sizeof(pyramid[0]) );
1707 pyramid[0] = cvCreateMatHeader( size.height, size.width, src->type );
1708 cvSetData( pyramid[0], src->data.ptr, src->step );
1711 for( i = 1; i <= extra_layers; i++ )
1715 layer_size.width = cvRound(layer_size.width*rate + eps);
1716 layer_size.height = cvRound(layer_size.height*rate + eps);
1719 layer_size = layer_sizes[i];
1723 pyramid[i] = cvCreateMatHeader( layer_size.height, layer_size.width, src->type );
1724 layer_step = layer_size.width*elem_size;
1725 cvSetData( pyramid[i], ptr, layer_step );
1726 ptr += layer_step*layer_size.height;
1729 pyramid[i] = cvCreateMat( layer_size.height, layer_size.width, src->type );
1732 cvPyrDown( pyramid[i-1], pyramid[i], filter );
1733 //cvResize( pyramid[i-1], pyramid[i], CV_INTER_LINEAR );