1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "precomp.hpp"
44 #include "opencl_kernels_imgproc.hpp"
47 * This file includes the code, contributed by Simon Perreault
48 * (the function icvMedianBlur_8u_O1)
50 * Constant-time median filtering -- http://nomis80.org/ctmf.html
51 * Copyright (C) 2006 Simon Perreault
54 * Laboratoire de vision et systemes numeriques
55 * Pavillon Adrien-Pouliot
57 * Sainte-Foy, Quebec, Canada
60 * perreaul@gel.ulaval.ca
66 /****************************************************************************************\
68 \****************************************************************************************/
// NOTE(review): this file appears to be an elided line dump of OpenCV's
// imgproc box-filter source (smooth.cpp); each surviving line still carries
// its original line number and many intermediate lines (braces, member
// declarations) are missing.  Comments document only what the visible
// fragments establish.

// Row-sum filter: for each channel, produces the running sum of `ksize`
// consecutive input samples.  T = source sample type, ST = accumulator type.
70 template<typename T, typename ST>
74     RowSum( int _ksize, int _anchor ) :
// Sums one image row; `src` holds T samples, `dst` receives ST sums.
// (Destination cast and `s` accumulator declarations elided in this dump.)
81     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
83         const T* S = (const T*)src;
85         int i = 0, k, ksz_cn = ksize*cn;
// Index of the last window start, in interleaved-channel units.
87         width = (width - 1)*cn;
88         for( k = 0; k < cn; k++, S++, D++ )
// Seed the first window with the initial `ksize` samples of channel k.
91             for( i = 0; i < ksz_cn; i += cn )
// Slide the window right: add the entering sample, drop the leaving one.
94             for( i = 0; i < width; i += cn )
96                 s += S[i + ksz_cn] - S[i];
// Generic column (vertical) running-sum filter: keeps one accumulator per
// column, adds the incoming row, emits the (optionally scaled) sum through
// saturate_cast, then subtracts the row leaving the window.
// ST = row-sum element type, T = destination sample type.
104 template<typename ST, typename T>
106     public BaseColumnFilter
108     ColumnSum( int _ksize, int _anchor, double _scale ) :
// Forget accumulated rows so the filter can be reused on a new image.
117     virtual void reset() { sumCount = 0; }
119     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
123         bool haveScale = scale != 1;
124         double _scale = scale;
// (Re)allocate the per-column accumulator when the row width changes.
126         if( width != (int)sum.size() )
135             memset((void*)SUM, 0, width*sizeof(ST));
// Prime the accumulator with the first ksize-1 rows (2x unrolled + tail).
137         for( ; sumCount < ksize - 1; sumCount++, src++ )
139             const ST* Sp = (const ST*)src[0];
140             for( i = 0; i <= width - 2; i += 2 )
142                 ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
143                 SUM[i] = s0; SUM[i+1] = s1;
146             for( ; i < width; i++ )
152             CV_Assert( sumCount == ksize-1 );
// Steady state: per output row, add incoming row Sp, store the result,
// subtract outgoing row Sm located ksize-1 rows back in the ring.
156         for( ; count--; src++ )
158             const ST* Sp = (const ST*)src[0];
159             const ST* Sm = (const ST*)src[1-ksize];
// Scaled path (normalized box filter), 2x unrolled.
163                 for( i = 0; i <= width - 2; i += 2 )
165                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
166                     D[i] = saturate_cast<T>(s0*_scale);
167                     D[i+1] = saturate_cast<T>(s1*_scale);
168                     s0 -= Sm[i]; s1 -= Sm[i+1];
169                     SUM[i] = s0; SUM[i+1] = s1;
172                 for( ; i < width; i++ )
174                     ST s0 = SUM[i] + Sp[i];
175                     D[i] = saturate_cast<T>(s0*_scale);
// Unscaled path (scale == 1): same sliding update without the multiply.
181                 for( i = 0; i <= width - 2; i += 2 )
183                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
184                     D[i] = saturate_cast<T>(s0);
185                     D[i+1] = saturate_cast<T>(s1);
186                     s0 -= Sm[i]; s1 -= Sm[i+1];
187                     SUM[i] = s0; SUM[i+1] = s1;
190                 for( ; i < width; i++ )
192                     ST s0 = SUM[i] + Sp[i];
193                     D[i] = saturate_cast<T>(s0);
// Specialization of the column running-sum for int accumulators and uchar
// output, with SSE2 and NEON fast paths (the surrounding #if/#elif guards
// are elided in this dump).  Scalar tail loops cover the leftover columns.
208 struct ColumnSum<int, uchar> :
209     public BaseColumnFilter
211     ColumnSum( int _ksize, int _anchor, double _scale ) :
220     virtual void reset() { sumCount = 0; }
222     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
226         bool haveScale = scale != 1;
227         double _scale = scale;
// Runtime SSE2 dispatch; NEON builds presumably select their path at
// compile time via the elided preprocessor guards.
230         bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
233         if( width != (int)sum.size() )
242         memset((void*)SUM, 0, width*sizeof(int));
// Prime the accumulator with the first ksize-1 rows, 4 ints at a time.
243         for( ; sumCount < ksize - 1; sumCount++, src++ )
245             const int* Sp = (const int*)src[0];
250             for( ; i <= width-4; i+=4 )
252                 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
253                 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
254                 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
258             for( ; i <= width - 4; i+=4 )
259                 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
261             for( ; i < width; i++ )
267             CV_Assert( sumCount == ksize-1 );
271         for( ; count--; src++ )
273             const int* Sp = (const int*)src[0];
274             const int* Sm = (const int*)src[1-ksize];
275             uchar* D = (uchar*)dst;
// Scaled SSE2 path: scale in float, round-convert back, then pack
// 8 ints -> 8 saturated uchars per iteration.
282                 const __m128 scale4 = _mm_set1_ps((float)_scale);
283                 for( ; i <= width-8; i+=8 )
285                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
286                     __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
288                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
289                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
290                     __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
291                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
293                     __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
294                     __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
296                     _s0T = _mm_packs_epi32(_s0T, _s0T1);
298                     _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
// Slide the window: subtract the outgoing row from the accumulator.
300                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
301                     _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
// Scaled NEON path: same scheme with round-to-nearest via cv_vrndq_u32_f32.
305                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
306                 for( ; i <= width-8; i+=8 )
308                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
309                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
311                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
312                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
314                     uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
315                     vst1_u8(D + i, vqmovn_u16(v_dst));
317                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
318                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
321                 for( ; i < width; i++ )
323                     int s0 = SUM[i] + Sp[i];
324                     D[i] = saturate_cast<uchar>(s0*_scale);
// Unscaled SSE2 path: pack the raw int sums straight to saturated uchars.
334                 for( ; i <= width-8; i+=8 )
336                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
337                     __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
339                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
340                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
341                     __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
342                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
344                     __m128i _s0T = _mm_packs_epi32(_s0, _s01);
346                     _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
348                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
349                     _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
// Unscaled NEON path: unsigned-saturating narrow (vqmovun/vqmovn).
353                 for( ; i <= width-8; i+=8 )
355                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
356                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
358                     uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
359                     vst1_u8(D + i, vqmovn_u16(v_dst));
361                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
362                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
366                 for( ; i < width; i++ )
368                     int s0 = SUM[i] + Sp[i];
369                     D[i] = saturate_cast<uchar>(s0);
// Per-column accumulator, one int per destination column.
379     std::vector<int> sum;
// Specialization of the column running-sum for int accumulators and short
// output, with SSE2/NEON fast paths (preprocessor guards elided in this
// dump).  Stores 8 shorts per SIMD iteration via signed-saturating pack.
383 struct ColumnSum<int, short> :
384     public BaseColumnFilter
386     ColumnSum( int _ksize, int _anchor, double _scale ) :
395     virtual void reset() { sumCount = 0; }
397     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
401         bool haveScale = scale != 1;
402         double _scale = scale;
405         bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
408         if( width != (int)sum.size() )
416         memset((void*)SUM, 0, width*sizeof(int));
// Prime the accumulator with the first ksize-1 rows, 4 ints at a time.
417         for( ; sumCount < ksize - 1; sumCount++, src++ )
419             const int* Sp = (const int*)src[0];
424             for( ; i <= width-4; i+=4 )
426                 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
427                 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
428                 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
432             for( ; i <= width - 4; i+=4 )
433                 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
435             for( ; i < width; i++ )
441             CV_Assert( sumCount == ksize-1 );
// Steady state: add incoming row, emit, subtract outgoing row.
445         for( ; count--; src++ )
447             const int* Sp = (const int*)src[0];
448             const int* Sm = (const int*)src[1-ksize];
449             short* D = (short*)dst;
// Scaled SSE2 path: scale in float, round-convert, pack to shorts.
456                 const __m128 scale4 = _mm_set1_ps((float)_scale);
457                 for( ; i <= width-8; i+=8 )
459                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
460                     __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
462                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
463                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
464                     __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
465                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
467                     __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
468                     __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
470                     _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
472                     _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
473                     _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
// Scaled NEON path with round-to-nearest (cv_vrndq_s32_f32).
477                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
478                 for( ; i <= width-8; i+=8 )
480                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
481                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
483                     int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
484                     int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
485                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
487                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
488                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
491                 for( ; i < width; i++ )
493                     int s0 = SUM[i] + Sp[i];
494                     D[i] = saturate_cast<short>(s0*_scale);
// Unscaled SSE2 path: pack raw int sums straight to saturated shorts.
504                 for( ; i <= width-8; i+=8 )
507                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
508                     __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
510                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
511                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
512                     __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
513                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
515                     _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
517                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
518                     _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
// Unscaled NEON path.
522                 for( ; i <= width-8; i+=8 )
524                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
525                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
527                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
529                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
530                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
534                 for( ; i < width; i++ )
536                     int s0 = SUM[i] + Sp[i];
537                     D[i] = saturate_cast<short>(s0);
// Per-column accumulator, one int per destination column.
547     std::vector<int> sum;
// Specialization of the column running-sum for int accumulators and ushort
// output (SSE2/NEON fast paths; preprocessor guards elided in this dump).
// SSE2 lacks an unsigned 32->16 saturating pack, so the code biases the
// values by 0x8000 before the signed pack and adds the bias back afterwards
// (delta0/delta1 trick).
552 struct ColumnSum<int, ushort> :
553     public BaseColumnFilter
555     ColumnSum( int _ksize, int _anchor, double _scale ) :
564     virtual void reset() { sumCount = 0; }
566     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
570         bool haveScale = scale != 1;
571         double _scale = scale;
573         bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
576         if( width != (int)sum.size() )
584         memset((void*)SUM, 0, width*sizeof(int));
// Prime the accumulator with the first ksize-1 rows.
// NOTE(review): this SSE loop uses `i < width-4` where the sibling
// specializations use `<=`; harmless (the scalar tail covers the rest)
// but worth confirming against upstream.
585         for( ; sumCount < ksize - 1; sumCount++, src++ )
587             const int* Sp = (const int*)src[0];
592             for( ; i < width-4; i+=4 )
594                 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
595                 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
596                 _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
600             for( ; i <= width - 4; i+=4 )
601                 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
603             for( ; i < width; i++ )
609             CV_Assert( sumCount == ksize-1 );
// Steady state: add incoming row, emit, subtract outgoing row.
613         for( ; count--; src++ )
615             const int* Sp = (const int*)src[0];
616             const int* Sm = (const int*)src[1-ksize];
617             ushort* D = (ushort*)dst;
// Scaled SSE2 path: scale in float, round-convert, biased pack to ushort.
624                 const __m128 scale4 = _mm_set1_ps((float)_scale);
625                 const __m128i delta0 = _mm_set1_epi32(0x8000);
626                 const __m128i delta1 = _mm_set1_epi32(0x80008000);
628                 for( ; i < width-4; i+=4)
630                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
631                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
632                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
634                     __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
// Bias -> signed pack -> un-bias: emulates unsigned saturating pack.
636                     _res = _mm_sub_epi32(_res, delta0);
637                     _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
639                     _mm_storel_epi64((__m128i*)(D+i), _res);
640                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Scaled NEON path: native unsigned narrowing, 8 columns per iteration.
644                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
645                 for( ; i <= width-8; i+=8 )
647                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
648                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
650                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
651                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
652                     vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
654                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
655                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
658                 for( ; i < width; i++ )
660                     int s0 = SUM[i] + Sp[i];
661                     D[i] = saturate_cast<ushort>(s0*_scale);
// Unscaled SSE2 path: biased pack of the raw int sums.
671                 const __m128i delta0 = _mm_set1_epi32(0x8000);
672                 const __m128i delta1 = _mm_set1_epi32(0x80008000);
674                 for( ; i < width-4; i+=4 )
676                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
677                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
678                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
680                     __m128i _res = _mm_sub_epi32(_s0, delta0);
681                     _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
683                     _mm_storel_epi64((__m128i*)(D+i), _res);
684                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Unscaled NEON path: unsigned-saturating narrow.
688                 for( ; i <= width-8; i+=8 )
690                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
691                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
693                     vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
695                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
696                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
700                 for( ; i < width; i++ )
702                     int s0 = SUM[i] + Sp[i];
703                     D[i] = saturate_cast<ushort>(s0);
// Per-column accumulator, one int per destination column.
713     std::vector<int> sum;
// Specialization of the column running-sum for int accumulators and float
// output (SSE2/NEON fast paths; preprocessor guards elided in this dump).
// No saturation needed: sums are just converted to float (and scaled).
717 struct ColumnSum<int, float> :
718     public BaseColumnFilter
720     ColumnSum( int _ksize, int _anchor, double _scale ) :
729     virtual void reset() { sumCount = 0; }
731     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
735         bool haveScale = scale != 1;
736         double _scale = scale;
739         bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
742         if( width != (int)sum.size() )
751         memset((void *)SUM, 0, sizeof(int) * width);
// Prime the accumulator with the first ksize-1 rows.
753         for( ; sumCount < ksize - 1; sumCount++, src++ )
755             const int* Sp = (const int*)src[0];
761             for( ; i < width-4; i+=4 )
763                 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
764                 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
765                 _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
769             for( ; i <= width - 4; i+=4 )
770                 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
773             for( ; i < width; i++ )
779             CV_Assert( sumCount == ksize-1 );
// Steady state: add incoming row, emit as float, subtract outgoing row.
783         for( ; count--; src++ )
785             const int * Sp = (const int*)src[0];
786             const int * Sm = (const int*)src[1-ksize];
787             float* D = (float*)dst;
// Scaled SSE2 path: int sums -> float, multiply by scale, store.
795                 const __m128 scale4 = _mm_set1_ps((float)_scale);
797                 for( ; i < width-4; i+=4)
799                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
800                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
801                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
803                     _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
804                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Scaled NEON path, 8 columns per iteration.
808                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
809                 for( ; i <= width-8; i+=8 )
811                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
812                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
814                     vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
815                     vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
817                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
818                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
822                 for( ; i < width; i++ )
824                     int s0 = SUM[i] + Sp[i];
825                     D[i] = (float)(s0*_scale);
// Unscaled SSE2 path: plain int -> float conversion.
836                 for( ; i < width-4; i+=4)
838                     __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
839                     __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
840                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
842                     _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
843                     _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Unscaled NEON path.
847                 for( ; i <= width-8; i+=8 )
849                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
850                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
852                     vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
853                     vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
855                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
856                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
860                 for( ; i < width; i++ )
862                     int s0 = SUM[i] + Sp[i];
// Per-column accumulator, one int per destination column.
873     std::vector<int> sum;
// DIVUP(total, grain): integer ceiling division — the smallest k such that
// k*grain >= total (grain must be > 0).
// ROUNDUP(sz, n): round sz up to the next multiple of n (sz itself when it
// is already a multiple).
// Both macros now fully parenthesize their arguments so that expression
// arguments (e.g. `w << 1`, which binds looser than `+`) expand correctly;
// the previous DIVUP left `total`/`grain` bare inside the numerator.
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
// OpenCL implementation of boxFilter/sqrBoxFilter.  Returns false when the
// device/format combination is unsupported so the caller can fall back to
// the CPU path.  Two kernel strategies are visible below: a "filterSmall"
// kernel tuned for small kernels on Intel GPUs, and a general "boxFilter"
// kernel with tuned block sizes.  (Many lines are elided in this dump.)
881 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
882                            Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
884     const ocl::Device & dev = ocl::Device::getDefault();
885     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
886     bool doubleSupport = dev.doubleFPConfig() > 0;
// Bail out for >4 channels, doubles without device FP64, or misaligned ROI.
891     if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
892         _src.offset() % esz != 0 || _src.step() % esz != 0)
// Default (negative) anchor means kernel center.
896         anchor.x = ksize.width / 2;
898         anchor.y = ksize.height / 2;
900     int computeUnits = ocl::Device::getDefault().maxComputeUnits();
// Normalization factor for the mean; passed to the kernel as `alpha`.
901     float alpha = 1.0f / (ksize.height * ksize.width);
902     Size size = _src.size(), wholeSize;
903     bool isolated = (borderType & BORDER_ISOLATED) != 0;
904     borderType &= ~BORDER_ISOLATED;
// Work type: at least float so the accumulation does not truncate.
905     int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
906             wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
908     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
909     size_t globalsize[2] = { size.width, size.height };
910     size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
912     UMat src = _src.getUMat();
916         src.locateROI(wholeSize, ofs);
// Effective image extent: the ROI alone when BORDER_ISOLATED is set.
919     int h = isolated ? size.height : wholeSize.height;
920     int w = isolated ? size.width : wholeSize.width;
922     size_t maxWorkItemSizes[32];
923     ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
924     int tryWorkItems = (int)maxWorkItemSizes[0];
// Fast path: small kernels on Intel GPUs use the specialized
// "filterSmall" kernel with multiple pixels per work item.
928     if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
929         ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
930          (ksize.width == 5 && ksize.height == 5 && cn == 1)))
932         if (w < ksize.width || h < ksize.height)
935         // Figure out what vector size to use for loading the pixels.
936         int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
937         int pxLoadVecSize = cn * pxLoadNumPixels;
939         // Figure out how many pixels per work item to compute in X and Y
940         // directions.  Too many and we run out of registers.
941         int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
942         if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
// Pick the largest power-of-two tile that divides the image width.
944             pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
945             pxPerWorkItemY = size.height % 2 ? 1 : 2;
947         else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
949             pxPerWorkItemX = size.width % 2 ? 1 : 2;
950             pxPerWorkItemY = size.height % 2 ? 1 : 2;
952         globalsize[0] = size.width / pxPerWorkItemX;
953         globalsize[1] = size.height / pxPerWorkItemY;
955         // Need some padding in the private array for pixels
956         int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
958         // Make the global size a nice round number so the runtime can pick
959         // from reasonable choices for the workgroup size
960         const int wgRound = 256;
961         globalsize[0] = ROUNDUP(globalsize[0], wgRound);
// Build the kernel-specialization defines (types, tile sizes, border mode).
963         char build_options[1024], cvt[2][40];
964         sprintf(build_options, "-D cn=%d "
965                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
966                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
967                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
968                 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
969                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
970                 "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
971                 cn, anchor.x, anchor.y, ksize.width, ksize.height,
972                 pxLoadVecSize, pxLoadNumPixels,
973                 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
974                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
975                 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
976                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
977                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
978                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
979                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
980                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
981                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
985         if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
// General path: tuned block sizes; loop (elided) retries with smaller
// blocks until the kernel's work-group size limit is satisfied.
990         localsize = localsize_general;
993             int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
995             while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
997             while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
1000            if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
1004            String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
1005                                 " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
1006                                 " -D ST1=%s -D DT1=%s -D cn=%d",
1007                                 BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
1008                                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
1009                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
1010                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
1011                                 anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
1012                                 isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1013                                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
1014                                 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
// Each work group covers BLOCK_SIZE_X columns minus the kernel overlap.
1016            localsize[0] = BLOCK_SIZE_X;
1017            globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
1018            globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);
1020            kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
// Shrink the block if the compiled kernel cannot run at this local size.
1024            size_t kernelWorkGroupSize = kernel.workGroupSize();
1025            if (localsize[0] <= kernelWorkGroupSize)
1027            if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
1030            tryWorkItems = (int)kernelWorkGroupSize;
1034    _dst.create(size, CV_MAKETYPE(ddepth, cn));
1035    UMat dst = _dst.getUMat();
// Bind kernel arguments: source buffer + geometry, destination, and
// (for the normalized case, guard elided) the scale factor alpha.
1037    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
1038    idxArg = kernel.set(idxArg, (int)src.step);
1039    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
1040    int srcOffsetY = (int)(src.offset / src.step);
1041    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
1042    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
1043    idxArg = kernel.set(idxArg, srcOffsetX);
1044    idxArg = kernel.set(idxArg, srcOffsetY);
1045    idxArg = kernel.set(idxArg, srcEndX);
1046    idxArg = kernel.set(idxArg, srcEndY);
1047    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
1049        idxArg = kernel.set(idxArg, (float)alpha);
1051    return kernel.run(2, globalsize, localsize, false);
// Factory for the horizontal running-sum filter: dispatches on the
// (source depth, sum depth) pair to the matching RowSum<T, ST>
// instantiation.  Channel counts of source and sum types must agree.
// (Anchor-normalization lines are elided in this dump.)
1061 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1063     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1064     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
1069     if( sdepth == CV_8U && ddepth == CV_32S )
1070         return makePtr<RowSum<uchar, int> >(ksize, anchor);
1071     if( sdepth == CV_8U && ddepth == CV_64F )
1072         return makePtr<RowSum<uchar, double> >(ksize, anchor);
1073     if( sdepth == CV_16U && ddepth == CV_32S )
1074         return makePtr<RowSum<ushort, int> >(ksize, anchor);
1075     if( sdepth == CV_16U && ddepth == CV_64F )
1076         return makePtr<RowSum<ushort, double> >(ksize, anchor);
1077     if( sdepth == CV_16S && ddepth == CV_32S )
1078         return makePtr<RowSum<short, int> >(ksize, anchor);
1079     if( sdepth == CV_32S && ddepth == CV_32S )
1080         return makePtr<RowSum<int, int> >(ksize, anchor);
1081     if( sdepth == CV_16S && ddepth == CV_64F )
1082         return makePtr<RowSum<short, double> >(ksize, anchor);
1083     if( sdepth == CV_32F && ddepth == CV_64F )
1084         return makePtr<RowSum<float, double> >(ksize, anchor);
1085     if( sdepth == CV_64F && ddepth == CV_64F )
1086         return makePtr<RowSum<double, double> >(ksize, anchor);
// No matching instantiation: report the unsupported combination.
1088     CV_Error_( CV_StsNotImplemented,
1089         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
// Unreached after CV_Error_; keeps the compiler's return-path check happy.
1092     return Ptr<BaseRowFilter>();
// Factory for the vertical running-sum filter: dispatches on the
// (sum depth, destination depth) pair to the matching ColumnSum<ST, T>
// instantiation (several pairs hit the SIMD-specialized int variants).
// Channel counts of sum and destination types must agree.
1096 cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
1097                                                      int anchor, double scale)
1099     int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
1100     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
1105     if( ddepth == CV_8U && sdepth == CV_32S )
1106         return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
1107     if( ddepth == CV_8U && sdepth == CV_64F )
1108         return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
1109     if( ddepth == CV_16U && sdepth == CV_32S )
1110         return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
1111     if( ddepth == CV_16U && sdepth == CV_64F )
1112         return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
1113     if( ddepth == CV_16S && sdepth == CV_32S )
1114         return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
1115     if( ddepth == CV_16S && sdepth == CV_64F )
1116         return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
1117     if( ddepth == CV_32S && sdepth == CV_32S )
1118         return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
1119     if( ddepth == CV_32F && sdepth == CV_32S )
1120         return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
1121     if( ddepth == CV_32F && sdepth == CV_64F )
1122         return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
1123     if( ddepth == CV_64F && sdepth == CV_32S )
1124         return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
1125     if( ddepth == CV_64F && sdepth == CV_64F )
1126         return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
// No matching instantiation: report the unsupported combination.
1128     CV_Error_( CV_StsNotImplemented,
1129         ("Unsupported combination of sum format (=%d), and destination format (=%d)",
// Unreached after CV_Error_; keeps the compiler's return-path check happy.
1132     return Ptr<BaseColumnFilter>();
// Builds a separable box-filter engine: a row running-sum followed by a
// column running-sum that applies the 1/(w*h) normalization (when
// requested) while writing the destination.
1136 cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
1137                                                Point anchor, bool normalize, int borderType )
1139     int sdepth = CV_MAT_DEPTH(srcType);
1140     int cn = CV_MAT_CN(srcType), sumType = CV_64F;
// Use a narrower sum type when the kernel area cannot overflow it; the
// per-depth thresholds bound ksize.area() * max_sample_value within the
// accumulator range.  (The branch body assigning the narrower sumType is
// elided in this dump — presumably sumType = CV_32S; confirm upstream.)
1141     if( sdepth <= CV_32S && (!normalize ||
1142         ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
1143             sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
1145     sumType = CV_MAKETYPE( sumType, cn );
1147     Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
// The column filter performs the normalization (scale = 1/area) so the
// row pass can stay in integer arithmetic.
1148     Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
1149         dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
1151     return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
1152            srcType, dstType, sumType, borderType );
// Public box-filter entry point.  Tries accelerated backends in order —
// OpenCL (UMat destinations), Tegra, IPP — and falls back to the generic
// separable FilterEngine.  (Guard lines and early returns are elided in
// this dump.)
1156 void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
1157                 Size ksize, Point anchor,
1158                 bool normalize, int borderType )
// OpenCL fast path: only attempted for UMat destinations.
1160     CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
1162     Mat src = _src.getMat();
1163     int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
1166     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
1167     Mat dst = _dst.getMat();
// BORDER_ISOLATED with a normalized non-constant border needs special
// handling (the consequent is elided in this dump).
1168     if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
1175 #ifdef HAVE_TEGRA_OPTIMIZATION
1176     if ( tegra::box(src, dst, ksize, anchor, normalize, borderType) )
1180 #if defined(HAVE_IPP)
1183         int ippBorderType = borderType & ~BORDER_ISOLATED;
1184         Point ocvAnchor, ippAnchor;
// IPP fixes the anchor at (kernel/2 rounded down for even sizes); only
// take the IPP path when that matches the caller's effective anchor.
1185         ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
1186         ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
1187         ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
1188         ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
1190         if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
1191             (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
1192              ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
1193              dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
1196             IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };
// Per-type IPP invocation: allocate the work buffer, run the bordered
// box filter, record success or reset the IPP error status on failure.
1198 #define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
1201                 if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
1203                     Ipp8u * buffer = ippsMalloc_8u(bufSize); \
1204                     ippType borderValue[4] = { 0, 0, 0, 0 }; \
1205                     ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
1206                     IppStatus status = ippiFilterBoxBorder_##flavor(src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
1207                                                                     (int)dst.step, roiSize, maskSize, \
1208                                                                     (IppiBorderType)ippBorderType, borderValue, buffer); \
1212                         CV_IMPL_ADD(CV_IMPL_IPP); \
1216                 setIppErrorStatus(); \
1217             } while ((void)0, 0)
1219             if (stype == CV_8UC1)
1220                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
1221             else if (stype == CV_8UC3)
1222                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
1223             else if (stype == CV_8UC4)
1224                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);
// Single-channel 16u/16s variants intentionally disabled (see notes):
// they were slower with BORDER_CONSTANT as of Oct 2014.
1226             // Oct 2014: performance with BORDER_CONSTANT
1227             //else if (stype == CV_16UC1)
1228             //    IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
1229             else if (stype == CV_16UC3)
1230                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
1231             else if (stype == CV_16UC4)
1232                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);
1234             // Oct 2014: performance with BORDER_CONSTANT
1235             //else if (stype == CV_16SC1)
1236             //    IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
1237             else if (stype == CV_16SC3)
1238                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
1239             else if (stype == CV_16SC4)
1240                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);
1242             else if (stype == CV_32FC1)
1243                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
1244             else if (stype == CV_32FC3)
1245                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
1246             else if (stype == CV_32FC4)
1247                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
1249 #undef IPP_FILTER_BOX_BORDER
// Generic fallback: separable row/column running-sum engine.
1253     Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
1254                         ksize, anchor, normalize, borderType );
1255     f->apply( src, dst );
// cv::blur is a thin convenience wrapper: a *normalized* box filter whose
// destination depth is taken from the source (ddepth = -1).
1258 void cv::blur( InputArray src, OutputArray dst,
1259 Size ksize, Point anchor, int borderType )
1261 boxFilter( src, dst, -1, ksize, anchor, true, borderType );
1265 /****************************************************************************************\
1267 \****************************************************************************************/
// Row filter for the squared box sum (fragment: class header/fields elided).
// For each output position it maintains a running sum of *squared* source
// values over a window of `ksize` samples, per channel.
// T  - source sample type; ST - accumulator/sum type (wider to avoid overflow).
1272 template<typename T, typename ST>
1274 public BaseRowFilter
1276 SqrRowSum( int _ksize, int _anchor ) :
1283 virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
1285 const T* S = (const T*)src;
1287 int i = 0, k, ksz_cn = ksize*cn;
// width is converted to an element count relative to the sliding window.
1289 width = (width - 1)*cn;
// One pass per channel; S/D advance by one so channel k starts at offset k.
1290 for( k = 0; k < cn; k++, S++, D++ )
// Seed the window: sum of squares of the first `ksize` samples.
1293 for( i = 0; i < ksz_cn; i += cn )
// Slide the window: add the incoming square, drop the outgoing one.
1299 for( i = 0; i < width; i += cn )
1301 ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
1302 s += val1*val1 - val0*val0;
// Factory for the squared-row-sum filter used by sqrBoxFilter.
// Picks a SqrRowSum<T, ST> instantiation from the (source depth, sum depth)
// pair; 8U inputs may accumulate in 32S, everything else accumulates in 64F.
// Errors out on unsupported combinations.
1309 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1311 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1312 CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
1317 if( sdepth == CV_8U && ddepth == CV_32S )
1318 return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
1319 if( sdepth == CV_8U && ddepth == CV_64F )
1320 return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
1321 if( sdepth == CV_16U && ddepth == CV_64F )
1322 return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
1323 if( sdepth == CV_16S && ddepth == CV_64F )
1324 return makePtr<SqrRowSum<short, double> >(ksize, anchor);
1325 if( sdepth == CV_32F && ddepth == CV_64F )
1326 return makePtr<SqrRowSum<float, double> >(ksize, anchor);
1327 if( sdepth == CV_64F && ddepth == CV_64F )
1328 return makePtr<SqrRowSum<double, double> >(ksize, anchor);
1330 CV_Error_( CV_StsNotImplemented,
1331 ("Unsupported combination of source format (=%d), and buffer format (=%d)",
// Unreachable after CV_Error_; keeps compilers happy about the return path.
1334 return Ptr<BaseRowFilter>();
// cv::sqrBoxFilter: box filter over squared pixel values (used e.g. for local
// variance).  Fragment — some condition bodies and early returns are elided.
// ddepth < 0 selects an automatic float depth; the filter is built from a
// squared row sum plus the ordinary column sum.
1339 void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
1340 Size ksize, Point anchor,
1341 bool normalize, int borderType )
1343 int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
1344 Size size = _src.size();
// Auto depth: 32F is enough below CV_32F sources, otherwise 64F.
1347 ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
// Degenerate-size handling (bodies elided in this excerpt).
1349 if( borderType != BORDER_CONSTANT && normalize )
1351 if( size.height == 1 )
1353 if( size.width == 1 )
// OpenCL fast path for UMat destinations.
1357 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
1358 ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
// Squares of 8U values fit an int accumulator; everything else needs 64F.
1360 int sumDepth = CV_64F;
1361 if( sdepth == CV_8U )
1363 int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
1365 Mat src = _src.getMat();
1366 _dst.create( size, dstType );
1367 Mat dst = _dst.getMat();
1369 Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
// Column filter also applies the 1/(w*h) normalization when requested.
1370 Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
1371 dstType, ksize.height, anchor.y,
1372 normalize ? 1./(ksize.width*ksize.height) : 1);
1374 Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
1375 srcType, dstType, sumType, borderType );
1376 f->apply( src, dst );
1380 /****************************************************************************************\
1382 \****************************************************************************************/
// cv::getGaussianKernel: returns an n x 1 normalized Gaussian kernel of the
// requested float depth.  Fragment — the normalization tail is partly elided.
// For small odd n with non-positive sigma a precomputed fixed kernel is used.
1384 cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
1386 const int SMALL_GAUSSIAN_SIZE = 7;
// Hard-coded binomial-like kernels for n = 3, 5, 7 (index n>>1).
1387 static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
1390 {0.25f, 0.5f, 0.25f},
1391 {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
1392 {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
1395 const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
1396 small_gaussian_tab[n>>1] : 0;
1398 CV_Assert( ktype == CV_32F || ktype == CV_64F );
1399 Mat kernel(n, 1, ktype);
// Both pointers are fetched; only the one matching ktype is dereferenced.
1400 float* cf = kernel.ptr<float>();
1401 double* cd = kernel.ptr<double>();
// Standard OpenCV sigma-from-kernel-size formula when sigma is non-positive.
1403 double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
1404 double scale2X = -0.5/(sigmaX*sigmaX);
// First pass: unnormalized coefficients (or the fixed table values).
1408 for( i = 0; i < n; i++ )
1410 double x = i - (n-1)*0.5;
1411 double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
1412 if( ktype == CV_32F )
// Second pass: divide by the accumulated sum so coefficients sum to 1.
1425 for( i = 0; i < n; i++ )
1427 if( ktype == CV_32F )
1428 cf[i] = (float)(cf[i]*sum);
// Builds the separable (kx, ky) Gaussian kernels for a given image type.
// Derives kernel sizes from sigma when not specified, clamps sigmas to be
// non-negative, and reuses kx as ky when the two dimensions are identical.
1438 static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize,
1439 double sigma1, double sigma2 )
1441 int depth = CV_MAT_DEPTH(type);
1445 // automatic detection of kernel size from sigma
// 8U images tolerate a smaller truncation radius (3*sigma) than float (4*sigma);
// the |1 forces an odd size.
1446 if( ksize.width <= 0 && sigma1 > 0 )
1447 ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
1448 if( ksize.height <= 0 && sigma2 > 0 )
1449 ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
1451 CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
1452 ksize.height > 0 && ksize.height % 2 == 1 );
1454 sigma1 = std::max( sigma1, 0. );
1455 sigma2 = std::max( sigma2, 0. );
// Kernels are computed at least in 32F regardless of image depth.
1457 kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
// If ky would be identical to kx, share it (assignment elided in excerpt).
1458 if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
1461 ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
// Public factory: wraps the separable Gaussian kernels into a FilterEngine.
// Source and destination type are identical; anchor is the kernel center.
1466 cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
1467 double sigma1, double sigma2,
1471 createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
1473 return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType );
// cv::GaussianBlur: dispatches to (in order) Tegra, IPP (currently disabled by
// the `&& 0` below — measured slower in IPP 8.1), and finally the generic
// separable sepFilter2D path.  Fragment: several condition bodies are elided.
1477 void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
1478 double sigma1, double sigma2,
1481 int type = _src.type();
1482 Size size = _src.size();
1483 _dst.create( size, type );
// Degenerate-size / trivial-kernel early outs (bodies elided in excerpt).
1485 if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
1487 if( size.height == 1 )
1489 if( size.width == 1 )
1493 if( ksize.width == 1 && ksize.height == 1 )
1499 #ifdef HAVE_TEGRA_OPTIMIZATION
1500 if(sigma1 == 0 && sigma2 == 0 && tegra::gaussian(_src.getMat(), _dst.getMat(), ksize, borderType))
// NOTE(review): the `&& 0` intentionally disables this whole IPP branch.
1504 #if IPP_VERSION_X100 >= 801 && 0 // these functions are slower in IPP 8.1
1507 int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// IPP path requires a square, isotropic Gaussian on 1- or 3-channel images.
1509 if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
1510 sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
1512 IppiBorderType ippBorder = ippiGetBorderType(borderType);
1513 if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
1515 Mat src = _src.getMat(), dst = _dst.getMat();
1516 IppiSize roiSize = { src.cols, src.rows };
1517 IppDataType dataType = ippiGetDataType(depth);
1518 Ipp32s specSize = 0, bufferSize = 0;
1520 if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
1522 IppFilterGaussianSpec * pSpec = (IppFilterGaussianSpec *)ippMalloc(specSize);
1523 Ipp8u * pBuffer = (Ipp8u*)ippMalloc(bufferSize);
1525 if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, 1, pSpec, pBuffer) >= 0)
1527 #define IPP_FILTER_GAUSS(ippfavor, ippcn) \
1530 typedef Ipp##ippfavor ippType; \
1531 ippType borderValues[] = { 0, 0, 0 }; \
1532 IppStatus status = ippcn == 1 ? \
1533 ippiFilterGaussianBorder_##ippfavor##_C1R(src.ptr<ippType>(), (int)src.step, \
1534 dst.ptr<ippType>(), (int)dst.step, roiSize, borderValues[0], pSpec, pBuffer) : \
1535 ippiFilterGaussianBorder_##ippfavor##_C3R(src.ptr<ippType>(), (int)src.step, \
1536 dst.ptr<ippType>(), (int)dst.step, roiSize, borderValues, pSpec, pBuffer); \
1541 CV_IMPL_ADD(CV_IMPL_IPP); \
1544 } while ((void)0, 0)
// Dispatch on exact (depth, channels) pair.
1546 if (type == CV_8UC1)
1547 IPP_FILTER_GAUSS(8u, 1);
1548 else if (type == CV_8UC3)
1549 IPP_FILTER_GAUSS(8u, 3);
1550 else if (type == CV_16UC1)
1551 IPP_FILTER_GAUSS(16u, 1);
1552 else if (type == CV_16UC3)
1553 IPP_FILTER_GAUSS(16u, 3);
1554 else if (type == CV_16SC1)
1555 IPP_FILTER_GAUSS(16s, 1);
1556 else if (type == CV_16SC3)
1557 IPP_FILTER_GAUSS(16s, 3);
1558 else if (type == CV_32FC1)
1559 IPP_FILTER_GAUSS(32f, 1);
1560 else if (type == CV_32FC3)
1561 IPP_FILTER_GAUSS(32f, 3);
1562 #undef IPP_FILTER_GAUSS
1565 setIppErrorStatus();
// Generic fallback: build the kernels and run a separable 2D filter.
1572 createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
1573 sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
1576 /****************************************************************************************\
1578 \****************************************************************************************/
1585 * This structure represents a two-tier histogram. The first tier (known as the
1586 * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
1587 * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
1588 * coarse bucket designated by the 4 MSBs of the fine bucket value.
1590 * The structure is aligned on 16 bits, which is a prerequisite for SIMD
1591 * instructions. Each bucket is 16 bit wide, which means that extra care must be
1592 * taken to prevent overflow.
// SSE2 flavor of the 16-bin histogram update used by the O(1) median filter.
// Each histogram tier is 16 x 16-bit buckets = two 128-bit lanes; both
// pointers must be 16-byte aligned (buckets are allocated with alignPtr).
1602 #define MEDIAN_HAVE_SIMD 1
// y[i] += x[i] for all 16 buckets.
1604 static inline void histogram_add_simd( const HT x[16], HT y[16] )
1606 const __m128i* rx = (const __m128i*)x;
1607 __m128i* ry = (__m128i*)y;
1608 __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
1609 __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
1610 _mm_store_si128(ry+0, r0);
1611 _mm_store_si128(ry+1, r1);
// y[i] -= x[i] for all 16 buckets (operand order: y minus x).
1614 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
1616 const __m128i* rx = (const __m128i*)x;
1617 __m128i* ry = (__m128i*)y;
1618 __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
1619 __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
1620 _mm_store_si128(ry+0, r0);
1621 _mm_store_si128(ry+1, r1);
// NEON flavor of the 16-bin histogram addition: y[i] += x[i] over two
// 8-lane uint16x8 vectors.  Addition is commutative, so the (x, y) operand
// order here is harmless (unlike subtraction, which must be y - x).
1625 #define MEDIAN_HAVE_SIMD 1
1627 static inline void histogram_add_simd( const HT x[16], HT y[16] )
1629 vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
1630 vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
// NEON flavor of the 16-bin histogram subtraction: y[i] -= x[i] over two
// 8-lane uint16x8 vectors.
// BUG FIX: vsubq_u16 is not commutative; the operands must be (y, x) so that
// the result is y - x, matching the scalar histogram_sub (y[i] = y[i] - x[i])
// and the SSE2 histogram_sub_simd above.  The previous code computed x - y,
// corrupting the running histograms on NEON builds.
1633 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
1635 vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
1636 vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
// Portable fallback when neither SSE2 nor NEON is available.
1640 #define MEDIAN_HAVE_SIMD 0
// y[i] += x[i] for all 16 buckets.
1644 static inline void histogram_add( const HT x[16], HT y[16] )
1647 for( i = 0; i < 16; ++i )
1648 y[i] = (HT)(y[i] + x[i]);
// y[i] -= x[i] for all 16 buckets.
1651 static inline void histogram_sub( const HT x[16], HT y[16] )
1654 for( i = 0; i < 16; ++i )
1655 y[i] = (HT)(y[i] - x[i]);
// y[i] += a * x[i]; a may be negative (used to add/remove weighted columns).
1658 static inline void histogram_muladd( int a, const HT x[16],
1661 for( int i = 0; i < 16; ++i )
1662 y[i] = (HT)(y[i] + a * x[i]);
// Constant-time (per pixel, w.r.t. kernel radius) median filter for 8-bit
// images, after Perreault & Hebert's two-tier histogram algorithm.  The image
// is processed in vertical stripes; per-column coarse (16-bin) and fine
// (256-bin) histograms are maintained and combined into a per-pixel kernel
// histogram by sliding-window adds/subs.  The SIMD and scalar branches below
// are intentionally parallel copies of the same logic.
1666 medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
1669 * HOP is short for Histogram OPeration. This macro makes an operation \a op on
1670 * histogram \a h for pixel value \a x. It takes care of handling both levels.
1672 #define HOP(h,x,op) \
1673 h.coarse[x>>4] op, \
1674 *((HT*)h.fine + x) op
1676 #define COP(c,j,x,op) \
1677 h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
1678 h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op
// r is the kernel radius; n (set per stripe) is stripe width plus 2*r margin.
1680 int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
1681 size_t sstep = _src.step, dstep = _dst.step;
// Per-channel kernel histogram and "last update column" bookkeeping, aligned
// for the SIMD loads/stores above.
1682 Histogram CV_DECL_ALIGNED(16) H[4];
1683 HT CV_DECL_ALIGNED(16) luc[4][16];
1685 int STRIPE_SIZE = std::min( _dst.cols, 512/cn );
// +16 elements of slack so alignPtr can round the base up to 16 bytes.
1687 std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
1688 std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
1689 HT* h_coarse = alignPtr(&_h_coarse[0], 16);
1690 HT* h_fine = alignPtr(&_h_fine[0], 16);
1691 #if MEDIAN_HAVE_SIMD
1692 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
// Process the image stripe by stripe.
1695 for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
1697 int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
1698 const uchar* src = _src.ptr() + x*cn;
1699 uchar* dst = _dst.ptr() + (x - r)*cn;
1701 memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
1702 memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );
1704 // First row initialization
// Top border rows are replicated with weight r+2 into the column histograms.
1705 for( c = 0; c < cn; c++ )
1707 for( j = 0; j < n; j++ )
1708 COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );
1710 for( i = 1; i < r; i++ )
1712 const uchar* p = src + sstep*std::min(i, m-1);
1713 for ( j = 0; j < n; j++ )
1714 COP( c, j, p[cn*j+c], ++ );
// Main row loop: slide the vertical window one row at a time.
1718 for( i = 0; i < m; i++ )
// p0 = row leaving the window (clamped), p1 = row entering (clamped).
1720 const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
1721 const uchar* p1 = src + sstep * std::min( m-1, i+r );
1723 memset( H, 0, cn*sizeof(H[0]) );
1724 memset( luc, 0, cn*sizeof(luc[0]) );
1725 for( c = 0; c < cn; c++ )
1727 // Update column histograms for the entire row.
1728 for( j = 0; j < n; j++ )
1730 COP( c, j, p0[j*cn + c], -- );
1731 COP( c, j, p1[j*cn + c], ++ );
1734 // First column initialization
1735 for( k = 0; k < 16; ++k )
1736 histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );
// --- SIMD branch (identical logic to the scalar branch further down) ---
1738 #if MEDIAN_HAVE_SIMD
1741 for( j = 0; j < 2*r; ++j )
1742 histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );
1744 for( j = r; j < n-r; j++ )
// t = rank of the median within the (2r+1)^2 window.
1746 int t = 2*r*r + 2*r, b, sum = 0;
1749 histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
1751 // Find median at coarse level
1752 for ( k = 0; k < 16 ; ++k )
1754 sum += H[c].coarse[k];
1757 sum -= H[c].coarse[k];
1763 /* Update corresponding histogram segment */
// Lazily rebuild or incrementally roll the fine histogram of segment k,
// depending on how stale it is (tracked per segment in luc).
1764 if ( luc[c][k] <= j-r )
1766 memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
1767 for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
1768 histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
1770 if ( luc[c][k] < j+r+1 )
1772 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
1773 luc[c][k] = (HT)(j+r+1);
1778 for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
1780 histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
1781 histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
1785 histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
1787 /* Find median in segment */
1788 segment = H[c].fine[k];
1789 for ( b = 0; b < 16 ; b++ )
// Median value = 16*coarse_bin + fine_bin.
1794 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
// --- Scalar branch (mirror of the SIMD branch above) ---
1804 for( j = 0; j < 2*r; ++j )
1805 histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
1807 for( j = r; j < n-r; j++ )
1809 int t = 2*r*r + 2*r, b, sum = 0;
1812 histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
1814 // Find median at coarse level
1815 for ( k = 0; k < 16 ; ++k )
1817 sum += H[c].coarse[k];
1820 sum -= H[c].coarse[k];
1826 /* Update corresponding histogram segment */
1827 if ( luc[c][k] <= j-r )
1829 memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
1830 for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
1831 histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
1833 if ( luc[c][k] < j+r+1 )
1835 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
1836 luc[c][k] = (HT)(j+r+1);
1841 for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
1843 histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
1844 histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
1848 histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
1850 /* Find median in segment */
1851 segment = H[c].fine[k];
1852 for ( b = 0; b < 16 ; b++ )
1857 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
// O(m)-per-pixel histogram median filter for 8-bit images with kernel size m
// (fragment: several declarations and loop bodies are elided).  It walks each
// column top-to-bottom (or bottom-to-top, flipping the step signs) while
// maintaining a coarse (zone0) and fine (zone1) histogram of the kernel window.
1873 medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
1880 Size size = _dst.size();
1881 const uchar* src = _src.ptr();
1882 uchar* dst = _dst.ptr();
1883 int src_step = (int)_src.step, dst_step = (int)_dst.step;
1884 int cn = _src.channels();
1885 const uchar* src_max = src + size.height*src_step;
1887 #define UPDATE_ACC01( pix, cn, op ) \
1891 zone0[cn][p >> 4] op; \
1894 //CV_Assert( size.height >= nx && size.width >= nx );
// One pass per output column.
1895 for( x = 0; x < size.width; x++, src += cn, dst += cn )
1897 uchar* dst_cur = dst;
1898 const uchar* src_top = src;
1899 const uchar* src_bottom = src;
1901 int src_step1 = src_step, dst_step1 = dst_step;
// Alternate-direction start: begin at the bottom and negate the steps
// (condition line elided in this excerpt).
1905 src_bottom = src_top += src_step*(size.height-1);
1906 dst_cur += dst_step*(size.height-1);
1907 src_step1 = -src_step1;
1908 dst_step1 = -dst_step1;
1912 memset( zone0, 0, sizeof(zone0[0])*cn );
1913 memset( zone1, 0, sizeof(zone1[0])*cn );
// Seed the histograms with the first m/2+1 rows; the first row is weighted
// m/2+1 to emulate border replication.
1915 for( y = 0; y <= m/2; y++ )
1917 for( c = 0; c < cn; c++ )
1921 for( k = 0; k < m*cn; k += cn )
1922 UPDATE_ACC01( src_bottom[k+c], c, ++ );
1926 for( k = 0; k < m*cn; k += cn )
1927 UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
1931 if( (src_step1 > 0 && y < size.height-1) ||
1932 (src_step1 < 0 && size.height-y-1 > 0) )
1933 src_bottom += src_step1;
// Main loop down the column: find the median, then roll the window.
1936 for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
1939 for( c = 0; c < cn; c++ )
// Coarse scan (zone0) narrows to a 16-value range, fine scan picks k.
1944 int t = s + zone0[c][k];
1955 dst_cur[c] = (uchar)k;
1958 if( y+1 == size.height )
1963 for( k = 0; k < m; k++ )
1966 int q = src_bottom[k];
// Unrolled 3-channel histogram update: remove the leaving row, add the
// entering row.
1975 for( k = 0; k < m*3; k += 3 )
1977 UPDATE_ACC01( src_top[k], 0, -- );
1978 UPDATE_ACC01( src_top[k+1], 1, -- );
1979 UPDATE_ACC01( src_top[k+2], 2, -- );
1981 UPDATE_ACC01( src_bottom[k], 0, ++ );
1982 UPDATE_ACC01( src_bottom[k+1], 1, ++ );
1983 UPDATE_ACC01( src_bottom[k+2], 2, ++ );
// Unrolled 4-channel variant of the same update.
1989 for( k = 0; k < m*4; k += 4 )
1991 UPDATE_ACC01( src_top[k], 0, -- );
1992 UPDATE_ACC01( src_top[k+1], 1, -- );
1993 UPDATE_ACC01( src_top[k+2], 2, -- );
1994 UPDATE_ACC01( src_top[k+3], 3, -- );
1996 UPDATE_ACC01( src_bottom[k], 0, ++ );
1997 UPDATE_ACC01( src_bottom[k+1], 1, ++ );
1998 UPDATE_ACC01( src_bottom[k+2], 2, ++ );
1999 UPDATE_ACC01( src_bottom[k+3], 3, ++ );
// Advance the window edges, clamping at the image borders.
2003 if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
2004 (src_step1 < 0 && src_bottom + src_step1 >= src) )
2005 src_bottom += src_step1;
2008 src_top += src_step1;
// Min/max "compare-exchange" traits used by medianBlur_SortNet.  Each struct
// provides load/store and an operator() that sorts a pair in place: after
// op(a, b), a <= b element-wise.  Scalar versions first, then SSE2 and NEON
// vector versions; struct header lines are elided in this excerpt.
// --- scalar MinMax8u: branchless via CV_FAST_CAST_8U clamp trick ---
2018 typedef uchar value_type;
2019 typedef int arg_type;
2021 arg_type load(const uchar* ptr) { return *ptr; }
2022 void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
2023 void operator()(arg_type& a, arg_type& b) const
2025 int t = CV_FAST_CAST_8U(a - b);
// --- scalar MinMax16u ---
2032 typedef ushort value_type;
2033 typedef int arg_type;
2035 arg_type load(const ushort* ptr) { return *ptr; }
2036 void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
2037 void operator()(arg_type& a, arg_type& b) const
// --- scalar MinMax16s ---
2047 typedef short value_type;
2048 typedef int arg_type;
2050 arg_type load(const short* ptr) { return *ptr; }
2051 void store(short* ptr, arg_type val) { *ptr = (short)val; }
2052 void operator()(arg_type& a, arg_type& b) const
// --- scalar MinMax32f ---
2062 typedef float value_type;
2063 typedef float arg_type;
2065 arg_type load(const float* ptr) { return *ptr; }
2066 void store(float* ptr, arg_type val) { *ptr = val; }
2067 void operator()(arg_type& a, arg_type& b) const
// --- SSE2 MinMaxVec8u: 16 pixels at a time via epu8 min/max ---
2079 typedef uchar value_type;
2080 typedef __m128i arg_type;
2082 arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
2083 void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
2084 void operator()(arg_type& a, arg_type& b) const
2087 a = _mm_min_epu8(a, b);
2088 b = _mm_max_epu8(b, t);
// --- SSE2 MinMaxVec16u: no epu16 min/max in SSE2, so use the saturating
// subtract/add identity: t = max(a-b,0); min = a-t; max = b+t ---
2095 typedef ushort value_type;
2096 typedef __m128i arg_type;
2098 arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
2099 void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
2100 void operator()(arg_type& a, arg_type& b) const
2102 arg_type t = _mm_subs_epu16(a, b);
2103 a = _mm_subs_epu16(a, t);
2104 b = _mm_adds_epu16(b, t);
// --- SSE2 MinMaxVec16s ---
2111 typedef short value_type;
2112 typedef __m128i arg_type;
2114 arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
2115 void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
2116 void operator()(arg_type& a, arg_type& b) const
2119 a = _mm_min_epi16(a, b);
2120 b = _mm_max_epi16(b, t);
// --- SSE2 MinMaxVec32f ---
2127 typedef float value_type;
2128 typedef __m128 arg_type;
2130 arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
2131 void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
2132 void operator()(arg_type& a, arg_type& b) const
2135 a = _mm_min_ps(a, b);
2136 b = _mm_max_ps(b, t);
// --- NEON MinMaxVec8u ---
2144 typedef uchar value_type;
2145 typedef uint8x16_t arg_type;
2147 arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
2148 void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
2149 void operator()(arg_type& a, arg_type& b) const
// --- NEON MinMaxVec16u ---
2160 typedef ushort value_type;
2161 typedef uint16x8_t arg_type;
2163 arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
2164 void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
2165 void operator()(arg_type& a, arg_type& b) const
2168 a = vminq_u16(a, b);
2169 b = vmaxq_u16(b, t);
// --- NEON MinMaxVec16s ---
2176 typedef short value_type;
2177 typedef int16x8_t arg_type;
2179 arg_type load(const short* ptr) { return vld1q_s16(ptr); }
2180 void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
2181 void operator()(arg_type& a, arg_type& b) const
2184 a = vminq_s16(a, b);
2185 b = vmaxq_s16(b, t);
// --- NEON MinMaxVec32f ---
2192 typedef float value_type;
2193 typedef float32x4_t arg_type;
2195 arg_type load(const float* ptr) { return vld1q_f32(ptr); }
2196 void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
2197 void operator()(arg_type& a, arg_type& b) const
2200 a = vminq_f32(a, b);
2201 b = vmaxq_f32(b, t);
// No vector unit available: the "Vec" types alias the scalar ops so that
// medianBlur_SortNet compiles unchanged.
2208 typedef MinMax8u MinMaxVec8u;
2209 typedef MinMax16u MinMaxVec16u;
2210 typedef MinMax16s MinMaxVec16s;
2211 typedef MinMax32f MinMaxVec32f;
// Median filter via fixed sorting networks for kernel sizes 3 and 5.
// Op is the scalar min/max trait, VecOp the SIMD trait (see MinMax* above).
// The comparator sequences below are standard optimal sorting networks for
// 9 and 25 elements; the order of op() calls is load-bearing — do not reorder.
2215 template<class Op, class VecOp>
2217 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
2219 typedef typename Op::value_type T;
2220 typedef typename Op::arg_type WT;
2221 typedef typename VecOp::arg_type VT;
2223 const T* src = _src.ptr<T>();
2224 T* dst = _dst.ptr<T>();
2225 int sstep = (int)(_src.step/sizeof(T));
2226 int dstep = (int)(_dst.step/sizeof(T));
2227 Size size = _dst.size();
2228 int i, j, k, cn = _src.channels();
2231 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
// ---------------- m == 3 path ----------------
// Degenerate 1-D images: 3-element median along the single row/column.
2235 if( size.width == 1 || size.height == 1 )
2237 int len = size.width + size.height - 1;
2238 int sdelta = size.height == 1 ? cn : sstep;
2239 int sdelta0 = size.height == 1 ? 0 : sstep - cn;
2240 int ddelta = size.height == 1 ? cn : dstep;
2242 for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
2243 for( j = 0; j < cn; j++, src++ )
// Border handling: clamp neighbors at both ends.
2245 WT p0 = src[i > 0 ? -sdelta : 0];
2247 WT p2 = src[i < len - 1 ? sdelta : 0];
2249 op(p0, p1); op(p1, p2); op(p0, p1);
// 2-D case: 3x3 window, scalar edges + vectorized interior.
2256 for( i = 0; i < size.height; i++, dst += dstep )
2258 const T* row0 = src + std::max(i - 1, 0)*sstep;
2259 const T* row1 = src + i*sstep;
2260 const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
// With SIMD, only the first cn pixels are done in scalar; the vector loop
// below handles the rest (plus an elided tail).
2261 int limit = useSIMD ? cn : size.width;
2265 for( ; j < limit; j++ )
2267 int j0 = j >= cn ? j - cn : j;
2268 int j2 = j < size.width - cn ? j + cn : j;
2269 WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
2270 WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
2271 WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
// 9-element sorting network; p4 ends up holding the median.
2273 op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
2274 op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
2275 op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
2276 op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
2277 op(p4, p2); op(p6, p4); op(p4, p2);
2281 if( limit == size.width )
// Vectorized interior: same network on VecOp::SIZE pixels per iteration.
2284 for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
2286 VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
2287 VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
2288 VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);
2290 vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
2291 vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
2292 vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
2293 vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
2294 vop(p4, p2); vop(p6, p4); vop(p4, p2);
2295 vop.store(dst+j, p4);
// ---------------- m == 5 path ----------------
// Degenerate 1-D images: 5-element median with clamped neighbors.
2304 if( size.width == 1 || size.height == 1 )
2306 int len = size.width + size.height - 1;
2307 int sdelta = size.height == 1 ? cn : sstep;
2308 int sdelta0 = size.height == 1 ? 0 : sstep - cn;
2309 int ddelta = size.height == 1 ? cn : dstep;
2311 for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
2312 for( j = 0; j < cn; j++, src++ )
2314 int i1 = i > 0 ? -sdelta : 0;
2315 int i0 = i > 1 ? -sdelta*2 : i1;
2316 int i3 = i < len-1 ? sdelta : 0;
2317 int i4 = i < len-2 ? sdelta*2 : i3;
2318 WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
2320 op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
2321 op(p2, p4); op(p1, p3); op(p1, p2);
// 2-D case: 5x5 window, scalar edges + vectorized interior.
2328 for( i = 0; i < size.height; i++, dst += dstep )
2331 row[0] = src + std::max(i - 2, 0)*sstep;
2332 row[1] = src + std::max(i - 1, 0)*sstep;
2333 row[2] = src + i*sstep;
2334 row[3] = src + std::min(i + 1, size.height-1)*sstep;
2335 row[4] = src + std::min(i + 2, size.height-1)*sstep;
2336 int limit = useSIMD ? cn*2 : size.width;
2340 for( ; j < limit; j++ )
2343 int j1 = j >= cn ? j - cn : j;
2344 int j0 = j >= cn*2 ? j - cn*2 : j1;
2345 int j3 = j < size.width - cn ? j + cn : j;
2346 int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
2347 for( k = 0; k < 5; k++ )
2349 const T* rowk = row[k];
2350 p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
2351 p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
2352 p[k*5+4] = rowk[j4];
// 25-element sorting network; p[12] ends up holding the median.
2355 op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
2356 op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
2357 op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
2358 op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
2359 op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
2360 op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
2361 op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
2362 op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
2363 op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
2364 op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
2365 op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
2366 op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
2367 op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
2368 op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
2369 op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
2370 op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
2371 op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
2372 op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
2373 op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
2374 op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
2375 op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
2376 op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
2377 op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
2381 if( limit == size.width )
// Vectorized interior: identical network on VecOp::SIZE pixels at once.
2384 for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
2387 for( k = 0; k < 5; k++ )
2389 const T* rowk = row[k];
2390 p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
2391 p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
2392 p[k*5+4] = vop.load(rowk+j+cn*2);
2395 vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
2396 vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
2397 vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
2398 vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
2399 vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
2400 vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
2401 vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
2402 vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
2403 vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
2404 vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
2405 vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
2406 vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
2407 vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
2408 vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
2409 vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
2410 vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
2411 vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
2412 vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
2413 vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
2414 vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
2415 vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
2416 vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
2417 vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
2418 vop.store(dst+j, p[12]);
// OpenCL median filter dispatcher.  Supports 8U/16U/16S/32F with up to 4
// channels and kernel sizes 3 and 5 only; returns false to fall back to the
// CPU path otherwise.  An Intel-specific "4 pixels per work-item" kernel is
// selected for large single-channel images with dimensions divisible by 4.
2429 static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
2431 size_t localsize[2] = { 16, 16 };
2432 size_t globalsize[2];
2433 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2435 if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
2438 Size imgSize = _src.size();
2439 bool useOptimized = (1 == cn) &&
2440 (size_t)imgSize.width >= localsize[0] * 8 &&
2441 (size_t)imgSize.height >= localsize[1] * 8 &&
2442 imgSize.width % 4 == 0 &&
2443 imgSize.height % 4 == 0 &&
2444 (ocl::Device::getDefault().isIntel());
// Kernel name/defines differ between the optimized and generic variants.
2446 cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ;
2447 cv::String kdefs = useOptimized ?
2448 format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
2449 ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
2451 format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ;
2453 ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );
2458 UMat src = _src.getUMat();
2459 _dst.create(src.size(), type);
2460 UMat dst = _dst.getUMat();
2462 k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
// Optimized kernel: each work-item covers a 4x4 pixel tile.
2466 globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
2467 globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
// Generic kernel: one work-item per pixel, rounded up to the local size.
2471 globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
2472 globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
2475 return k.run(2, globalsize, localsize, false);
// Public entry point for median blur. Dispatches, in order of preference:
// OpenCL kernel -> IPP (when available) -> Tegra -> sorting-network SIMD path
// (ksize 3/5) -> histogram-based O(m)/O(1) 8-bit algorithms.
// NOTE: listing is elided — braces/else-branches between some lines are not shown.
2482 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
// Aperture must be odd; only 2-D inputs are handled.
2484 CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
2492 CV_OCL_RUN(_dst.isUMat(),
2493 ocl_medianFilter(_src0,_dst, ksize))
2495 Mat src0 = _src0.getMat();
2496 _dst.create( src0.size(), src0.type() );
2497 Mat dst = _dst.getMat();
2499 #if IPP_VERSION_X100 >= 801
// Macro wrapping the IPP border-replicating median call for one pixel type.
// On success it records the IPP implementation; on failure it resets IPP
// error status so the generic path below runs.
2502 #define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
2505 if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
2506 ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
2508 Ipp8u * buffer = ippsMalloc_8u(bufSize); \
2509 IppStatus status = ippiFilterMedianBorder_##flavor(src.ptr<ippType>(), (int)src.step, \
2510 dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
2511 ippBorderRepl, (ippType)0, buffer); \
2515 CV_IMPL_ADD(CV_IMPL_IPP); \
2519 setIppErrorStatus(); \
2526 IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
// In-place call needs a copy of the source (elided body not shown).
2528 if( dst.data != src0.data )
2533 int type = src0.type();
2534 if (type == CV_8UC1)
2535 IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
2536 else if (type == CV_16UC1)
2537 IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
2538 else if (type == CV_16SC1)
2539 IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
2540 else if (type == CV_32FC1)
2541 IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
2543 #undef IPP_FILTER_MEDIAN_BORDER
2547 #ifdef HAVE_TEGRA_OPTIMIZATION
2548 if (tegra::medianBlur(src0, dst, ksize))
// Sorting network handles ksize==3 always; ksize==5 only for depth > 8U
// when no SSE2/NEON is available (with SIMD, 5x5 8U goes through it too).
2552 bool useSortNet = ksize == 3 || (ksize == 5
2553 #if !(CV_SSE2 || CV_NEON)
2554 && src0.depth() > CV_8U
2561 if( dst.data != src0.data )
2566 if( src.depth() == CV_8U )
2567 medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
2568 else if( src.depth() == CV_16U )
2569 medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
2570 else if( src.depth() == CV_16S )
2571 medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
2572 else if( src.depth() == CV_32F )
2573 medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
2575 CV_Error(CV_StsUnsupportedFormat, "");
// Histogram-based paths need replicated left/right borders of ksize/2.
2581 cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );
2583 int cn = src0.channels();
2584 CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );
// Heuristic: for small apertures (threshold scales down with image size and
// up with SIMD support) the O(m) algorithm beats the constant-time one.
2586 double img_size_mp = (double)(src0.total())/(1 << 20);
2587 if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
2588 (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
2589 medianBlur_8u_Om( src, dst, ksize );
2591 medianBlur_8u_O1( src, dst, ksize );
2595 /****************************************************************************************\
2597 \****************************************************************************************/
// Row-parallel body for 8-bit bilateral filtering. Each worker filters rows
// [range.start, range.end) of `dest` using the pre-padded `temp` image and the
// precomputed spatial offsets/weights and color-distance weight LUT.
// NOTE: listing is elided — some lines (braces, #if guards) are not shown.
2602 class BilateralFilter_8u_Invoker :
2603 public ParallelLoopBody
// All tables are owned by the caller (bilateralFilter_8u) and must outlive
// the parallel_for_ invocation; this class stores raw pointers only.
2606 BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
2607 int* _space_ofs, float *_space_weight, float *_color_weight) :
2608 temp(&_temp), dest(&_dest), radius(_radius),
2609 maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
2613 virtual void operator() (const Range& range) const
2615 int i, j, cn = dest->channels(), k;
2616 Size size = dest->size();
// 16-byte aligned scratch for SSE stores; bufSignMask clears the sign bit
// (used with andnot to compute |x| of packed floats).
2618 int CV_DECL_ALIGNED(16) buf[4];
2619 float CV_DECL_ALIGNED(16) bufSum[4];
2620 static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
2621 bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
2624 for( i = range.start; i < range.end; i++ )
// sptr points at the pixel of `temp` corresponding to dest row i, col 0.
2626 const uchar* sptr = temp->ptr(i+radius) + radius*cn;
2627 uchar* dptr = dest->ptr(i);
// --- single-channel path ---
2631 for( j = 0; j < size.width; j++ )
2633 float sum = 0, wsum = 0;
// SSE3 block: process 4 neighbors per iteration.
2639 __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
2640 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
2642 for( ; k <= maxk - 4; k += 4 )
2644 __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
2645 sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
// |val - val0| -> integer index into the color weight LUT.
2647 __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
2648 _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
2650 __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
2651 color_weight[buf[1]],color_weight[buf[0]]);
2652 __m128 _sw = _mm_loadu_ps(space_weight+k);
// w = color_weight * space_weight; accumulate sum(w) and sum(w*val)
// via two horizontal adds into bufSum[0]/bufSum[1].
2653 __m128 _w = _mm_mul_ps(_cw, _sw);
2654 _cw = _mm_mul_ps(_w, _valF);
2656 _sw = _mm_hadd_ps(_w, _cw);
2657 _sw = _mm_hadd_ps(_sw, _sw);
2658 _mm_storel_pi((__m64*)bufSum, _sw);
// Scalar tail for the remaining neighbors.
2665 for( ; k < maxk; k++ )
2667 int val = sptr[j + space_ofs[k]];
2668 float w = space_weight[k]*color_weight[std::abs(val - val0)];
2672 // overflow is not possible here => there is no need to use cv::saturate_cast
2673 dptr[j] = (uchar)cvRound(sum/wsum);
// --- 3-channel (BGR) path ---
2679 for( j = 0; j < size.width*3; j += 3 )
2681 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
2682 int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
2687 const __m128i izero = _mm_setzero_si128();
2688 const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
2689 const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
2690 const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
2691 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
2693 for( ; k <= maxk - 4; k += 4 )
// Load 4 bytes (b,g,r,+1 extra) for each of 4 neighbors, widen u8->s32->f32.
2695 const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
2696 const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
2697 const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
2698 const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
2700 __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
2701 __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
2702 __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
2703 __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
// Transpose from per-pixel layout to per-channel lanes (b,g,r,z vectors).
2705 _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
// L1 color distance |db|+|dg|+|dr| indexes the color weight LUT.
2707 __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
2708 __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
2709 __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
2711 bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
2712 _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
2714 __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
2715 color_weight[buf[1]],color_weight[buf[0]]);
2716 __m128 _sw = _mm_loadu_ps(space_weight+k);
2718 _w = _mm_mul_ps(_w,_sw);
2719 _b = _mm_mul_ps(_b, _w);
2720 _g = _mm_mul_ps(_g, _w);
2721 _r = _mm_mul_ps(_r, _w);
// Three hadds reduce the four accumulators into bufSum[0..3]
// = (sum w, sum w*b, sum w*g, sum w*r).
2723 _w = _mm_hadd_ps(_w, _b);
2724 _g = _mm_hadd_ps(_g, _r);
2726 _w = _mm_hadd_ps(_w, _g);
2727 _mm_store_ps(bufSum, _w);
2737 for( ; k < maxk; k++ )
2739 const uchar* sptr_k = sptr + j + space_ofs[k];
2740 int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
2741 float w = space_weight[k]*color_weight[std::abs(b - b0) +
2742 std::abs(g - g0) + std::abs(r - r0)];
2743 sum_b += b*w; sum_g += g*w; sum_r += r*w;
// NOTE(review): sums are multiplied by wsum here — presumably wsum was
// replaced by its reciprocal in an elided line; verify against full source.
2747 b0 = cvRound(sum_b*wsum);
2748 g0 = cvRound(sum_g*wsum);
2749 r0 = cvRound(sum_r*wsum);
2750 dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
2759 int radius, maxk, *space_ofs;
2760 float *space_weight, *color_weight;
// IPP-backed row-parallel bilateral filter body. Note the trailing "&& 0":
// this whole block is compiled out (deliberately disabled in this revision).
// NOTE: listing is elided — some lines (braces, early returns) are not shown.
2763 #if defined (HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) && 0
2764 class IPPBilateralFilter_8u_Invoker :
2765 public ParallelLoopBody
// `ok` is a shared flag the workers clear on IPP failure so the caller
// can fall back to the generic implementation.
2768 IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
2769 ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
2774 virtual void operator() (const Range& range) const
2776 int d = radius * 2 + 1;
2777 IppiSize kernel = {d, d};
// Each worker processes a horizontal strip of rows.
2778 IppiSize roi={dst.cols, range.end - range.start};
2780 if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
2785 AutoBuffer<uchar> buf(bufsize);
2786 IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
2787 if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
// Source pointer is advanced by radius rows+cols to skip the padded border.
2792 if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
2796 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Non-copyable: assignment deleted via private declaration (pre-C++11 idiom).
2806 const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
// OpenCL implementation of 8-bit bilateral filtering.
// Returns false for unsupported inputs so the caller falls back to CPU.
// NOTE: listing is elided — some lines (braces, early returns) are not shown.
2812 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
2813 double sigma_color, double sigma_space,
2816 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2817 int i, j, maxk, radius;
2819 if (depth != CV_8U || cn > 4)
// Non-positive sigmas are replaced by defaults (replacement lines elided).
2822 if (sigma_color <= 0)
2824 if (sigma_space <= 0)
2827 double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
2828 double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
// d <= 0 means "derive radius from sigma_space" (branching elided).
2831 radius = cvRound(sigma_space * 1.5);
2834 radius = MAX(radius, 1);
2837 UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
// Pad by `radius` on all sides so the kernel can read neighbors unchecked.
2841 copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
2842 std::vector<float> _space_weight(d * d);
2843 std::vector<int> _space_ofs(d * d);
2844 float * const space_weight = &_space_weight[0];
2845 int * const space_ofs = &_space_ofs[0];
2847 // initialize space-related bilateral filter coefficients
// Only offsets within the circular radius are kept (guard condition elided);
// maxk counts the accepted taps.
2848 for( i = -radius, maxk = 0; i <= radius; i++ )
2849 for( j = -radius; j <= radius; j++ )
2851 double r = std::sqrt((double)i * i + (double)j * j);
2854 space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
// Offset is in bytes/elements into the padded `temp` (step is its row stride).
2855 space_ofs[maxk++] = (int)(i * temp.step + j * cn);
2859 String cnstr = cn > 1 ? format("%d", cn) : "";
2860 String kernelName("bilateral");
// Intel GPUs get a float4 vectorized kernel when width is a multiple of 4.
2862 if ((ocl::Device::getDefault().isIntel()) &&
2863 (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
2866 if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
2868 kernelName = "bilateral_float4";
// All filter constants are baked into the kernel as compile-time defines.
2872 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
2873 format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
2874 " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=%f",
2875 radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
2876 ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
2877 ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
2878 ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
2879 ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
// Upload weight/offset tables to the device.
2883 Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
2884 Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
2885 UMat ucolor_weight, uspace_weight, uspace_ofs;
2887 mspace_weight.copyTo(uspace_weight);
2888 mspace_ofs.copyTo(uspace_ofs);
2890 k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
2891 ocl::KernelArg::PtrReadOnly(uspace_weight),
2892 ocl::KernelArg::PtrReadOnly(uspace_ofs));
// sizeDiv is presumably 4 for the float4 kernel, 1 otherwise (set in elided lines).
2894 size_t globalsize[2] = { dst.cols / sizeDiv, dst.rows };
2895 return k.run(2, globalsize, NULL, false);
// CPU driver for 8-bit bilateral filtering: builds the color-distance and
// spatial weight tables, pads the source, and runs the parallel invoker.
// NOTE: listing is elided — some lines (braces, default-sigma branches) are not shown.
2900 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
2901 double sigma_color, double sigma_space,
2904 int cn = src.channels();
2905 int i, j, maxk, radius;
2906 Size size = src.size();
// Only 1- and 3-channel 8-bit images; in-place operation is not supported.
2908 CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
2910 if( sigma_color <= 0 )
2912 if( sigma_space <= 0 )
2915 double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
2916 double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
// d <= 0: derive radius from sigma_space (branch structure elided).
2919 radius = cvRound(sigma_space*1.5);
2922 radius = MAX(radius, 1);
// Pad by radius on all sides so neighbor reads need no bounds checks.
2926 copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
// Disabled IPP path (note the "&& 0"); falls through to the generic code.
2928 #if defined HAVE_IPP && (IPP_VERSION_MAJOR >= 7) && 0
2934 IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
2935 parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
2938 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2941 setIppErrorStatus();
// color_weight[i] = exp(-i^2 / (2*sigma_color^2)), indexed by L1 color distance
// (max distance is 255*cn, hence cn*256 entries).
2946 std::vector<float> _color_weight(cn*256);
2947 std::vector<float> _space_weight(d*d);
2948 std::vector<int> _space_ofs(d*d);
2949 float* color_weight = &_color_weight[0];
2950 float* space_weight = &_space_weight[0];
2951 int* space_ofs = &_space_ofs[0];
2953 // initialize color-related bilateral filter coefficients
2955 for( i = 0; i < 256*cn; i++ )
2956 color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
2958 // initialize space-related bilateral filter coefficients
// Taps outside the circular radius are skipped (guard elided); maxk counts
// accepted taps and space_ofs holds byte offsets into the padded `temp`.
2959 for( i = -radius, maxk = 0; i <= radius; i++ )
2963 for( ; j <= radius; j++ )
2965 double r = std::sqrt((double)i*i + (double)j*j);
2968 space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
2969 space_ofs[maxk++] = (int)(i*temp.step + j*cn);
// Rows are processed in parallel; stripe granularity tuned by total/2^16.
2973 BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
2974 parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
// Row-parallel body for 32-bit float bilateral filtering. Unlike the 8-bit
// version, color weights come from a sampled exp LUT with linear
// interpolation between adjacent entries (expLUT + scale_index).
// NOTE: listing is elided — some lines (braces, #if guards) are not shown.
2978 class BilateralFilter_32f_Invoker :
2979 public ParallelLoopBody
// All tables are owned by bilateralFilter_32f and must outlive parallel_for_.
2983 BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
2984 const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
2985 cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
2986 temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
2990 virtual void operator() (const Range& range) const
2993 Size size = dest->size();
// Aligned scratch for SSE stores; bufSignMask clears sign bits (abs of floats).
2995 int CV_DECL_ALIGNED(16) idxBuf[4];
2996 float CV_DECL_ALIGNED(16) bufSum32[4];
2997 static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
2998 bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
3001 for( i = range.start; i < range.end; i++ )
3003 const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
3004 float* dptr = dest->ptr<float>(i);
// --- single-channel path ---
3008 for( j = 0; j < size.width; j++ )
3010 float sum = 0, wsum = 0;
3011 float val0 = sptr[j];
// SSE3 block: 4 neighbors per iteration, accumulating (wsum, sum) in psum.
3016 __m128 psum = _mm_setzero_ps();
3017 const __m128 _val0 = _mm_set1_ps(sptr[j]);
3018 const __m128 _scale_index = _mm_set1_ps(scale_index);
3019 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
3021 for( ; k <= maxk - 4 ; k += 4 )
3023 __m128 _sw = _mm_loadu_ps(space_weight + k);
3024 __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
3025 sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
// alpha = |val - val0| * scale_index: fractional index into expLUT.
3026 __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
3028 __m128i _idx = _mm_cvtps_epi32(_alpha);
3029 _mm_store_si128((__m128i*)idxBuf, _idx);
// Keep only the fractional part for the LUT interpolation below.
3030 _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
3032 __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
3033 expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
3034 __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
3035 expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
// w = space_weight * lerp(expLUT[idx], expLUT[idx+1], alpha).
3037 __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
3038 _val = _mm_mul_ps(_w, _val);
3040 _sw = _mm_hadd_ps(_w, _val);
3041 _sw = _mm_hadd_ps(_sw, _sw);
3042 psum = _mm_add_ps(_sw, psum);
3044 _mm_storel_pi((__m64*)bufSum32, psum);
// Scalar tail.
3051 for( ; k < maxk; k++ )
3053 float val = sptr[j + space_ofs[k]];
3054 float alpha = (float)(std::abs(val - val0)*scale_index);
3055 int idx = cvFloor(alpha);
3057 float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
3061 dptr[j] = (float)(sum/wsum);
// --- 3-channel path ---
3066 CV_Assert( cn == 3 );
3067 for( j = 0; j < size.width*3; j += 3 )
3069 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
3070 float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
3075 __m128 sum = _mm_setzero_ps();
3076 const __m128 _b0 = _mm_set1_ps(b0);
3077 const __m128 _g0 = _mm_set1_ps(g0);
3078 const __m128 _r0 = _mm_set1_ps(r0);
3079 const __m128 _scale_index = _mm_set1_ps(scale_index);
3080 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
3082 for( ; k <= maxk-4; k += 4 )
3084 __m128 _sw = _mm_loadu_ps(space_weight + k);
3086 const float* const sptr_k0 = sptr + j + space_ofs[k];
3087 const float* const sptr_k1 = sptr + j + space_ofs[k+1];
3088 const float* const sptr_k2 = sptr + j + space_ofs[k+2];
3089 const float* const sptr_k3 = sptr + j + space_ofs[k+3];
// Load 4 floats per neighbor, then transpose to per-channel vectors.
3091 __m128 _b = _mm_loadu_ps(sptr_k0);
3092 __m128 _g = _mm_loadu_ps(sptr_k1);
3093 __m128 _r = _mm_loadu_ps(sptr_k2);
3094 __m128 _z = _mm_loadu_ps(sptr_k3);
3095 _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
// L1 color distance scaled into LUT index space.
3097 __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
3098 __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
3099 __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
3101 __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
3103 __m128i _idx = _mm_cvtps_epi32(_alpha);
3104 _mm_store_si128((__m128i*)idxBuf, _idx);
3105 _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
3107 __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
3108 __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
3110 __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
3112 _b = _mm_mul_ps(_b, _w);
3113 _g = _mm_mul_ps(_g, _w);
3114 _r = _mm_mul_ps(_r, _w);
// Three hadds reduce to (sum w, sum w*b, sum w*g, sum w*r) in `sum`.
3116 _w = _mm_hadd_ps(_w, _b);
3117 _g = _mm_hadd_ps(_g, _r);
3119 _w = _mm_hadd_ps(_w, _g);
3120 sum = _mm_add_ps(sum, _w);
3122 _mm_store_ps(bufSum32, sum);
3124 sum_b = bufSum32[1];
3125 sum_g = bufSum32[2];
3126 sum_r = bufSum32[3];
3130 for(; k < maxk; k++ )
3132 const float* sptr_k = sptr + j + space_ofs[k];
3133 float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
3134 float alpha = (float)((std::abs(b - b0) +
3135 std::abs(g - g0) + std::abs(r - r0))*scale_index);
3136 int idx = cvFloor(alpha);
3138 float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
3139 sum_b += b*w; sum_g += g*w; sum_r += r*w;
// Final normalization by wsum happens in elided lines before this store.
3146 dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
3153 int cn, radius, maxk, *space_ofs;
3156 float scale_index, *space_weight, *expLUT;
// CPU driver for 32-bit float bilateral filtering. Builds a sampled exp LUT
// over the source's dynamic range (instead of the 8-bit exact LUT), pads the
// source, replaces NaNs, and runs the parallel invoker.
// NOTE: listing is elided — some lines (braces, default-sigma branches) are not shown.
3161 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
3162 double sigma_color, double sigma_space,
3165 int cn = src.channels();
3166 int i, j, maxk, radius;
3167 double minValSrc=-1, maxValSrc=1;
// LUT resolution: 4096 bins per channel of dynamic range.
3168 const int kExpNumBinsPerChannel = 1 << 12;
3169 int kExpNumBins = 0;
3170 float lastExpVal = 1.f;
3171 float len, scale_index;
3172 Size size = src.size();
3174 CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
3176 if( sigma_color <= 0 )
3178 if( sigma_space <= 0 )
3181 double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
3182 double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
3185 radius = cvRound(sigma_space*1.5);
3188 radius = MAX(radius, 1);
3190 // compute the min/max range for the input image (even if multichannel)
3192 minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
// Constant image: nothing to filter (handling of this case is elided).
3193 if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
3199 // temporary copy of the image with borders for easy processing
3201 copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
3202 const double insteadNaNValue = -5. * sigma_color;
3203 patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
3204 // TODO: make insteadNaNValue avalible in the outside function interface to control the cases breaking the assumption
3205 // allocate lookup tables
3206 std::vector<float> _space_weight(d*d);
3207 std::vector<int> _space_ofs(d*d);
3208 float* space_weight = &_space_weight[0];
3209 int* space_ofs = &_space_ofs[0];
3211 // assign a length which is slightly more than needed
3212 len = (float)(maxValSrc - minValSrc) * cn;
3213 kExpNumBins = kExpNumBinsPerChannel * cn;
// +2 entries give headroom for the idx+1 read in the LUT interpolation.
3214 std::vector<float> _expLUT(kExpNumBins+2);
3215 float* expLUT = &_expLUT[0];
3217 scale_index = kExpNumBins/len;
3219 // initialize the exp LUT
// Once the Gaussian underflows to 0, the remaining entries stay 0
// (the else branch zeroing them is elided).
3220 for( i = 0; i < kExpNumBins+2; i++ )
3222 if( lastExpVal > 0.f )
3224 double val = i / scale_index;
3225 expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
3226 lastExpVal = expLUT[i];
3232 // initialize space-related bilateral filter coefficients
// Taps outside the circular radius are skipped (guard elided).
3233 for( i = -radius, maxk = 0; i <= radius; i++ )
3234 for( j = -radius; j <= radius; j++ )
3236 double r = std::sqrt((double)i*i + (double)j*j);
3239 space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
// Offsets are in float elements, hence step divided by sizeof(float).
3240 space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
3243 // parallel_for usage
3245 BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
3246 parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
// Public entry point for bilateral filtering: tries the OpenCL 8-bit path,
// otherwise dispatches on depth to the 8u/32f CPU implementations.
// NOTE: listing is elided — some lines (braces) are not shown.
3251 void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
3252 double sigmaColor, double sigmaSpace,
3255 _dst.create( _src.size(), _src.type() );
// OpenCL path only attempted for 2-D UMat destinations; the helper itself
// rejects non-8U inputs and returns false to fall through to CPU.
3257 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
3258 ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
3260 Mat src = _src.getMat(), dst = _dst.getMat();
3262 if( src.depth() == CV_8U )
3263 bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
3264 else if( src.depth() == CV_32F )
3265 bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
3267 CV_Error( CV_StsUnsupportedFormat,
3268 "Bilateral filtering is only implemented for 8u and 32f images" );
3271 //////////////////////////////////////////////////////////////////////////////////////////
3274 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
3275 int param1, int param2, double param3, double param4 )
3277 cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
3279 CV_Assert( dst.size() == src.size() &&
3280 (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );
3285 if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
3286 cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1),
3287 smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
3288 else if( smooth_type == CV_GAUSSIAN )
3289 cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
3290 else if( smooth_type == CV_MEDIAN )
3291 cv::medianBlur( src, dst, param1 );
3293 cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
3295 if( dst.data != dst0.data )
3296 CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );