1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
44 #include "precomp.hpp"
45 #include "opencl_kernels_imgproc.hpp"
48 #define IVX_HIDE_INFO_WARNINGS
49 #define IVX_USE_OPENCV
53 * This file includes the code, contributed by Simon Perreault
54 * (the function icvMedianBlur_8u_O1)
56 * Constant-time median filtering -- http://nomis80.org/ctmf.html
57 * Copyright (C) 2006 Simon Perreault
60 * Laboratoire de vision et systemes numeriques
61 * Pavillon Adrien-Pouliot
63 * Sainte-Foy, Quebec, Canada
66 * perreaul@gel.ulaval.ca
72 /****************************************************************************************\
74 \****************************************************************************************/
// RowSum<T, ST>: horizontal running-sum row filter used by boxFilter().
// Accumulates 'ksize' consecutive pixels of a row (per channel) into the
// wider type ST.  NOTE(review): this chunk of the file is missing many
// lines (class header, braces, guards); comments annotate only what is
// visible here.
76 template<typename T, typename ST>
80 RowSum( int _ksize, int _anchor ) :
// Process one row: 'src' is the input row of T, 'dst' receives ST sums.
87 virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
89 const T* S = (const T*)src;
91 int i = 0, k, ksz_cn = ksize*cn;
93 width = (width - 1)*cn;
// Unrolled path — presumably guarded by ksize == 3 (the 'if' line is not
// visible in this chunk; TODO confirm against the full file).
96 for( i = 0; i < width + cn; i++ )
98 D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2];
101 else if( ksize == 5 )
103 for( i = 0; i < width + cn; i++ )
105 D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4];
// Sliding-window path: build the initial window sum, then for each output
// add the entering sample and subtract the leaving one (O(1) per pixel).
111 for( i = 0; i < ksz_cn; i++ )
114 for( i = 0; i < width; i++ )
116 s += (ST)S[i + ksz_cn] - (ST)S[i];
// Presumably the cn == 3 path: three interleaved sliding sums, one per channel.
122 ST s0 = 0, s1 = 0, s2 = 0;
123 for( i = 0; i < ksz_cn; i += 3 )
132 for( i = 0; i < width; i += 3 )
134 s0 += (ST)S[i + ksz_cn] - (ST)S[i];
135 s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
136 s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
// Presumably the cn == 4 path: four interleaved sliding sums.
144 ST s0 = 0, s1 = 0, s2 = 0, s3 = 0;
145 for( i = 0; i < ksz_cn; i += 4 )
156 for( i = 0; i < width; i += 4 )
158 s0 += (ST)S[i + ksz_cn] - (ST)S[i];
159 s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
160 s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
161 s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3];
// Fallback for arbitrary channel count: one independent sliding sum per
// channel, advancing S/D by one element per channel iteration.
169 for( k = 0; k < cn; k++, S++, D++ )
172 for( i = 0; i < ksz_cn; i += cn )
175 for( i = 0; i < width; i += cn )
177 s += (ST)S[i + ksz_cn] - (ST)S[i];
// ColumnSum<ST, T>: generic vertical running-sum column filter.  Keeps a
// per-column accumulator vector ('sum'/SUM) over a window of 'ksize' rows;
// each output row adds the newest source row (Sp) and subtracts the row
// leaving the window (Sm).  NOTE(review): lines are missing from this
// chunk; comments describe only the visible code.
185 template<typename ST, typename T>
187 public BaseColumnFilter
189 ColumnSum( int _ksize, int _anchor, double _scale ) :
// Reset the accumulation state so the filter can be reused on a new image.
198 virtual void reset() { sumCount = 0; }
200 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
204 bool haveScale = scale != 1;
205 double _scale = scale;
// (Re)allocate the accumulator when the row width changes.
207 if( width != (int)sum.size() )
216 memset((void*)SUM, 0, width*sizeof(ST));
// Warm-up: accumulate the first ksize-1 rows before any output is produced.
218 for( ; sumCount < ksize - 1; sumCount++, src++ )
220 const ST* Sp = (const ST*)src[0];
222 for( i = 0; i < width; i++ )
228 CV_Assert( sumCount == ksize-1 );
// Steady state: one output row per iteration.
232 for( ; count--; src++ )
234 const ST* Sp = (const ST*)src[0];
235 const ST* Sm = (const ST*)src[1-ksize];
// Scaled path (normalized box filter), manually unrolled by 2.
239 for( i = 0; i <= width - 2; i += 2 )
241 ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
242 D[i] = saturate_cast<T>(s0*_scale);
243 D[i+1] = saturate_cast<T>(s1*_scale);
244 s0 -= Sm[i]; s1 -= Sm[i+1];
245 SUM[i] = s0; SUM[i+1] = s1;
248 for( ; i < width; i++ )
250 ST s0 = SUM[i] + Sp[i];
251 D[i] = saturate_cast<T>(s0*_scale);
// Unscaled path (plain sum), same structure without the multiply.
257 for( i = 0; i <= width - 2; i += 2 )
259 ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
260 D[i] = saturate_cast<T>(s0);
261 D[i+1] = saturate_cast<T>(s1);
262 s0 -= Sm[i]; s1 -= Sm[i+1];
263 SUM[i] = s0; SUM[i+1] = s1;
266 for( ; i < width; i++ )
268 ST s0 = SUM[i] + Sp[i];
269 D[i] = saturate_cast<T>(s0);
// Specialization for int accumulators producing uchar output (the common
// 8-bit boxFilter case), with SSE2 and NEON vectorized inner loops.
// NOTE(review): this chunk is missing lines (the #if guards around the
// SIMD branches are not visible); comments describe only what is shown.
284 struct ColumnSum<int, uchar> :
285 public BaseColumnFilter
287 ColumnSum( int _ksize, int _anchor, double _scale ) :
296 virtual void reset() { sumCount = 0; }
298 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
301 bool haveScale = scale != 1;
302 double _scale = scale;
// Runtime CPU dispatch for the SIMD paths below.
305 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
307 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
310 if( width != (int)sum.size() )
319 memset((void*)SUM, 0, width*sizeof(int));
// Warm-up: vector-accumulate the first ksize-1 rows into SUM.
320 for( ; sumCount < ksize - 1; sumCount++, src++ )
322 const int* Sp = (const int*)src[0];
327 for( ; i <= width-4; i+=4 )
329 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
330 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
331 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
337 for( ; i <= width - 4; i+=4 )
338 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
341 for( ; i < width; i++ )
347 CV_Assert( sumCount == ksize-1 );
351 for( ; count--; src++ )
353 const int* Sp = (const int*)src[0];
354 const int* Sm = (const int*)src[1-ksize];
355 uchar* D = (uchar*)dst;
// Scaled SSE2 path: sum -> float multiply by scale -> round -> pack
// 32-bit results down to 8 saturated uchar values per iteration.
362 const __m128 scale4 = _mm_set1_ps((float)_scale);
363 for( ; i <= width-8; i+=8 )
365 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
366 __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
368 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
369 _mm_loadu_si128((const __m128i*)(Sp+i)));
370 __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
371 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
373 __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
374 __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
376 _s0T = _mm_packs_epi32(_s0T, _s0T1);
378 _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
// Update the running sums: subtract the row leaving the window.
380 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
381 _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
// Scaled NEON path, equivalent to the SSE2 loop above.
387 float32x4_t v_scale = vdupq_n_f32((float)_scale);
388 for( ; i <= width-8; i+=8 )
390 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
391 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
393 uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
394 uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
396 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
397 vst1_u8(D + i, vqmovn_u16(v_dst));
399 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
400 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
// Scalar tail for the remaining (width % 8) columns.
404 for( ; i < width; i++ )
406 int s0 = SUM[i] + Sp[i];
407 D[i] = saturate_cast<uchar>(s0*_scale);
// Unscaled SSE2 path: pack the raw integer sums straight to uchar.
417 for( ; i <= width-8; i+=8 )
419 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
420 __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
422 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
423 _mm_loadu_si128((const __m128i*)(Sp+i)));
424 __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
425 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
427 __m128i _s0T = _mm_packs_epi32(_s0, _s01);
429 _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
431 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
432 _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
// Unscaled NEON path.
438 for( ; i <= width-8; i+=8 )
440 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
441 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
443 uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
444 vst1_u8(D + i, vqmovn_u16(v_dst));
446 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
447 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
452 for( ; i < width; i++ )
454 int s0 = SUM[i] + Sp[i];
455 D[i] = saturate_cast<uchar>(s0);
// Per-column running sums, one int per column.
465 std::vector<int> sum;
// Specialization with 16-bit (ushort) accumulators producing uchar output.
// Instead of a float multiply it normalizes with a fixed-point trick:
// divScale = floor(2^16 / d) and a delta, so (s + dd)*ds >> 16 ~= s/d
// (see the constructor math on the visible lines below).
// NOTE(review): lines are missing from this chunk; only visible code is
// annotated.
470 struct ColumnSum<ushort, uchar> :
471 public BaseColumnFilter
473 ColumnSum( int _ksize, int _anchor, double _scale ) :
// Derive the fixed-point reciprocal of the divisor d = round(1/scale).
484 int d = cvRound(1./scale);
485 double scalef = ((double)(1 << 16))/d;
486 divScale = cvFloor(scalef);
496 virtual void reset() { sumCount = 0; }
498 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
500 const int ds = divScale;
501 const int dd = divDelta;
503 const bool haveScale = scale != 1;
506 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
508 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
511 if( width != (int)sum.size() )
520 memset((void*)SUM, 0, width*sizeof(SUM[0]));
// Warm-up: accumulate the first ksize-1 rows (16-bit adds).
521 for( ; sumCount < ksize - 1; sumCount++, src++ )
523 const ushort* Sp = (const ushort*)src[0];
528 for( ; i <= width-8; i+=8 )
530 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
531 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
532 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi16(_sum, _sp));
538 for( ; i <= width - 8; i+=8 )
539 vst1q_u16(SUM + i, vaddq_u16(vld1q_u16(SUM + i), vld1q_u16(Sp + i)));
542 for( ; i < width; i++ )
548 CV_Assert( sumCount == ksize-1 );
552 for( ; count--; src++ )
554 const ushort* Sp = (const ushort*)src[0];
555 const ushort* Sm = (const ushort*)src[1-ksize];
556 uchar* D = (uchar*)dst;
// SSE2 path: _mm_mulhi_epu16(s + dd, ds) computes ((s+dd)*ds) >> 16,
// i.e. the fixed-point division by d, 16 output pixels per iteration.
563 __m128i ds8 = _mm_set1_epi16((short)ds);
564 __m128i dd8 = _mm_set1_epi16((short)dd);
566 for( ; i <= width-16; i+=16 )
568 __m128i _sm0 = _mm_loadu_si128((const __m128i*)(Sm+i));
569 __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+8));
571 __m128i _s0 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)),
572 _mm_loadu_si128((const __m128i*)(Sp+i)));
573 __m128i _s1 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)),
574 _mm_loadu_si128((const __m128i*)(Sp+i+8)));
575 __m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8);
576 __m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8);
577 _s0 = _mm_sub_epi16(_s0, _sm0);
578 _s1 = _mm_sub_epi16(_s1, _sm1);
579 _mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3));
580 _mm_storeu_si128((__m128i*)(SUM+i), _s0);
581 _mm_storeu_si128((__m128i*)(SUM+i+8), _s1);
// Scalar tail: same fixed-point divide as the SIMD loop.
585 for( ; i < width; i++ )
587 int s0 = SUM[i] + Sp[i];
588 D[i] = (uchar)((s0 + dd)*ds >> 16);
589 SUM[i] = (ushort)(s0 - Sm[i]);
// Unscaled path: output the raw (saturated) sums.
595 for( ; i < width; i++ )
597 int s0 = SUM[i] + Sp[i];
598 D[i] = saturate_cast<uchar>(s0);
599 SUM[i] = (ushort)(s0 - Sm[i]);
// 16-bit per-column accumulators (limits ksize*255 to fit in ushort).
610 std::vector<ushort> sum;
// Specialization for int accumulators producing short output, with SSE2
// and NEON vectorized loops.  Structure mirrors ColumnSum<int, uchar>
// but packs to signed 16-bit instead of unsigned 8-bit.
// NOTE(review): lines are missing from this chunk; only visible code is
// annotated.
615 struct ColumnSum<int, short> :
616 public BaseColumnFilter
618 ColumnSum( int _ksize, int _anchor, double _scale ) :
627 virtual void reset() { sumCount = 0; }
629 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
633 bool haveScale = scale != 1;
634 double _scale = scale;
637 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
639 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
642 if( width != (int)sum.size() )
651 memset((void*)SUM, 0, width*sizeof(int));
// Warm-up accumulation of the first ksize-1 rows.
652 for( ; sumCount < ksize - 1; sumCount++, src++ )
654 const int* Sp = (const int*)src[0];
659 for( ; i <= width-4; i+=4 )
661 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
662 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
663 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
669 for( ; i <= width - 4; i+=4 )
670 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
673 for( ; i < width; i++ )
679 CV_Assert( sumCount == ksize-1 );
683 for( ; count--; src++ )
685 const int* Sp = (const int*)src[0];
686 const int* Sm = (const int*)src[1-ksize];
687 short* D = (short*)dst;
// Scaled SSE2 path: scale in float, round, pack with signed saturation.
694 const __m128 scale4 = _mm_set1_ps((float)_scale);
695 for( ; i <= width-8; i+=8 )
697 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
698 __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
700 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
701 _mm_loadu_si128((const __m128i*)(Sp+i)));
702 __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
703 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
705 __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
706 __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
708 _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
710 _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
711 _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
// Scaled NEON path, equivalent to the SSE2 loop above.
717 float32x4_t v_scale = vdupq_n_f32((float)_scale);
718 for( ; i <= width-8; i+=8 )
720 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
721 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
723 int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
724 int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
725 vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
727 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
728 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
// Scalar tail for remaining columns.
732 for( ; i < width; i++ )
734 int s0 = SUM[i] + Sp[i];
735 D[i] = saturate_cast<short>(s0*_scale);
// Unscaled paths: pack the raw sums directly.
745 for( ; i <= width-8; i+=8 )
748 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
749 __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
751 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
752 _mm_loadu_si128((const __m128i*)(Sp+i)));
753 __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
754 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
756 _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
758 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
759 _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
765 for( ; i <= width-8; i+=8 )
767 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
768 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
770 vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
772 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
773 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
778 for( ; i < width; i++ )
780 int s0 = SUM[i] + Sp[i];
781 D[i] = saturate_cast<short>(s0);
791 std::vector<int> sum;
// Specialization for int accumulators producing ushort output.  SSE2 has
// no unsigned 16-bit saturating pack, so the scalar path's saturate_cast
// is emulated by biasing with 0x8000 before the signed pack and adding
// 0x8000 back afterwards (delta0/delta1 below).
// NOTE(review): lines are missing from this chunk; only visible code is
// annotated.
796 struct ColumnSum<int, ushort> :
797 public BaseColumnFilter
799 ColumnSum( int _ksize, int _anchor, double _scale ) :
808 virtual void reset() { sumCount = 0; }
810 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
813 bool haveScale = scale != 1;
814 double _scale = scale;
817 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
819 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
822 if( width != (int)sum.size() )
831 memset((void*)SUM, 0, width*sizeof(int));
// Warm-up accumulation of the first ksize-1 rows.
832 for( ; sumCount < ksize - 1; sumCount++, src++ )
834 const int* Sp = (const int*)src[0];
839 for( ; i <= width-4; i+=4 )
841 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
842 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
843 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
849 for( ; i <= width - 4; i+=4 )
850 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
853 for( ; i < width; i++ )
859 CV_Assert( sumCount == ksize-1 );
863 for( ; count--; src++ )
865 const int* Sp = (const int*)src[0];
866 const int* Sm = (const int*)src[1-ksize];
867 ushort* D = (ushort*)dst;
// Scaled SSE2 path (4 pixels/iter).  NOTE(review): the loop bound is
// 'i < width-4' while the sibling loops use 'i <= width-8'; looks like
// the last full vector may be left to the scalar tail — confirm intent
// against the full file.
874 const __m128 scale4 = _mm_set1_ps((float)_scale);
875 const __m128i delta0 = _mm_set1_epi32(0x8000);
876 const __m128i delta1 = _mm_set1_epi32(0x80008000);
878 for( ; i < width-4; i+=4)
880 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
881 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
882 _mm_loadu_si128((const __m128i*)(Sp+i)));
884 __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
// Bias, pack signed, un-bias: emulates unsigned 16-bit saturation.
886 _res = _mm_sub_epi32(_res, delta0);
887 _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
889 _mm_storel_epi64((__m128i*)(D+i), _res);
890 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Scaled NEON path: vqmovn on unsigned rounded values (8 pixels/iter).
896 float32x4_t v_scale = vdupq_n_f32((float)_scale);
897 for( ; i <= width-8; i+=8 )
899 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
900 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
902 uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
903 uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
904 vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
906 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
907 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
911 for( ; i < width; i++ )
913 int s0 = SUM[i] + Sp[i];
914 D[i] = saturate_cast<ushort>(s0*_scale);
// Unscaled paths: same bias/pack trick without the scale multiply.
924 const __m128i delta0 = _mm_set1_epi32(0x8000);
925 const __m128i delta1 = _mm_set1_epi32(0x80008000);
927 for( ; i < width-4; i+=4 )
929 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
930 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
931 _mm_loadu_si128((const __m128i*)(Sp+i)));
933 __m128i _res = _mm_sub_epi32(_s0, delta0);
934 _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
936 _mm_storel_epi64((__m128i*)(D+i), _res);
937 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
943 for( ; i <= width-8; i+=8 )
945 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
946 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
948 vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
950 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
951 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
956 for( ; i < width; i++ )
958 int s0 = SUM[i] + Sp[i];
959 D[i] = saturate_cast<ushort>(s0);
969 std::vector<int> sum;
// Specialization for int accumulators producing int output — no packing
// or saturation needed, so the SIMD loops store the 32-bit results
// directly.  NOTE(review): lines are missing from this chunk; only
// visible code is annotated.
973 struct ColumnSum<int, int> :
974 public BaseColumnFilter
976 ColumnSum( int _ksize, int _anchor, double _scale ) :
985 virtual void reset() { sumCount = 0; }
987 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
990 bool haveScale = scale != 1;
991 double _scale = scale;
994 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
996 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
999 if( width != (int)sum.size() )
1008 memset((void*)SUM, 0, width*sizeof(int));
// Warm-up accumulation of the first ksize-1 rows.
1009 for( ; sumCount < ksize - 1; sumCount++, src++ )
1011 const int* Sp = (const int*)src[0];
1016 for( ; i <= width-4; i+=4 )
1018 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
1019 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
1020 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
1026 for( ; i <= width - 4; i+=4 )
1027 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
1030 for( ; i < width; i++ )
1036 CV_Assert( sumCount == ksize-1 );
1040 for( ; count--; src++ )
1042 const int* Sp = (const int*)src[0];
1043 const int* Sm = (const int*)src[1-ksize];
// Scaled SSE2 path: float multiply + round, stored as int.
1051 const __m128 scale4 = _mm_set1_ps((float)_scale);
1052 for( ; i <= width-4; i+=4 )
1054 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
1056 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1057 _mm_loadu_si128((const __m128i*)(Sp+i)));
1059 __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
1061 _mm_storeu_si128((__m128i*)(D+i), _s0T);
1062 _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
// Scaled NEON path.
1068 float32x4_t v_scale = vdupq_n_f32((float)_scale);
1069 for( ; i <= width-4; i+=4 )
1071 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1073 int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
1074 vst1q_s32(D + i, v_s0d);
1076 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
// Scalar tail.
1080 for( ; i < width; i++ )
1082 int s0 = SUM[i] + Sp[i];
1083 D[i] = saturate_cast<int>(s0*_scale);
1084 SUM[i] = s0 - Sm[i];
// Unscaled paths: store the raw sums.
1093 for( ; i <= width-4; i+=4 )
1095 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
1096 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1097 _mm_loadu_si128((const __m128i*)(Sp+i)));
1099 _mm_storeu_si128((__m128i*)(D+i), _s0);
1100 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
1106 for( ; i <= width-4; i+=4 )
1108 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1110 vst1q_s32(D + i, v_s0);
1111 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
1116 for( ; i < width; i++ )
1118 int s0 = SUM[i] + Sp[i];
1120 SUM[i] = s0 - Sm[i];
1129 std::vector<int> sum;
// Specialization for int accumulators producing float output: sums are
// converted to float (optionally multiplied by the normalization scale)
// with no saturation.  NOTE(review): lines are missing from this chunk;
// only visible code is annotated.
1134 struct ColumnSum<int, float> :
1135 public BaseColumnFilter
1137 ColumnSum( int _ksize, int _anchor, double _scale ) :
1146 virtual void reset() { sumCount = 0; }
1148 virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
1151 bool haveScale = scale != 1;
1152 double _scale = scale;
1155 bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1157 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
1160 if( width != (int)sum.size() )
1169 memset((void*)SUM, 0, width*sizeof(int));
// Warm-up accumulation of the first ksize-1 rows.
1170 for( ; sumCount < ksize - 1; sumCount++, src++ )
1172 const int* Sp = (const int*)src[0];
1177 for( ; i <= width-4; i+=4 )
1179 __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
1180 __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
1181 _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
1187 for( ; i <= width - 4; i+=4 )
1188 vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
1192 for( ; i < width; i++ )
1198 CV_Assert( sumCount == ksize-1 );
1202 for( ; count--; src++ )
1204 const int * Sp = (const int*)src[0];
1205 const int * Sm = (const int*)src[1-ksize];
1206 float* D = (float*)dst;
// Scaled SSE2 path.  NOTE(review): loop bound 'i < width-4' differs from
// the '<=' convention used elsewhere — last vector goes to the scalar
// tail; confirm against the full file.
1214 const __m128 scale4 = _mm_set1_ps((float)_scale);
1216 for( ; i < width-4; i+=4)
1218 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
1219 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1220 _mm_loadu_si128((const __m128i*)(Sp+i)));
1222 _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
1223 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
// Scaled NEON path (8 pixels/iter).
1229 float32x4_t v_scale = vdupq_n_f32((float)_scale);
1230 for( ; i <= width-8; i+=8 )
1232 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1233 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
1235 vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
1236 vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
1238 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
1239 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
// Scalar tail.
1244 for( ; i < width; i++ )
1246 int s0 = SUM[i] + Sp[i];
1247 D[i] = (float)(s0*_scale);
1248 SUM[i] = s0 - Sm[i];
// Unscaled paths: plain int -> float conversion.
1258 for( ; i < width-4; i+=4)
1260 __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
1261 __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1262 _mm_loadu_si128((const __m128i*)(Sp+i)));
1264 _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
1265 _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
1271 for( ; i <= width-8; i+=8 )
1273 int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1274 int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
1276 vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
1277 vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
1279 vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
1280 vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
1285 for( ; i < width; i++ )
1287 int s0 = SUM[i] + Sp[i];
1289 SUM[i] = s0 - Sm[i];
1298 std::vector<int> sum;
// OpenCL fast path for a 3x3 box filter on CV_8UC1 images.  Returns false
// (falling back to the generic path) unless a narrow set of conditions
// holds: Intel device, 8UC1, zero offset, 4-aligned step, cols%16==0,
// rows%2==0, centered anchor.  Each work item handles a 16x2 pixel tile.
// NOTE(review): lines are missing from this chunk; only visible code is
// annotated.
1303 static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth,
1304 Size ksize, Point anchor, int borderType, bool normalize )
1306 const ocl::Device & dev = ocl::Device::getDefault();
1307 int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// Default the anchor to the kernel center (guards presumably elided here).
1313 anchor.x = ksize.width / 2;
1315 anchor.y = ksize.height / 2;
// Bail out unless every fast-path precondition is met.
1317 if ( !(dev.isIntel() && (type == CV_8UC1) &&
1318 (_src.offset() == 0) && (_src.step() % 4 == 0) &&
1319 (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) &&
1320 (anchor.x == 1) && (anchor.y == 1) &&
1321 (ksize.width == 3) && (ksize.height == 3)) )
1324 float alpha = 1.0f / (ksize.height * ksize.width);
1325 Size size = _src.size();
1326 size_t globalsize[2] = { 0, 0 };
1327 size_t localsize[2] = { 0, 0 };
// NOTE(review): entry 3 is 0 (BORDER_WRAP unsupported); indexing with
// that borderType would pass a null format argument — presumably rejected
// earlier, but the guard is not visible in this chunk.
1328 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
1330 globalsize[0] = size.width / 16;
1331 globalsize[1] = size.height / 2;
// NOTE(review): sprintf into a fixed buffer — safe only while the format
// stays short; snprintf would be more defensive.
1333 char build_opts[1024];
1334 sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
1336 ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts);
1340 UMat src = _src.getUMat();
1341 _dst.create(size, CV_MAKETYPE(ddepth, cn));
1342 if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
1344 UMat dst = _dst.getUMat();
// Bind kernel arguments in the order the .cl source expects.
1346 int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
1347 idxArg = kernel.set(idxArg, (int)src.step);
1348 idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
1349 idxArg = kernel.set(idxArg, (int)dst.step);
1350 idxArg = kernel.set(idxArg, (int)dst.rows);
1351 idxArg = kernel.set(idxArg, (int)dst.cols);
1353 idxArg = kernel.set(idxArg, (float)alpha);
1355 return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
// DIVUP(total, grain): ceiling integer division — the number of
// 'grain'-sized chunks needed to cover 'total'.  Arguments are fully
// parenthesized so that expression arguments (e.g. ternaries or
// additions) expand correctly; the previous form left 'total' and the
// first 'grain' unparenthesized.
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
// ROUNDUP(sz, n): round 'sz' up to the nearest multiple of 'n'.
#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
// Generic OpenCL box filter (also used for the squared variant via 'sqr').
// Chooses between a small-kernel "filterSmall" kernel (Intel non-CPU
// devices, small apertures) and the generic tiled "boxFilter" kernel,
// retrying with smaller block sizes until the workgroup fits the device.
// Returns false to fall back to the CPU path.  NOTE(review): lines are
// missing from this chunk; only visible code is annotated.
1361 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
1362 Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
1364 const ocl::Device & dev = ocl::Device::getDefault();
1365 int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
1366 bool doubleSupport = dev.doubleFPConfig() > 0;
// Reject configurations the kernels cannot handle (>4 channels, doubles
// without device support, misaligned source).
1371 if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
1372 _src.offset() % esz != 0 || _src.step() % esz != 0)
// Default the anchor to the kernel center (guards presumably elided here).
1376 anchor.x = ksize.width / 2;
1378 anchor.y = ksize.height / 2;
1380 int computeUnits = ocl::Device::getDefault().maxComputeUnits();
1381 float alpha = 1.0f / (ksize.height * ksize.width);
1382 Size size = _src.size(), wholeSize;
1383 bool isolated = (borderType & BORDER_ISOLATED) != 0;
1384 borderType &= ~BORDER_ISOLATED;
// Work type: at least float so normalization keeps precision.
1385 int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
1386 wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
1388 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
1389 size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
1390 size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
1392 UMat src = _src.getUMat();
1396 src.locateROI(wholeSize, ofs);
// Effective image extent: the ROI only when BORDER_ISOLATED is set.
1399 int h = isolated ? size.height : wholeSize.height;
1400 int w = isolated ? size.width : wholeSize.width;
1402 size_t maxWorkItemSizes[32];
1403 ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
1404 int tryWorkItems = (int)maxWorkItemSizes[0];
// Small-kernel fast path: Intel GPUs with apertures < 5x5 (or exactly
// 5x5 single-channel).
1408 if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
1409 ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
1410 (ksize.width == 5 && ksize.height == 5 && cn == 1)))
1412 if (w < ksize.width || h < ksize.height)
1415 // Figure out what vector size to use for loading the pixels.
1416 int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
1417 int pxLoadVecSize = cn * pxLoadNumPixels;
1419 // Figure out how many pixels per work item to compute in X and Y
1420 // directions. Too many and we run out of registers.
1421 int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
1422 if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
1424 pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
1425 pxPerWorkItemY = size.height % 2 ? 1 : 2;
1427 else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
1429 pxPerWorkItemX = size.width % 2 ? 1 : 2;
1430 pxPerWorkItemY = size.height % 2 ? 1 : 2;
1432 globalsize[0] = size.width / pxPerWorkItemX;
1433 globalsize[1] = size.height / pxPerWorkItemY;
1435 // Need some padding in the private array for pixels
1436 int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
1438 // Make the global size a nice round number so the runtime can pick
1439 // from reasonable choices for the workgroup size
1440 const int wgRound = 256;
1441 globalsize[0] = ROUNDUP(globalsize[0], wgRound);
// Build-option string parameterizes the .cl kernel source.
// NOTE(review): sprintf into a fixed 1024-byte buffer; safe only while
// the type-name substitutions stay short.
1443 char build_options[1024], cvt[2][40];
1444 sprintf(build_options, "-D cn=%d "
1445 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
1446 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
1447 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
1448 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
1449 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
1450 "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
1451 cn, anchor.x, anchor.y, ksize.width, ksize.height,
1452 pxLoadVecSize, pxLoadNumPixels,
1453 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
1454 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
1455 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
1456 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
1457 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
1458 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
1459 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
1460 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
1461 ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
1465 if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
// Generic path: tune block sizes to the device, retrying until the
// kernel's required workgroup size fits.
1470 localsize = localsize_general;
1473 int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
1475 while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
1477 while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
1480 if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
1484 String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
1485 " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
1486 " -D ST1=%s -D DT1=%s -D cn=%d",
1487 BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
1488 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
1489 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
1490 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
1491 anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
1492 isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1493 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
1494 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
1496 localsize[0] = BLOCK_SIZE_X;
1497 globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
1498 globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);
1500 kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
// Shrink and retry if the chosen local size exceeds what the compiled
// kernel supports.
1504 size_t kernelWorkGroupSize = kernel.workGroupSize();
1505 if (localsize[0] <= kernelWorkGroupSize)
1507 if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
1510 tryWorkItems = (int)kernelWorkGroupSize;
1514 _dst.create(size, CV_MAKETYPE(ddepth, cn));
1515 UMat dst = _dst.getUMat();
// Bind kernel arguments: source buffer, geometry, destination, alpha.
1517 int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
1518 idxArg = kernel.set(idxArg, (int)src.step);
1519 int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
1520 int srcOffsetY = (int)(src.offset / src.step);
1521 int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
1522 int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
1523 idxArg = kernel.set(idxArg, srcOffsetX);
1524 idxArg = kernel.set(idxArg, srcOffsetY);
1525 idxArg = kernel.set(idxArg, srcEndX);
1526 idxArg = kernel.set(idxArg, srcEndY);
1527 idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
1529 idxArg = kernel.set(idxArg, (float)alpha);
1531 return kernel.run(2, globalsize, localsize, false);
1541 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1543 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1544 CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
1549 if( sdepth == CV_8U && ddepth == CV_32S )
1550 return makePtr<RowSum<uchar, int> >(ksize, anchor);
1551 if( sdepth == CV_8U && ddepth == CV_16U )
1552 return makePtr<RowSum<uchar, ushort> >(ksize, anchor);
1553 if( sdepth == CV_8U && ddepth == CV_64F )
1554 return makePtr<RowSum<uchar, double> >(ksize, anchor);
1555 if( sdepth == CV_16U && ddepth == CV_32S )
1556 return makePtr<RowSum<ushort, int> >(ksize, anchor);
1557 if( sdepth == CV_16U && ddepth == CV_64F )
1558 return makePtr<RowSum<ushort, double> >(ksize, anchor);
1559 if( sdepth == CV_16S && ddepth == CV_32S )
1560 return makePtr<RowSum<short, int> >(ksize, anchor);
1561 if( sdepth == CV_32S && ddepth == CV_32S )
1562 return makePtr<RowSum<int, int> >(ksize, anchor);
1563 if( sdepth == CV_16S && ddepth == CV_64F )
1564 return makePtr<RowSum<short, double> >(ksize, anchor);
1565 if( sdepth == CV_32F && ddepth == CV_64F )
1566 return makePtr<RowSum<float, double> >(ksize, anchor);
1567 if( sdepth == CV_64F && ddepth == CV_64F )
1568 return makePtr<RowSum<double, double> >(ksize, anchor);
1570 CV_Error_( CV_StsNotImplemented,
1571 ("Unsupported combination of source format (=%d), and buffer format (=%d)",
1574 return Ptr<BaseRowFilter>();
// Factory for the vertical (column) moving-sum filter, the second half of the
// separable box filter. `scale` is applied to the accumulated sums (callers
// pass 1/(kw*kh) for a normalized box filter, 1 otherwise).
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1578 cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
1579 int anchor, double scale)
1581 int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
// Accumulator and destination must have the same channel count.
1582 CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
// Type dispatch: (destination depth, sum depth) -> ColumnSum<sumT, dstT>.
1587 if( ddepth == CV_8U && sdepth == CV_32S )
1588 return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
1589 if( ddepth == CV_8U && sdepth == CV_16U )
1590 return makePtr<ColumnSum<ushort, uchar> >(ksize, anchor, scale);
1591 if( ddepth == CV_8U && sdepth == CV_64F )
1592 return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
1593 if( ddepth == CV_16U && sdepth == CV_32S )
1594 return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
1595 if( ddepth == CV_16U && sdepth == CV_64F )
1596 return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
1597 if( ddepth == CV_16S && sdepth == CV_32S )
1598 return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
1599 if( ddepth == CV_16S && sdepth == CV_64F )
1600 return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
1601 if( ddepth == CV_32S && sdepth == CV_32S )
1602 return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
1603 if( ddepth == CV_32F && sdepth == CV_32S )
1604 return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
1605 if( ddepth == CV_32F && sdepth == CV_64F )
1606 return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
1607 if( ddepth == CV_64F && sdepth == CV_32S )
1608 return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
1609 if( ddepth == CV_64F && sdepth == CV_64F )
1610 return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
// No matching instantiation: report the unsupported depth combination.
1612 CV_Error_( CV_StsNotImplemented,
1613 ("Unsupported combination of sum format (=%d), and destination format (=%d)",
// Unreachable after CV_Error_, kept to satisfy the compiler's return-path check.
1616 return Ptr<BaseColumnFilter>();
// Builds a complete separable box-filter engine: a row-sum filter followed by
// a column-sum filter (with normalization folded into the column scale).
// The accumulator (sum) depth is narrowed from the CV_64F default when a
// smaller type provably cannot overflow for the given kernel area.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1620 cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
1621 Point anchor, bool normalize, int borderType )
1623 int sdepth = CV_MAT_DEPTH(srcType);
1624 int cn = CV_MAT_CN(srcType), sumType = CV_64F;
// 8U->8U with kernel area <= 256: sums fit a narrower accumulator
// (the chosen depth is on an elided line — not visible in this listing).
1625 if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U &&
1626 ksize.width*ksize.height <= 256 )
// Integer inputs: use an integer accumulator while the worst-case sum
// (max value * kernel area) still fits; otherwise keep CV_64F.
1628 else if( sdepth <= CV_32S && (!normalize ||
1629 ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
1630 sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
1632 sumType = CV_MAKETYPE( sumType, cn );
1634 Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
// Normalization (divide by kernel area) is applied in the column pass.
1635 Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
1636 dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
// No 2D filter component: the box filter is fully separable.
1638 return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
1639 srcType, dstType, sumType, borderType );
// OpenVX-accelerated box filter. Returns false when the request does not fit
// the OpenVX path (so the caller falls through to the generic implementation);
// throws via CV_Error on internal OpenVX failures.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1645 static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth,
1646 Size ksize, Point anchor,
1647 bool normalize, int borderType)
1649 int stype = _src.type();
// Restrict to 8UC1 input, 8U/16S output, centered (or default) anchor,
// and odd kernel sizes of at least 3x3 — the shapes OpenVX handles here.
1652 if (stype != CV_8UC1 || (ddepth != CV_8U && ddepth != CV_16S) ||
1653 (anchor.x >= 0 && anchor.x != ksize.width / 2) ||
1654 (anchor.y >= 0 && anchor.y != ksize.height / 2) ||
1655 ksize.width % 2 != 1 || ksize.height % 2 != 1 ||
1656 ksize.width < 3 || ksize.height < 3)
1659 Mat src = _src.getMat();
1660 _dst.create(src.size(), CV_MAKETYPE(ddepth, 1));
1661 Mat dst = _dst.getMat();
// Image must be at least as large as the kernel.
1663 if (src.cols < ksize.width || src.rows < ksize.height)
1666 if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
1667 return false; //Process isolated borders only
// Translate the OpenCV border mode into the OpenVX border descriptor;
// only CONSTANT(0) and REPLICATE are supported on this path.
1669 switch (borderType & ~BORDER_ISOLATED)
1671 case BORDER_CONSTANT:
1672 border.mode = VX_BORDER_CONSTANT;
1673 #if VX_VERSION > VX_VERSION_1_0
1674 border.constant_value.U8 = (vx_uint8)(0);
1676 border.constant_value = (vx_uint32)(0);
1679 case BORDER_REPLICATE:
1680 border.mode = VX_BORDER_REPLICATE;
1688 ivx::Context ctx = ivx::Context::create();
// Bail out if the kernel exceeds the device's convolution limits.
1689 if ((vx_size)(ksize.width) > ctx.convolutionMaxDimension() || (vx_size)(ksize.height) > ctx.convolutionMaxDimension())
1693 if (dst.data != src.data)
// Wrap the existing cv::Mat buffers as OpenVX images (no copy).
1699 ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
1700 ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
1701 ib = ivx::Image::createFromHandle(ctx, ddepth == CV_16S ? VX_DF_IMAGE_S16 : VX_DF_IMAGE_U8,
1702 ivx::Image::createAddressing(dst.cols, dst.rows, ddepth == CV_16S ? 2 : 1, (vx_int32)(dst.step)), dst.data);
1704 //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
1705 //since the OpenVX standard says nothing about thread-safety for now
1706 vx_border_t prevBorder = ctx.borderMode();
1707 ctx.setBorderMode(border);
// Fast path: the dedicated 3x3 normalized box kernel.
1708 if (ddepth == CV_8U && ksize.width == 3 && ksize.height == 3 && normalize)
1710 ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib));
// General path: emulate the box filter with a uniform vxuConvolve kernel.
1714 #if VX_VERSION <= VX_VERSION_1_0
1715 if (ctx.vendorID() == VX_ID_KHRONOS && ((vx_size)(src.cols) <= ctx.convolutionMaxDimension() || (vx_size)(src.rows) <= ctx.convolutionMaxDimension()))
// Restore the previous border mode before falling back.
1717 ctx.setBorderMode(prevBorder);
// Uniform 16-bit kernel; with scale 1<<15 the convolution divides by the
// kernel area when normalize is requested.
1721 Mat convData(ksize, CV_16SC1);
1722 convData = normalize ? (1 << 15) / (ksize.width * ksize.height) : 1;
1723 ivx::Convolution cnv = ivx::Convolution::create(ctx, convData.cols, convData.rows);
1724 cnv.copyFrom(convData);
1726 cnv.setScale(1 << 15);
1727 ivx::IVX_CHECK_STATUS(vxuConvolve(ctx, ia, cnv, ib));
1729 ctx.setBorderMode(prevBorder);
// Convert OpenVX wrapper exceptions into OpenCV errors.
1731 catch (ivx::RuntimeError & e)
1733 CV_Error(CV_StsInternal, e.what());
1736 catch (ivx::WrapperError & e)
1738 CV_Error(CV_StsInternal, e.what());
1747 // TODO: IPP performance regression
1748 #if defined(HAVE_IPP) && IPP_DISABLE_BLOCK
// Intel IPP-accelerated box filter. Returns true when IPP handled the call
// (the IPP_FILTER_BOX_BORDER macro invokes ippiFilterBoxBorder_* per type).
// Currently compiled out via IPP_DISABLE_BLOCK due to the regression above.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1751 static bool ipp_boxfilter( InputArray _src, OutputArray _dst, int ddepth,
1752 Size ksize, Point anchor,
1753 bool normalize, int borderType )
1755 CV_INSTRUMENT_REGION_IPP()
1757 int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
// Compare OpenCV's effective anchor with the anchor IPP uses; the IPP path
// is valid only when they coincide (checked below).
1760 int ippBorderType = borderType & ~BORDER_ISOLATED;
1761 Point ocvAnchor, ippAnchor;
1762 ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
1763 ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
1764 ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
1765 ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
1767 Mat src = _src.getMat();
1768 _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
1769 Mat dst = _dst.getMat();
1770 if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
// Applicability gate: normalized filter, non-submatrix source, same src/dst
// depth, constant border, matching anchors, and dst larger than the mask
// (smaller sizes trip ippStsMaskSizeErr per the comment below).
1779 if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
1780 (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
1781 ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
1782 dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
1785 IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };
// Helper macro: allocate the IPP work buffer, run the typed
// ippiFilterBoxBorder_<flavor> call, and record IPP usage on success.
// (Do not add comments inside the macro body: lines end with '\\'.)
1787 #define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
1790 if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
1792 Ipp8u * buffer = ippsMalloc_8u(bufSize); \
1793 ippType borderValue[4] = { 0, 0, 0, 0 }; \
1794 ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
1795 IppStatus status = CV_INSTRUMENT_FUN_IPP(ippiFilterBoxBorder_##flavor, src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
1796 (int)dst.step, roiSize, maskSize, \
1797 (IppiBorderType)ippBorderType, borderValue, buffer); \
1801 CV_IMPL_ADD(CV_IMPL_IPP); \
1805 } while ((void)0, 0)
// Per-type dispatch; the commented-out C1 cases were disabled for
// performance with BORDER_CONSTANT (see notes below).
1807 if (stype == CV_8UC1)
1808 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
1809 else if (stype == CV_8UC3)
1810 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
1811 else if (stype == CV_8UC4)
1812 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);
1814 // Oct 2014: performance with BORDER_CONSTANT
1815 //else if (stype == CV_16UC1)
1816 // IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
1817 else if (stype == CV_16UC3)
1818 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
1819 else if (stype == CV_16UC4)
1820 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);
1822 // Oct 2014: performance with BORDER_CONSTANT
1823 //else if (stype == CV_16SC1)
1824 // IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
1825 else if (stype == CV_16SC3)
1826 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
1827 else if (stype == CV_16SC4)
1828 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);
1830 else if (stype == CV_32FC1)
1831 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
1832 else if (stype == CV_32FC3)
1833 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
1834 else if (stype == CV_32FC4)
1835 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
1837 #undef IPP_FILTER_BOX_BORDER
// Public cv::boxFilter entry point. Tries accelerated back ends in order
// (OpenCL, OpenVX, Tegra, IPP) and falls back to the generic separable
// FilterEngine created by createBoxFilter.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1845 void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
1846 Size ksize, Point anchor,
1847 bool normalize, int borderType )
1849 CV_INSTRUMENT_REGION()
// OpenCL: specialized 3x3 8UC1 kernel first, then the generic OCL path.
1851 CV_OCL_RUN(_dst.isUMat() &&
1852 (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT ||
1853 borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101),
1854 ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
1856 CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
// OpenVX back end (returns false when it cannot handle the request).
1859 if (openvx_boxfilter(_src, _dst, ddepth, ksize, anchor, normalize, borderType))
1863 Mat src = _src.getMat();
1864 int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
1867 _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
1868 Mat dst = _dst.getMat();
1869 if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
1876 #ifdef HAVE_TEGRA_OPTIMIZATION
1877 if ( tegra::useTegra() && tegra::box(src, dst, ksize, anchor, normalize, borderType) )
// IPP path, currently disabled via IPP_DISABLE_BLOCK; the CV_IPP_RUN guard
// mirrors the applicability checks inside ipp_boxfilter.
1881 #if defined HAVE_IPP && IPP_DISABLE_BLOCK
1882 int ippBorderType = borderType & ~BORDER_ISOLATED;
1883 Point ocvAnchor, ippAnchor;
1884 ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
1885 ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
1886 ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
1887 ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
1888 CV_IPP_RUN((normalize && !_src.isSubmatrix() && ddepth == sdepth &&
1889 (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
1890 ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
1891 _dst.cols() != ksize.width && _dst.rows() != ksize.height),
1892 ipp_boxfilter( _src, _dst, ddepth, ksize, anchor, normalize, borderType));
// Generic fallback: resolve the ROI inside the parent image (unless the
// border is isolated), then run the separable filter engine.
1896 Size wsz(src.cols, src.rows);
1897 if(!(borderType&BORDER_ISOLATED))
1898 src.locateROI( wsz, ofs );
1899 borderType = (borderType&~BORDER_ISOLATED);
1901 Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
1902 ksize, anchor, normalize, borderType );
1904 f->apply( src, dst, wsz, ofs );
// cv::blur is simply a normalized box filter with the output depth equal
// to the source depth (ddepth = -1).
1908 void cv::blur( InputArray src, OutputArray dst,
1909 Size ksize, Point anchor, int borderType )
1911 CV_INSTRUMENT_REGION()
1913 boxFilter( src, dst, -1, ksize, anchor, true, borderType );
1917 /****************************************************************************************\
1919 \****************************************************************************************/
// Row filter computing a sliding sum of SQUARED pixel values — the row
// stage of sqrBoxFilter. T is the source element type, ST the accumulator.
// The operator() below maintains the running sum incrementally: add the
// square of the entering sample, subtract the square of the leaving one.
// NOTE(review): constructor body and closing brace are elided in this listing.
1924 template<typename T, typename ST>
1926 public BaseRowFilter
1928 SqrRowSum( int _ksize, int _anchor ) :
1935 virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
1937 const T* S = (const T*)src;
1939 int i = 0, k, ksz_cn = ksize*cn;
1941 width = (width - 1)*cn;
// Process each channel plane independently (interleaved layout, stride cn).
1942 for( k = 0; k < cn; k++, S++, D++ )
// Seed the window: sum of squares over the first `ksize` samples.
1945 for( i = 0; i < ksz_cn; i += cn )
// Slide the window: add the incoming squared sample, drop the outgoing one.
1951 for( i = 0; i < width; i += cn )
1953 ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
1954 s += val1*val1 - val0*val0;
// Factory for the squared-sum row filter used by sqrBoxFilter; same
// dispatch scheme as getRowSumFilter but over SqrRowSum instantiations.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1961 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1963 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1964 CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
// Type dispatch: supported (src depth, sum depth) pairs.
1969 if( sdepth == CV_8U && ddepth == CV_32S )
1970 return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
1971 if( sdepth == CV_8U && ddepth == CV_64F )
1972 return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
1973 if( sdepth == CV_16U && ddepth == CV_64F )
1974 return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
1975 if( sdepth == CV_16S && ddepth == CV_64F )
1976 return makePtr<SqrRowSum<short, double> >(ksize, anchor);
1977 if( sdepth == CV_32F && ddepth == CV_64F )
1978 return makePtr<SqrRowSum<float, double> >(ksize, anchor);
1979 if( sdepth == CV_64F && ddepth == CV_64F )
1980 return makePtr<SqrRowSum<double, double> >(ksize, anchor);
1982 CV_Error_( CV_StsNotImplemented,
1983 ("Unsupported combination of source format (=%d), and buffer format (=%d)",
// Unreachable after CV_Error_, kept to satisfy the compiler's return-path check.
1986 return Ptr<BaseRowFilter>();
// Public cv::sqrBoxFilter: box filter over SQUARED pixel values (used e.g.
// for local variance computation). Built from a SqrRowSum row filter plus
// the ordinary column-sum filter.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
1991 void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
1992 Size ksize, Point anchor,
1993 bool normalize, int borderType )
1995 CV_INSTRUMENT_REGION()
1997 int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
1998 Size size = _src.size();
// Default output depth: float for small integer inputs, double otherwise.
2001 ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
2003 if( borderType != BORDER_CONSTANT && normalize )
2005 if( size.height == 1 )
2007 if( size.width == 1 )
// OpenCL path (ocl_boxFilter with sqr=true).
2011 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
2012 ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
// Accumulate in CV_64F by default (squares grow fast); 8U inputs may use a
// narrower sum type (assigned on an elided line — not visible here).
2014 int sumDepth = CV_64F;
2015 if( sdepth == CV_8U )
2017 int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
2019 Mat src = _src.getMat();
2020 _dst.create( size, dstType );
2021 Mat dst = _dst.getMat();
2023 Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
2024 Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
2025 dstType, ksize.height, anchor.y,
2026 normalize ? 1./(ksize.width*ksize.height) : 1);
2028 Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
2029 srcType, dstType, sumType, borderType );
// Resolve the ROI position within the parent image, then run the engine.
2031 Size wsz(src.cols, src.rows);
2032 src.locateROI( wsz, ofs );
2034 f->apply( src, dst, wsz, ofs );
2038 /****************************************************************************************\
2040 \****************************************************************************************/
// Returns an n x 1 Gaussian kernel (CV_32F or CV_64F), normalized so its
// elements sum to 1. For small odd n (<= 7) with sigma <= 0, hard-coded
// fixed-point kernels are used instead of evaluating exp().
// NOTE(review): this listing ends mid-function (normalization tail and the
// final return are elided); visible code kept byte-identical.
2042 cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
2044 const int SMALL_GAUSSIAN_SIZE = 7;
// Precomputed kernels for n = 3, 5, 7 (indexed by n>>1 below).
2045 static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
2048 {0.25f, 0.5f, 0.25f},
2049 {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
2050 {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
2053 const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
2054 small_gaussian_tab[n>>1] : 0;
2056 CV_Assert( ktype == CV_32F || ktype == CV_64F );
2057 Mat kernel(n, 1, ktype);
// Both pointers alias the kernel buffer; only the one matching ktype is used.
2058 float* cf = kernel.ptr<float>();
2059 double* cd = kernel.ptr<double>();
// Non-positive sigma: derive it from the kernel size (documented OpenCV rule).
2061 double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
2062 double scale2X = -0.5/(sigmaX*sigmaX);
// Fill with exp(-x^2 / (2*sigma^2)) values (or the fixed table), centered at (n-1)/2.
2066 for( i = 0; i < n; i++ )
2068 double x = i - (n-1)*0.5;
2069 double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
2070 if( ktype == CV_32F )
// Second pass: scale by `sum` so the coefficients sum to 1 (sum is computed
// on elided lines; presumably the reciprocal of the accumulated total).
2083 for( i = 0; i < n; i++ )
2085 if( ktype == CV_32F )
2086 cf[i] = (float)(cf[i]*sum);
// Computes the separable Gaussian kernel pair (kx horizontal, ky vertical)
// for the requested type/size/sigmas, resolving automatic kernel sizes and
// reusing kx as ky when the kernels would be identical.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
2096 static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize,
2097 double sigma1, double sigma2 )
2099 int depth = CV_MAT_DEPTH(type);
2103 // automatic detection of kernel size from sigma
// 8U uses a 3-sigma radius, deeper types 4-sigma; `|1` forces an odd size.
2104 if( ksize.width <= 0 && sigma1 > 0 )
2105 ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
2106 if( ksize.height <= 0 && sigma2 > 0 )
2107 ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
2109 CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
2110 ksize.height > 0 && ksize.height % 2 == 1 );
2112 sigma1 = std::max( sigma1, 0. );
2113 sigma2 = std::max( sigma2, 0. );
// Kernel coefficients are at least CV_32F regardless of the image depth.
2115 kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
// Same size and (effectively) equal sigmas: ky can share kx (assignment on an elided line).
2116 if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
2119 ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
// Builds a separable Gaussian filter engine from the kernel pair produced
// by createGaussianKernels.
2124 cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
2125 double sigma1, double sigma2,
2129 createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
2131 return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType );
// Specialized OpenCL Gaussian blur for 8UC1 images on Intel devices, with
// dedicated 3x3 and 5x5 kernels. Returns false when preconditions are not
// met so the caller can fall back to the generic path.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
2138 static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth,
2139 InputArray _kernelX, InputArray _kernelY, int borderType)
2141 const ocl::Device & dev = ocl::Device::getDefault();
2142 int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// Hard requirements of the specialized kernels: Intel device, 8UC1, aligned
// offset/step, and image dimensions divisible by the kernel's tile size.
2144 if ( !(dev.isIntel() && (type == CV_8UC1) &&
2145 (_src.offset() == 0) && (_src.step() % 4 == 0) &&
2146 ((ksize.width == 5 && (_src.cols() % 4 == 0)) ||
2147 (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) )
// Separable kernels must have odd length.
2150 Mat kernelX = _kernelX.getMat().reshape(1, 1);
2151 if (kernelX.cols % 2 != 1)
2153 Mat kernelY = _kernelY.getMat().reshape(1, 1);
2154 if (kernelY.cols % 2 != 1)
2160 Size size = _src.size();
2161 size_t globalsize[2] = { 0, 0 };
2162 size_t localsize[2] = { 0, 0 };
// Work sizes match each kernel's per-work-item tile (16x2 for 3x3, 4x1 for 5x5).
2164 if (ksize.width == 3)
2166 globalsize[0] = size.width / 16;
2167 globalsize[1] = size.height / 2;
2169 else if (ksize.width == 5)
2171 globalsize[0] = size.width / 4;
2172 globalsize[1] = size.height / 1;
// Index 3 (BORDER_WRAP) is unsupported, hence the 0 placeholder.
2175 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
2176 char build_opts[1024];
// Kernel coefficients are baked into the program via build options.
2177 sprintf(build_opts, "-D %s %s%s", borderMap[borderType],
2178 ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
2179 ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
2183 if (ksize.width == 3)
2184 kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
2185 else if (ksize.width == 5)
2186 kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts);
2191 UMat src = _src.getUMat();
2192 _dst.create(size, CV_MAKETYPE(ddepth, cn));
// Destination must satisfy the same alignment constraints as the source.
2193 if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
2195 UMat dst = _dst.getUMat();
2197 int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
2198 idxArg = kernel.set(idxArg, (int)src.step);
2199 idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
2200 idxArg = kernel.set(idxArg, (int)dst.step);
2201 idxArg = kernel.set(idxArg, (int)dst.rows);
2202 idxArg = kernel.set(idxArg, (int)dst.cols);
2204 return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
// OpenVX-accelerated Gaussian blur (8UC1 only). Returns false when the
// request cannot be handled here; throws via CV_Error on OpenVX failures.
// Mirrors the structure of openvx_boxfilter above.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
2211 static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
2212 double sigma1, double sigma2, int borderType)
2214 int stype = _src.type();
2217 // automatic detection of kernel size from sigma
// 3-sigma radius, forced odd via `|1` (same rule as createGaussianKernels for 8U).
2218 if (ksize.width <= 0 && sigma1 > 0)
2219 ksize.width = cvRound(sigma1*6 + 1) | 1;
2220 if (ksize.height <= 0 && sigma2 > 0)
2221 ksize.height = cvRound(sigma2*6 + 1) | 1;
// Only 8UC1 with odd kernels of at least 3x3 is supported.
2223 if (stype != CV_8UC1 ||
2224 ksize.width < 3 || ksize.height < 3 ||
2225 ksize.width % 2 != 1 || ksize.height % 2 != 1)
2228 sigma1 = std::max(sigma1, 0.);
2229 sigma2 = std::max(sigma2, 0.);
2231 Mat src = _src.getMat();
2232 Mat dst = _dst.getMat();
2234 if (src.cols < ksize.width || src.rows < ksize.height)
2237 if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
2238 return false; //Process isolated borders only
// Translate the border mode; only CONSTANT(0) and REPLICATE are handled.
2240 switch (borderType & ~BORDER_ISOLATED)
2242 case BORDER_CONSTANT:
2243 border.mode = VX_BORDER_CONSTANT;
2244 #if VX_VERSION > VX_VERSION_1_0
2245 border.constant_value.U8 = (vx_uint8)(0);
2247 border.constant_value = (vx_uint32)(0);
2250 case BORDER_REPLICATE:
2251 border.mode = VX_BORDER_REPLICATE;
2259 ivx::Context ctx = ivx::Context::create();
2260 if ((vx_size)(ksize.width) > ctx.convolutionMaxDimension() || (vx_size)(ksize.height) > ctx.convolutionMaxDimension())
2264 if (dst.data != src.data)
// Wrap the cv::Mat buffers as OpenVX images (no copy).
2270 ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
2271 ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
2272 ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
2273 ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
2275 //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
2276 //since the OpenVX standard says nothing about thread-safety for now
2277 vx_border_t prevBorder = ctx.borderMode();
2278 ctx.setBorderMode(border);
// Fast path: vxuGaussian3x3 covers the 3x3, sigma ~= 0.8 (or default) case.
2279 if (ksize.width == 3 && ksize.height == 3 && (sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) && (sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON))
2281 ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib));
2285 #if VX_VERSION <= VX_VERSION_1_0
2286 if (ctx.vendorID() == VX_ID_KHRONOS && ((vx_size)(a.cols) <= ctx.convolutionMaxDimension() || (vx_size)(a.rows) <= ctx.convolutionMaxDimension()))
2288 ctx.setBorderMode(prevBorder);
// General path: build the 2D Gaussian as the outer product of the 1D
// kernels, quantized to Q15 fixed point for vxuConvolve (scale 1<<15).
2293 cv::Mat(cv::getGaussianKernel(ksize.height, sigma2)*cv::getGaussianKernel(ksize.width, sigma1).t()).convertTo(convData, CV_16SC1, (1 << 15));
2294 ivx::Convolution cnv = ivx::Convolution::create(ctx, convData.cols, convData.rows);
2295 cnv.copyFrom(convData);
2296 cnv.setScale(1 << 15);
2297 ivx::IVX_CHECK_STATUS(vxuConvolve(ctx, ia, cnv, ib));
2299 ctx.setBorderMode(prevBorder);
// Convert OpenVX wrapper exceptions into OpenCV errors.
2301 catch (ivx::RuntimeError & e)
2303 CV_Error(CV_StsInternal, e.what());
2306 catch (ivx::WrapperError & e)
2308 CV_Error(CV_StsInternal, e.what());
// Intel IPP-accelerated Gaussian blur (IPP >= 8.1). Handles square kernels
// with equal sigmas on 8U/16U/16S/32F, 1 or 3 channels, const/replicate
// borders; returns false otherwise so callers fall back.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
2318 static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
2319 double sigma1, double sigma2,
2322 CV_INSTRUMENT_REGION_IPP()
2324 #if IPP_VERSION_X100 >= 810
// Non-isolated submatrix borders need neighbor pixels IPP can't see here.
2325 if ((borderType & BORDER_ISOLATED) == 0 && _src.isSubmatrix())
2328 int type = _src.type();
2329 Size size = _src.size();
2331 if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
2333 if( size.height == 1 )
2335 if( size.width == 1 )
2339 int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
// IPP path applies only to square, equal-sigma, non-degenerate Gaussians.
2341 if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
2342 sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
2344 IppiBorderType ippBorder = ippiGetBorderType(borderType);
2345 if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
2347 Mat src = _src.getMat(), dst = _dst.getMat();
2348 IppiSize roiSize = { src.cols, src.rows };
2349 IppDataType dataType = ippiGetDataType(depth);
2350 Ipp32s specSize = 0, bufferSize = 0;
// Query spec/work-buffer sizes, then initialize the filter specification.
2352 if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
2354 IppAutoBuffer<IppFilterGaussianSpec> spec(specSize);
2355 IppAutoBuffer<Ipp8u> buffer(bufferSize);
2357 if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, cn, spec, buffer) >= 0)
// Typed call helpers for 1- and 3-channel flavors; border value is 0.
// (No comments inside the macro bodies: lines end with '\\'.)
2359 #define IPP_FILTER_GAUSS_C1(ippfavor) \
2361 Ipp##ippfavor borderValues = 0; \
2362 status = CV_INSTRUMENT_FUN_IPP(ippiFilterGaussianBorder_##ippfavor##_C1R, src.ptr<Ipp##ippfavor>(), (int)src.step, \
2363 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
2366 #define IPP_FILTER_GAUSS_CN(ippfavor, ippcn) \
2368 Ipp##ippfavor borderValues[] = { 0, 0, 0 }; \
2369 status = CV_INSTRUMENT_FUN_IPP(ippiFilterGaussianBorder_##ippfavor##_C##ippcn##R, src.ptr<Ipp##ippfavor>(), (int)src.step, \
2370 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
2373 IppStatus status = ippStsErr;
2374 #if IPP_VERSION_X100 > 900 // Buffer overflow may happen in IPP 9.0.0 and less
2375 if (type == CV_8UC1)
2376 IPP_FILTER_GAUSS_C1(8u)
2379 if (type == CV_8UC3)
2380 IPP_FILTER_GAUSS_CN(8u, 3)
2381 else if (type == CV_16UC1)
2382 IPP_FILTER_GAUSS_C1(16u)
2383 else if (type == CV_16UC3)
2384 IPP_FILTER_GAUSS_CN(16u, 3)
2385 else if (type == CV_16SC1)
2386 IPP_FILTER_GAUSS_C1(16s)
2387 else if (type == CV_16SC3)
2388 IPP_FILTER_GAUSS_CN(16s, 3)
2389 else if (type == CV_32FC3)
2390 IPP_FILTER_GAUSS_CN(32f, 3)
2391 else if (type == CV_32FC1)
2392 IPP_FILTER_GAUSS_C1(32f)
2397 #undef IPP_FILTER_GAUSS_C1
2398 #undef IPP_FILTER_GAUSS_CN
// Pre-8.1 IPP builds: suppress unused-parameter warnings.
2404 CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
// Public cv::GaussianBlur entry point. Tries accelerated back ends in order
// (OpenVX, Tegra, IPP, OpenCL) and falls back to the generic separable
// sepFilter2D with the kernels from createGaussianKernels.
// NOTE(review): listing has elided lines (numbering gaps); code kept byte-identical.
2412 void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
2413 double sigma1, double sigma2,
2416 CV_INSTRUMENT_REGION()
2418 int type = _src.type();
2419 Size size = _src.size();
2420 _dst.create( size, type );
2422 if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
2424 if( size.height == 1 )
2426 if( size.width == 1 )
// 1x1 kernel is a no-op (handled on elided lines, presumably a copy).
2430 if( ksize.width == 1 && ksize.height == 1 )
2437 if (openvx_gaussianBlur(_src, _dst, ksize, sigma1, sigma2, borderType))
2441 #ifdef HAVE_TEGRA_OPTIMIZATION
2442 Mat src = _src.getMat();
2443 Mat dst = _dst.getMat();
2444 if(sigma1 == 0 && sigma2 == 0 && tegra::useTegra() && tegra::gaussian(src, dst, ksize, borderType))
// Skip IPP when the OpenCL path is going to take this call anyway.
2448 CV_IPP_RUN(!(ocl::useOpenCL() && _dst.isUMat()), ipp_GaussianBlur( _src, _dst, ksize, sigma1, sigma2, borderType));
2451 createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
// Specialized OpenCL 3x3/5x5 kernels when the image is larger than the kernel.
2453 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 &&
2454 ((ksize.width == 3 && ksize.height == 3) ||
2455 (ksize.width == 5 && ksize.height == 5)) &&
2456 (size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(),
2457 ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));
// Generic separable-filter fallback.
2459 sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
2462 /****************************************************************************************\
2464 \****************************************************************************************/
2471 * This structure represents a two-tier histogram. The first tier (known as the
2472 * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
2473 * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
2474 * coarse bucket designated by the 4 MSBs of the fine bucket value.
2476 * The structure is aligned on 16 bits, which is a prerequisite for SIMD
2477 * instructions. Each bucket is 16 bit wide, which means that extra care must be
2478 * taken to prevent overflow.
// SIMD element-wise add/subtract of 16-bucket histograms (16-bit buckets,
// see the Histogram description above): an SSE2 variant, a NEON variant,
// and a no-SIMD fallback that just defines MEDIAN_HAVE_SIMD 0.
// The surrounding #if/#elif/#else dispatch lines are elided in this listing.
2488 #define MEDIAN_HAVE_SIMD 1
// SSE2: process the 16 x 16-bit buckets as two 128-bit lanes.
2490 static inline void histogram_add_simd( const HT x[16], HT y[16] )
2492 const __m128i* rx = (const __m128i*)x;
2493 __m128i* ry = (__m128i*)y;
2494 __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
2495 __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
2496 _mm_store_si128(ry+0, r0);
2497 _mm_store_si128(ry+1, r1);
2500 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
2502 const __m128i* rx = (const __m128i*)x;
2503 __m128i* ry = (__m128i*)y;
2504 __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
2505 __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
2506 _mm_store_si128(ry+0, r0);
2507 _mm_store_si128(ry+1, r1);
2511 #define MEDIAN_HAVE_SIMD 1
// NEON: same operation, two 8-lane uint16 vectors.
2513 static inline void histogram_add_simd( const HT x[16], HT y[16] )
2515 vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
2516 vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
2519 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
2521 vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
2522 vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
// No SIMD available: scalar helpers below are used instead.
2526 #define MEDIAN_HAVE_SIMD 0
// Scalar 16-bucket histogram helpers (the cast to HT keeps 16-bit
// wraparound semantics explicit).
// y += x, element-wise.
2530 static inline void histogram_add( const HT x[16], HT y[16] )
2533 for( i = 0; i < 16; ++i )
2534 y[i] = (HT)(y[i] + x[i]);
// y -= x, element-wise.
2537 static inline void histogram_sub( const HT x[16], HT y[16] )
2540 for( i = 0; i < 16; ++i )
2541 y[i] = (HT)(y[i] - x[i]);
// y += a * x, element-wise (used to weight a column histogram a times).
2544 static inline void histogram_muladd( int a, const HT x[16],
2547 for( int i = 0; i < 16; ++i )
2548 y[i] = (HT)(y[i] + a * x[i]);
// medianBlur_8u_O1: median filter for CV_8U images whose per-pixel cost does
// not grow with ksize — the two-level-histogram "constant time" technique
// (in the style of Perreault & Hebert, "Median Filtering in Constant Time").
// The image is processed in vertical stripes; each column keeps a coarse
// 16-bin plus a fine 16x16-bin histogram (h_coarse/h_fine), and the kernel
// histogram H[c] slides incrementally along each row.
// NOTE(review): this extraction elides braces and some lines; the code below
// is kept byte-identical.
2552 medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
2555  * HOP is short for Histogram OPeration. This macro makes an operation \a op on
2556  * histogram \a h for pixel value \a x. It takes care of handling both levels.
2558 #define HOP(h,x,op) \
2559 h.coarse[x>>4] op, \
2560 *((HT*)h.fine + x) op
// COP: apply op to the column histograms of channel c, column j, for pixel
// value x — once in the coarse level (x>>4) and once in the fine level.
2562 #define COP(c,j,x,op) \
2563 h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
2564 h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op
// r = kernel radius; H holds one kernel histogram per channel (up to 4);
// luc[c][k] caches the column up to which fine segment k has been updated
// (lazy fine-level maintenance).
2566 int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
2567 size_t sstep = _src.step, dstep = _dst.step;
2568 Histogram CV_DECL_ALIGNED(16) H[4];
2569 HT CV_DECL_ALIGNED(16) luc[4][16];
// Stripe width bounded so the per-column histograms stay cache-resident.
2571 int STRIPE_SIZE = std::min( _dst.cols, 512/cn );
// +16 elements of slack so the pointers can be 16-byte aligned below.
2573 std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
2574 std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
2575 HT* h_coarse = alignPtr(&_h_coarse[0], 16);
2576 HT* h_fine = alignPtr(&_h_fine[0], 16);
2577 #if MEDIAN_HAVE_SIMD
2578 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
// Process the image in vertical stripes of at most STRIPE_SIZE columns
// (n includes 2*r border columns).
2581 for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
2583 int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
2584 const uchar* src = _src.ptr() + x*cn;
2585 uchar* dst = _dst.ptr() + (x - r)*cn;
2587 memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
2588 memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );
2590 // First row initialization
2591 for( c = 0; c < cn; c++ )
// The top border is replicated, hence the first row carries weight r+2.
2593 for( j = 0; j < n; j++ )
2594 COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );
2596 for( i = 1; i < r; i++ )
2598 const uchar* p = src + sstep*std::min(i, m-1);
2599 for ( j = 0; j < n; j++ )
2600 COP( c, j, p[cn*j+c], ++ );
// Main loop over output rows: slide each column histogram down one row
// (remove p0, add p1), then slide the kernel histogram across the stripe.
2604 for( i = 0; i < m; i++ )
// p0/p1 = rows leaving/entering the vertical window, clamped at borders.
2606 const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
2607 const uchar* p1 = src + sstep * std::min( m-1, i+r );
2609 memset( H, 0, cn*sizeof(H[0]) );
2610 memset( luc, 0, cn*sizeof(luc[0]) );
2611 for( c = 0; c < cn; c++ )
2613 // Update column histograms for the entire row.
2614 for( j = 0; j < n; j++ )
2616 COP( c, j, p0[j*cn + c], -- );
2617 COP( c, j, p1[j*cn + c], ++ );
2620 // First column initialization
2621 for( k = 0; k < 16; ++k )
2622 histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );
// SIMD and scalar paths below are structurally identical; they differ
// only in which histogram_{add,sub}[_simd] helpers they call.
2624 #if MEDIAN_HAVE_SIMD
2627 for( j = 0; j < 2*r; ++j )
2628 histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );
2630 for( j = r; j < n-r; j++ )
// t = 2*r*r + 2*r is the 0-based rank of the median within the
// (2r+1)^2-element window.
2632 int t = 2*r*r + 2*r, b, sum = 0;
2635 histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
2637 // Find median at coarse level
2638 for ( k = 0; k < 16 ; ++k )
2640 sum += H[c].coarse[k];
2643 sum -= H[c].coarse[k];
2649 /* Update corresponding histogram segment */
// Lazy update: fine segment k is rebuilt from scratch only when it is
// too stale; otherwise it is slid incrementally.
2650 if ( luc[c][k] <= j-r )
2652 memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
2653 for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
2654 histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
2656 if ( luc[c][k] < j+r+1 )
// Right border: replicate the last column's contribution.
2658 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
2659 luc[c][k] = (HT)(j+r+1);
2664 for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
2666 histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
2667 histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
2671 histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
2673 /* Find median in segment */
2674 segment = H[c].fine[k];
2675 for ( b = 0; b < 16 ; b++ )
// Median value is reassembled from coarse bin k and fine bin b.
2680 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
// Scalar fallback path — mirrors the SIMD block above exactly.
2690 for( j = 0; j < 2*r; ++j )
2691 histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
2693 for( j = r; j < n-r; j++ )
2695 int t = 2*r*r + 2*r, b, sum = 0;
2698 histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
2700 // Find median at coarse level
2701 for ( k = 0; k < 16 ; ++k )
2703 sum += H[c].coarse[k];
2706 sum -= H[c].coarse[k];
2712 /* Update corresponding histogram segment */
2713 if ( luc[c][k] <= j-r )
2715 memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
2716 for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
2717 histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
2719 if ( luc[c][k] < j+r+1 )
2721 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
2722 luc[c][k] = (HT)(j+r+1);
2727 for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
2729 histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
2730 histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
2734 histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
2736 /* Find median in segment */
2737 segment = H[c].fine[k];
2738 for ( b = 0; b < 16 ; b++ )
2743 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
// medianBlur_8u_Om: classic column-serpentine histogram median for CV_8U
// (cost grows with aperture m). Columns are traversed alternately top-down /
// bottom-up (src_step1 sign flip) so the running histograms can be reused.
// zone0/zone1 appear to be a coarse (16-bin) and fine (256-bin) histogram per
// channel — their declarations are elided in this extraction; TODO confirm.
2759 medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
2766 Size size = _dst.size();
2767 const uchar* src = _src.ptr();
2768 uchar* dst = _dst.ptr();
2769 int src_step = (int)_src.step, dst_step = (int)_dst.step;
2770 int cn = _src.channels();
2771 const uchar* src_max = src + size.height*src_step;
// UPDATE_ACC01: apply op to both histogram levels of channel cn for pixel
// value pix (coarse bin = value>>4; fine-level part elided here).
2773 #define UPDATE_ACC01( pix, cn, op ) \
2777 zone0[cn][p >> 4] op; \
2780 //CV_Assert( size.height >= nx && size.width >= nx );
// Outer loop: one pass per image column (the window slides vertically).
2781 for( x = 0; x < size.width; x++, src += cn, dst += cn )
2783 uchar* dst_cur = dst;
2784 const uchar* src_top = src;
2785 const uchar* src_bottom = src;
2787 int src_step1 = src_step, dst_step1 = dst_step;
// Serpentine order: on alternate columns start from the bottom row and
// negate the steps, so the histogram state carries over between columns.
2791 src_bottom = src_top += src_step*(size.height-1);
2792 dst_cur += dst_step*(size.height-1);
2793 src_step1 = -src_step1;
2794 dst_step1 = -dst_step1;
2798 memset( zone0, 0, sizeof(zone0[0])*cn );
2799 memset( zone1, 0, sizeof(zone1[0])*cn );
// Prime the histograms with the first m/2+1 window rows; the border row
// is replicated with weight m/2+1.
2801 for( y = 0; y <= m/2; y++ )
2803 for( c = 0; c < cn; c++ )
2807 for( k = 0; k < m*cn; k += cn )
2808 UPDATE_ACC01( src_bottom[k+c], c, ++ );
2812 for( k = 0; k < m*cn; k += cn )
2813 UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
2817 if( (src_step1 > 0 && y < size.height-1) ||
2818 (src_step1 < 0 && size.height-y-1 > 0) )
2819 src_bottom += src_step1;
// Slide the window down the column, emitting one median per row.
2822 for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
// Walk the coarse bins accumulating counts until the median bin is
// found, then (elided) search the fine level inside that bin.
2825 for( c = 0; c < cn; c++ )
2830 int t = s + zone0[c][k];
2841 dst_cur[c] = (uchar)k;
2844 if( y+1 == size.height )
2849 for( k = 0; k < m; k++ )
2852 int q = src_bottom[k];
// Unrolled 3- and 4-channel updates: remove the outgoing top row and
// add the incoming bottom row for every channel.
2861 for( k = 0; k < m*3; k += 3 )
2863 UPDATE_ACC01( src_top[k], 0, -- );
2864 UPDATE_ACC01( src_top[k+1], 1, -- );
2865 UPDATE_ACC01( src_top[k+2], 2, -- );
2867 UPDATE_ACC01( src_bottom[k], 0, ++ );
2868 UPDATE_ACC01( src_bottom[k+1], 1, ++ );
2869 UPDATE_ACC01( src_bottom[k+2], 2, ++ );
2875 for( k = 0; k < m*4; k += 4 )
2877 UPDATE_ACC01( src_top[k], 0, -- );
2878 UPDATE_ACC01( src_top[k+1], 1, -- );
2879 UPDATE_ACC01( src_top[k+2], 2, -- );
2880 UPDATE_ACC01( src_top[k+3], 3, -- );
2882 UPDATE_ACC01( src_bottom[k], 0, ++ );
2883 UPDATE_ACC01( src_bottom[k+1], 1, ++ );
2884 UPDATE_ACC01( src_bottom[k+2], 2, ++ );
2885 UPDATE_ACC01( src_bottom[k+3], 3, ++ );
// Advance the window edges, clamping at the image borders (replicate).
2889 if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
2890 (src_step1 < 0 && src_bottom + src_step1 >= src) )
2891 src_bottom += src_step1;
2894 src_top += src_step1;
// Scalar compare-exchange functor for uchar pixels (used by the sorting-
// network median). load/store widen to int so arithmetic cannot overflow.
2904 typedef uchar value_type;
2905 typedef int arg_type;
2907 arg_type load(const uchar* ptr) { return *ptr; }
2908 void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
// Intended post-condition: a = min(a,b), b = max(a,b), branch-free via
// CV_FAST_CAST_8U — the tail of the body is elided here; TODO confirm.
2909 void operator()(arg_type& a, arg_type& b) const
2911 int t = CV_FAST_CAST_8U(a - b);
// Scalar compare-exchange functor for ushort pixels; arithmetic in int.
2918 typedef ushort value_type;
2919 typedef int arg_type;
2921 arg_type load(const ushort* ptr) { return *ptr; }
2922 void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
// Presumably leaves min in a and max in b like the other functors — body
// is elided in this extraction; verify against the full source.
2923 void operator()(arg_type& a, arg_type& b) const
// Scalar compare-exchange functor for short pixels; arithmetic in int.
2933 typedef short value_type;
2934 typedef int arg_type;
2936 arg_type load(const short* ptr) { return *ptr; }
2937 void store(short* ptr, arg_type val) { *ptr = (short)val; }
// Presumably leaves min in a and max in b — body elided here; verify.
2938 void operator()(arg_type& a, arg_type& b) const
// Scalar compare-exchange functor for float pixels (no widening needed).
2948 typedef float value_type;
2949 typedef float arg_type;
2951 arg_type load(const float* ptr) { return *ptr; }
2952 void store(float* ptr, arg_type val) { *ptr = val; }
// Presumably leaves min in a and max in b — body elided here; verify.
2953 void operator()(arg_type& a, arg_type& b) const
// SSE2 compare-exchange over 16 uchar lanes: after operator(), each lane of
// a holds the min and each lane of b the max. Unaligned loads/stores so the
// caller can step by arbitrary pixel offsets.
2965 typedef uchar value_type;
2966 typedef __m128i arg_type;
2968 arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
2969 void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
2970 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (its declaration is
// elided here) so b gets max(original a, b); TODO confirm.
2973 a = _mm_min_epu8(a, b);
2974 b = _mm_max_epu8(b, t);
// SSE2 compare-exchange over 8 ushort lanes. SSE2 has no _mm_min_epu16 /
// _mm_max_epu16, so the unsigned saturating trick is used instead:
// t = max(a-b, 0); then a-t = min(a,b) and b+t = max(a,b).
2981 typedef ushort value_type;
2982 typedef __m128i arg_type;
2984 arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
2985 void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
2986 void operator()(arg_type& a, arg_type& b) const
2988 arg_type t = _mm_subs_epu16(a, b);
2989 a = _mm_subs_epu16(a, t);
2990 b = _mm_adds_epu16(b, t);
// SSE2 compare-exchange over 8 signed short lanes (native min/max exist).
2997 typedef short value_type;
2998 typedef __m128i arg_type;
3000 arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
3001 void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
3002 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (declaration elided).
3005 a = _mm_min_epi16(a, b);
3006 b = _mm_max_epi16(b, t);
// SSE compare-exchange over 4 float lanes.
3013 typedef float value_type;
3014 typedef __m128 arg_type;
3016 arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
3017 void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
3018 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (declaration elided).
3021 a = _mm_min_ps(a, b);
3022 b = _mm_max_ps(b, t);
// NEON compare-exchange over 16 uchar lanes (min into a, max into b; the
// min/max statements themselves are elided in this extraction).
3030 typedef uchar value_type;
3031 typedef uint8x16_t arg_type;
3033 arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
3034 void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
3035 void operator()(arg_type& a, arg_type& b) const
// NEON compare-exchange over 8 ushort lanes: min into a, max into b.
3046 typedef ushort value_type;
3047 typedef uint16x8_t arg_type;
3049 arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
3050 void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
3051 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (declaration elided).
3054 a = vminq_u16(a, b);
3055 b = vmaxq_u16(b, t);
// NEON compare-exchange over 8 signed short lanes: min into a, max into b.
3062 typedef short value_type;
3063 typedef int16x8_t arg_type;
3065 arg_type load(const short* ptr) { return vld1q_s16(ptr); }
3066 void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
3067 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (declaration elided).
3070 a = vminq_s16(a, b);
3071 b = vmaxq_s16(b, t);
// NEON compare-exchange over 4 float lanes: min into a, max into b.
3078 typedef float value_type;
3079 typedef float32x4_t arg_type;
3081 arg_type load(const float* ptr) { return vld1q_f32(ptr); }
3082 void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
3083 void operator()(arg_type& a, arg_type& b) const
// t is presumably a saved copy of the original a (declaration elided).
3086 a = vminq_f32(a, b);
3087 b = vmaxq_f32(b, t);
// No SIMD available: the "vector" ops alias the scalar comparators so
// medianBlur_SortNet compiles unchanged (one element per "vector").
3094 typedef MinMax8u MinMaxVec8u;
3095 typedef MinMax16u MinMaxVec16u;
3096 typedef MinMax16s MinMaxVec16s;
3097 typedef MinMax32f MinMaxVec32f;
// medianBlur_SortNet: 3x3 and 5x5 median via fixed-size sorting networks.
// Op is the scalar compare-exchange functor, VecOp the SIMD one; after the
// network runs, the middle element (p4 of 9, p[12] of 25) is the median.
// Borders are replicated by clamping the sampled row/column indices.
// NOTE(review): braces and some lines are elided in this extraction; the
// code is kept byte-identical.
3101 template<class Op, class VecOp>
3103 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
3105 typedef typename Op::value_type T;
3106 typedef typename Op::arg_type WT;
3107 typedef typename VecOp::arg_type VT;
3109 const T* src = _src.ptr<T>();
3110 T* dst = _dst.ptr<T>();
3111 int sstep = (int)(_src.step/sizeof(T));
3112 int dstep = (int)(_dst.step/sizeof(T));
3113 Size size = _dst.size();
3114 int i, j, k, cn = _src.channels();
3117 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
// ---- m == 3 path (presumably guarded by an elided `if( m == 3 )`) ----
// Degenerate 1-D images: 3-element median along the single row/column.
3121 if( size.width == 1 || size.height == 1 )
3123 int len = size.width + size.height - 1;
3124 int sdelta = size.height == 1 ? cn : sstep;
3125 int sdelta0 = size.height == 1 ? 0 : sstep - cn;
3126 int ddelta = size.height == 1 ? cn : dstep;
3128 for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
3129 for( j = 0; j < cn; j++, src++ )
// Neighbors clamped at the ends (replicate border).
3131 WT p0 = src[i > 0 ? -sdelta : 0];
3133 WT p2 = src[i < len - 1 ? sdelta : 0];
3135 op(p0, p1); op(p1, p2); op(p0, p1);
3142 for( i = 0; i < size.height; i++, dst += dstep )
3144 const T* row0 = src + std::max(i - 1, 0)*sstep;
3145 const T* row1 = src + i*sstep;
3146 const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
// With SIMD only the left border (first cn pixels) is done scalar;
// otherwise the whole row goes through the scalar network.
3147 int limit = useSIMD ? cn : size.width;
3151 for( ; j < limit; j++ )
3153 int j0 = j >= cn ? j - cn : j;
3154 int j2 = j < size.width - cn ? j + cn : j;
3155 WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
3156 WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
3157 WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
// 9-element median sorting network; p4 ends up as the median.
3159 op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
3160 op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
3161 op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
3162 op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
3163 op(p4, p2); op(p6, p4); op(p4, p2);
3167 if( limit == size.width )
// SIMD main loop: same network, VecOp::SIZE pixels at a time.
3170 for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
3172 VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
3173 VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
3174 VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);
3176 vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
3177 vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
3178 vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
3179 vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
3180 vop(p4, p2); vop(p6, p4); vop(p4, p2);
3181 vop.store(dst+j, p4);
// ---- m == 5 path ----
// Degenerate 1-D images: 5-element median along the single row/column.
3190 if( size.width == 1 || size.height == 1 )
3192 int len = size.width + size.height - 1;
3193 int sdelta = size.height == 1 ? cn : sstep;
3194 int sdelta0 = size.height == 1 ? 0 : sstep - cn;
3195 int ddelta = size.height == 1 ? cn : dstep;
3197 for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
3198 for( j = 0; j < cn; j++, src++ )
3200 int i1 = i > 0 ? -sdelta : 0;
3201 int i0 = i > 1 ? -sdelta*2 : i1;
3202 int i3 = i < len-1 ? sdelta : 0;
3203 int i4 = i < len-2 ? sdelta*2 : i3;
3204 WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
3206 op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
3207 op(p2, p4); op(p1, p3); op(p1, p2);
3214 for( i = 0; i < size.height; i++, dst += dstep )
3217 row[0] = src + std::max(i - 2, 0)*sstep;
3218 row[1] = src + std::max(i - 1, 0)*sstep;
3219 row[2] = src + i*sstep;
3220 row[3] = src + std::min(i + 1, size.height-1)*sstep;
3221 row[4] = src + std::min(i + 2, size.height-1)*sstep;
3222 int limit = useSIMD ? cn*2 : size.width;
3226 for( ; j < limit; j++ )
// Gather the 5x5 neighborhood with clamped column indices.
3229 int j1 = j >= cn ? j - cn : j;
3230 int j0 = j >= cn*2 ? j - cn*2 : j1;
3231 int j3 = j < size.width - cn ? j + cn : j;
3232 int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
3233 for( k = 0; k < 5; k++ )
3235 const T* rowk = row[k];
3236 p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
3237 p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
3238 p[k*5+4] = rowk[j4];
// 25-element median sorting network; p[12] ends up as the median.
3241 op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
3242 op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
3243 op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
3244 op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
3245 op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
3246 op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
3247 op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
3248 op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
3249 op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
3250 op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
3251 op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
3252 op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
3253 op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
3254 op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
3255 op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
3256 op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
3257 op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
3258 op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
3259 op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
3260 op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
3261 op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
3262 op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
3263 op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
3267 if( limit == size.width )
// SIMD main loop: identical 25-element network over vector lanes.
3270 for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
3273 for( k = 0; k < 5; k++ )
3275 const T* rowk = row[k];
3276 p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
3277 p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
3278 p[k*5+4] = vop.load(rowk+j+cn*2);
3281 vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
3282 vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
3283 vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
3284 vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
3285 vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
3286 vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
3287 vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
3288 vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
3289 vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
3290 vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
3291 vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
3292 vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
3293 vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
3294 vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
3295 vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
3296 vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
3297 vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
3298 vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
3299 vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
3300 vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
3301 vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
3302 vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
3303 vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
3304 vop.store(dst+j, p[12]);
// ocl_medianFilter: OpenCL dispatch for medianBlur. Supports 8U/16U/16S/32F,
// up to 4 channels, m in {3,5}. On Intel devices an optimized single-channel
// variant processes 4 pixels per work item (requires /4-divisible dims).
// Returns false when the filter cannot run on this device/type combination.
3315 static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
3317 size_t localsize[2] = { 16, 16 };
3318 size_t globalsize[2];
3319 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3321 if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
3324 Size imgSize = _src.size();
// The 4-pixel-per-item kernel needs a big enough image, /4-divisible
// dimensions, single channel, and an Intel device.
3325 bool useOptimized = (1 == cn) &&
3326 (size_t)imgSize.width >= localsize[0] * 8 &&
3327 (size_t)imgSize.height >= localsize[1] * 8 &&
3328 imgSize.width % 4 == 0 &&
3329 imgSize.height % 4 == 0 &&
3330 (ocl::Device::getDefault().isIntel());
3332 cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ;
3333 cv::String kdefs = useOptimized ?
3334 format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
3335 ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
3337 format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ;
3339 ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );
3344 UMat src = _src.getUMat();
3345 _dst.create(src.size(), type);
3346 UMat dst = _dst.getUMat();
3348 k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
// Optimized path: each work item covers a 4x4 pixel tile.
3352 globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
3353 globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
// Generic path: one pixel per work item, rounded up to the local size.
3357 globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
3358 globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
3361 return k.run(2, globalsize, localsize, false);
// openvx_medianFilter: OpenVX dispatch for medianBlur (CV_8UC1 only; ksize>3
// requires OpenVX 1.1 non-linear filters). Wraps the Mats as OpenVX images
// without copying, temporarily sets replicate border mode on the context,
// and restores the previous border mode afterwards. Returns false when the
// backend cannot handle the request.
3371 static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize)
3373 if (_src.type() != CV_8UC1 || _dst.type() != CV_8U
3374 #ifndef VX_VERSION_1_1
3380 Mat src = _src.getMat();
3381 Mat dst = _dst.getMat();
3384 border.mode = VX_BORDER_REPLICATE;
3388 ivx::Context ctx = ivx::Context::create();
3389 #ifdef VX_VERSION_1_1
// Bail out if the device cannot support this aperture size.
3390 if ((vx_size)ksize > ctx.nonlinearMaxDimension())
3395 if (dst.data != src.data)
// Zero-copy wrapping of the cv::Mat buffers as OpenVX images.
3401 ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
3402 ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
3403 ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
3404 ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
3406 //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
3407 //since the OpenVX standard says nothing about thread-safety for now
3408 vx_border_t prevBorder = ctx.borderMode();
3409 ctx.setBorderMode(border);
3410 #ifdef VX_VERSION_1_1
// 3x3 medians use the dedicated vxuMedian3x3 primitive.
3414 ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib));
3416 #ifdef VX_VERSION_1_1
// Larger apertures: box-pattern matrix + generic non-linear filter.
3421 mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize);
3424 vx_size supportedSize;
3425 ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize)));
3426 if ((vx_size)ksize > supportedSize)
// Restore the border mode before the early return.
3428 ctx.setBorderMode(prevBorder);
3431 Mat mask(ksize, ksize, CV_8UC1, Scalar(255));
3432 mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize);
3435 ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib));
3438 ctx.setBorderMode(prevBorder);
// OpenVX wrapper errors are re-raised as OpenCV internal errors.
3440 catch (ivx::RuntimeError & e)
3442 CV_Error(CV_StsInternal, e.what());
3445 catch (ivx::WrapperError & e)
3447 CV_Error(CV_StsInternal, e.what());
// ipp_medianFilter: Intel IPP dispatch for medianBlur (IPP >= 8.1).
// Handles single-channel 8U/16U/16S/32F via ippiFilterMedianBorder with
// replicate borders; returns false so the caller falls back otherwise.
3459 static bool ipp_medianFilter( InputArray _src0, OutputArray _dst, int ksize )
3461 CV_INSTRUMENT_REGION_IPP()
3463 #if IPP_VERSION_X100 >= 810
3464 Mat src0 = _src0.getMat();
3465 _dst.create( src0.size(), src0.type() );
3466 Mat dst = _dst.getMat();
// Query the work-buffer size, allocate it with ippsMalloc, run the
// bordered median, and record the IPP implementation on success.
3468 #define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
3471 if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
3472 ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
3474 Ipp8u * buffer = ippsMalloc_8u(bufSize); \
3475 IppStatus status = CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_##flavor, src.ptr<ippType>(), (int)src.step, \
3476 dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
3477 ippBorderRepl, (ippType)0, buffer); \
3481 CV_IMPL_ADD(CV_IMPL_IPP); \
3491 IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
// IPP cannot filter in place; work on a copy when src and dst alias.
3493 if( dst.data != src0.data )
3498 int type = src0.type();
3499 if (type == CV_8UC1)
3500 IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
3501 else if (type == CV_16UC1)
3502 IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
3503 else if (type == CV_16SC1)
3504 IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
3505 else if (type == CV_32FC1)
3506 IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
3508 #undef IPP_FILTER_MEDIAN_BORDER
// Older IPP: parameters intentionally unused; caller falls back.
3510 CV_UNUSED(_src0); CV_UNUSED(_dst); CV_UNUSED(ksize);
// cv::medianBlur: public entry point. Dispatch order: OpenCL -> OpenVX ->
// IPP (ksize <= 5) -> Tegra -> sorting-network (3x3/5x5) -> histogram-based
// CV_8U paths (Om for small apertures, O1 for large ones).
3517 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
3519 CV_INSTRUMENT_REGION()
// Aperture must be odd; only 2-D inputs are supported.
3521 CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
3529 CV_OCL_RUN(_dst.isUMat(),
3530 ocl_medianFilter(_src0,_dst, ksize))
3532 Mat src0 = _src0.getMat();
3533 _dst.create( src0.size(), src0.type() );
3534 Mat dst = _dst.getMat();
3537 if (openvx_medianFilter(_src0, _dst, ksize))
3541 CV_IPP_RUN(IPP_VERSION_X100 >= 810 && ksize <= 5, ipp_medianFilter(_src0,_dst, ksize));
3543 #ifdef HAVE_TEGRA_OPTIMIZATION
3544 if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
// Sorting networks always handle 3x3; 5x5 only when no SIMD is available
// and the type/channels fall outside the histogram paths.
3548 bool useSortNet = ksize == 3 || (ksize == 5
3549 #if !(CV_SSE2 || CV_NEON)
3550 && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 )
// Sort-net path filters in place safely, but not when buffers alias.
3557 if( dst.data != src0.data )
3562 if( src.depth() == CV_8U )
3563 medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
3564 else if( src.depth() == CV_16U )
3565 medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
3566 else if( src.depth() == CV_16S )
3567 medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
3568 else if( src.depth() == CV_32F )
3569 medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
3571 CV_Error(CV_StsUnsupportedFormat, "");
// Histogram paths expect a horizontally pre-padded (replicate) source.
3577 cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );
3579 int cn = src0.channels();
3580 CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );
// Heuristic: small apertures (threshold scaled by image size in
// megapixels and SIMD availability) favor the O(m) algorithm.
3582 double img_size_mp = (double)(src0.total())/(1 << 20);
3583 if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
3584 (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
3585 medianBlur_8u_Om( src, dst, ksize );
3587 medianBlur_8u_O1( src, dst, ksize );
3591 /****************************************************************************************\
3593 \****************************************************************************************/
// BilateralFilter_8u_Invoker: parallel row-range worker for the 8-bit
// bilateral filter. `temp` is the source pre-padded by `radius` (so all taps
// are in bounds), `space_ofs`/`space_weight` hold the maxk in-range kernel
// taps (byte offsets + spatial weights), and `color_weight` is a LUT mapping
// absolute intensity difference to the range weight.
3598 class BilateralFilter_8u_Invoker :
3599 public ParallelLoopBody
3602 BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
3603 int* _space_ofs, float *_space_weight, float *_color_weight) :
3604 temp(&_temp), dest(&_dest), radius(_radius),
3605 maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
// Filters rows [range.start, range.end) of dest.
3609 virtual void operator() (const Range& range) const
3611 int i, j, cn = dest->channels(), k;
3612 Size size = dest->size();
// Aligned scratch for SSE lane spills; bufSignMask clears sign bits
// (i.e. computes |x| via andnot).
3614 int CV_DECL_ALIGNED(16) buf[4];
3615 float CV_DECL_ALIGNED(16) bufSum[4];
3616 static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
3617 bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
3620 for( i = range.start; i < range.end; i++ )
// Skip the vertical and horizontal padding of temp.
3622 const uchar* sptr = temp->ptr(i+radius) + radius*cn;
3623 uchar* dptr = dest->ptr(i);
// ---- single-channel path ----
3627 for( j = 0; j < size.width; j++ )
3629 float sum = 0, wsum = 0;
// SSE3 path: 4 kernel taps per iteration; hadd folds the 4 partial
// (weight, weighted-value) pairs into bufSum[0]/bufSum[1].
3635 __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
3636 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
3638 for( ; k <= maxk - 4; k += 4 )
3640 __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
3641 sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
// |val - val0| -> index into the color_weight LUT.
3643 __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
3644 _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
3646 __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
3647 color_weight[buf[1]],color_weight[buf[0]]);
3648 __m128 _sw = _mm_loadu_ps(space_weight+k);
3649 __m128 _w = _mm_mul_ps(_cw, _sw);
3650 _cw = _mm_mul_ps(_w, _valF);
3652 _sw = _mm_hadd_ps(_w, _cw);
3653 _sw = _mm_hadd_ps(_sw, _sw);
3654 _mm_storel_pi((__m64*)bufSum, _sw);
// Scalar tail for the remaining taps.
3661 for( ; k < maxk; k++ )
3663 int val = sptr[j + space_ofs[k]];
3664 float w = space_weight[k]*color_weight[std::abs(val - val0)];
3668 // overflow is not possible here => there is no need to use cv::saturate_cast
3669 dptr[j] = (uchar)cvRound(sum/wsum);
// ---- three-channel (BGR) path ----
3675 for( j = 0; j < size.width*3; j += 3 )
3677 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
3678 int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
3683 const __m128i izero = _mm_setzero_si128();
3684 const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
3685 const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
3686 const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
3687 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
3689 for( ; k <= maxk - 4; k += 4 )
3691 const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
3692 const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
3693 const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
3694 const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
// Widen 4 BGRx pixels from u8 to float, then transpose so each
// register holds one channel across the 4 taps.
3696 __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
3697 __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
3698 __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
3699 __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
3701 _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
// L1 color distance |db|+|dg|+|dr| indexes the color_weight LUT.
3703 __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
3704 __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
3705 __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
3707 bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
3708 _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
3710 __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
3711 color_weight[buf[1]],color_weight[buf[0]]);
3712 __m128 _sw = _mm_loadu_ps(space_weight+k);
3714 _w = _mm_mul_ps(_w,_sw);
3715 _b = _mm_mul_ps(_b, _w);
3716 _g = _mm_mul_ps(_g, _w);
3717 _r = _mm_mul_ps(_r, _w);
// Horizontal adds collapse the 4 taps into (wsum, sum_b, sum_g, sum_r).
3719 _w = _mm_hadd_ps(_w, _b);
3720 _g = _mm_hadd_ps(_g, _r);
3722 _w = _mm_hadd_ps(_w, _g);
3723 _mm_store_ps(bufSum, _w);
3733 for( ; k < maxk; k++ )
3735 const uchar* sptr_k = sptr + j + space_ofs[k];
3736 int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
3737 float w = space_weight[k]*color_weight[std::abs(b - b0) +
3738 std::abs(g - g0) + std::abs(r - r0)];
3739 sum_b += b*w; sum_g += g*w; sum_r += r*w;
// NOTE(review): multiplies by wsum — presumably wsum is inverted
// (wsum = 1/wsum) on an elided line just above; verify.
3743 b0 = cvRound(sum_b*wsum);
3744 g0 = cvRound(sum_g*wsum);
3745 r0 = cvRound(sum_r*wsum);
3746 dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
// Non-owning pointers/arrays supplied by the caller; the invoker does
// not manage their lifetime.
3755 int radius, maxk, *space_ofs;
3756 float *space_weight, *color_weight;
3759 #if defined (HAVE_IPP) && IPP_DISABLE_BLOCK
// IPPBilateralFilter_8u_Invoker: parallel row-range worker delegating to
// IPP's Gaussian bilateral filter. On any IPP failure it clears *ok so the
// caller can fall back to the generic implementation. (Currently compiled
// out via IPP_DISABLE_BLOCK.)
3760 class IPPBilateralFilter_8u_Invoker :
3761 public ParallelLoopBody
3764 IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
3765 ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
3770 virtual void operator() (const Range& range) const
3772 int d = radius * 2 + 1;
3773 IppiSize kernel = {d, d};
3774 IppiSize roi={dst.cols, range.end - range.start};
3776 if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
// The IPP spec structure lives in (and is aligned within) this buffer.
3781 AutoBuffer<uchar> buf(bufsize);
3782 IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
3783 if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
// src is pre-padded by radius; offset skips the padding for this range.
3788 if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
3792 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
// Non-copyable: holds references to caller-owned Mats.
3802 const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
// ocl_bilateralFilter_8u: OpenCL dispatch for the 8-bit bilateral filter.
// Precomputes the in-circle spatial taps (weights + byte offsets into the
// radius-padded temp image) on the host, uploads them, and runs the
// "bilateral" kernel (or a float4, 4-pixels-per-item variant on Intel GPUs).
// Returns false when the device/type combination is unsupported.
3808 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
3809 double sigma_color, double sigma_space,
// Disabled on NVidia OpenCL (elided return presumably follows).
3813 if (ocl::Device::getDefault().isNVidia())
3817 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3818 int i, j, maxk, radius;
3820 if (depth != CV_8U || cn > 4)
// Non-positive sigmas fall back to defaults (elided assignments).
3823 if (sigma_color <= 0)
3825 if (sigma_space <= 0)
3828 double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
3829 double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
// d <= 0: derive the radius from sigma_space; otherwise radius = d/2.
3832 radius = cvRound(sigma_space * 1.5);
3835 radius = MAX(radius, 1);
3838 UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
3842 copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
3843 std::vector<float> _space_weight(d * d);
3844 std::vector<int> _space_ofs(d * d);
3845 float * const space_weight = &_space_weight[0];
3846 int * const space_ofs = &_space_ofs[0];
3848 // initialize space-related bilateral filter coefficients
// Only taps inside the radius circle are kept (maxk of them); offsets
// are element offsets into the padded temp image.
3849 for( i = -radius, maxk = 0; i <= radius; i++ )
3850 for( j = -radius; j <= radius; j++ )
3852 double r = std::sqrt((double)i * i + (double)j * j);
3855 space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
3856 space_ofs[maxk++] = (int)(i * temp.step + j * cn);
3860 String cnstr = cn > 1 ? format("%d", cn) : "";
3861 String kernelName("bilateral");
3863 if ((ocl::Device::getDefault().isIntel()) &&
3864 (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
// Intel GPU, single channel, /4-divisible width: 4 pixels per item
// (sizeDiv is presumably set to 4 on an elided line; verify).
3867 if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
3869 kernelName = "bilateral_float4";
3873 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
3874 format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
3875 " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
3876 radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
3877 ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
3878 ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
3879 ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
3880 ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
// Upload the host-side tap tables to the device.
3884 Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
3885 Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
3886 UMat ucolor_weight, uspace_weight, uspace_ofs;
3888 mspace_weight.copyTo(uspace_weight);
3889 mspace_ofs.copyTo(uspace_ofs);
3891 k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
3892 ocl::KernelArg::PtrReadOnly(uspace_weight),
3893 ocl::KernelArg::PtrReadOnly(uspace_ofs));
3895 size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
3896 return k.run(2, globalsize, NULL, false);
// CPU implementation of bilateral filtering for 8-bit images (the return
// type and 'static' qualifier are on a line not visible in this chunk).
// NOTE(review): sparse listing — braces, the Mat 'temp'/'ok' declarations,
// and parts of the radius/d computation sit on hidden lines.
3901 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
3902 double sigma_color, double sigma_space,
3905 int cn = src.channels();
3906 int i, j, maxk, radius;
3907 Size size = src.size();
// Only 1- or 3-channel 8-bit inputs; in-place operation is not supported.
3909 CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
// Non-positive sigmas are replaced by a default (presumably 1) on the
// hidden lines.
3911 if( sigma_color <= 0 )
3913 if( sigma_space <= 0 )
// exp(-d^2/(2*sigma^2)) evaluated as exp(d^2 * coeff).
3916 double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
3917 double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
// Radius from the aperture d, or derived from sigma_space when d <= 0.
3920 radius = cvRound(sigma_space*1.5);
3923 radius = MAX(radius, 1);
// Padded working copy so the kernel window never leaves the image.
3927 copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
// Optional IPP fast path; IPP_DISABLE_BLOCK keeps it compiled out here.
3929 #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
3935 IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
3936 parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
3939 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
3942 setIppErrorStatus();
// Lookup tables: color weights for every possible absolute channel-sum
// difference (cn*256 entries) plus spatial weights/offsets for the window.
3947 std::vector<float> _color_weight(cn*256);
3948 std::vector<float> _space_weight(d*d);
3949 std::vector<int> _space_ofs(d*d);
3950 float* color_weight = &_color_weight[0];
3951 float* space_weight = &_space_weight[0];
3952 int* space_ofs = &_space_ofs[0];
3954 // initialize color-related bilateral filter coefficients
3956 for( i = 0; i < 256*cn; i++ )
3957 color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
3959 // initialize space-related bilateral filter coefficients
3960 for( i = -radius, maxk = 0; i <= radius; i++ )
// j is initialized on a hidden line (presumably j = -radius, with offsets
// outside the circular radius skipped — TODO confirm).
3964 for( ; j <= radius; j++ )
3966 double r = std::sqrt((double)i*i + (double)j*j);
3969 space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
// Element offset into the padded image for window position (i, j).
3970 space_ofs[maxk++] = (int)(i*temp.step + j*cn);
// Row-parallel execution; the per-64K-pixel grain hints the scheduler.
3974 BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
3975 parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
// Parallel row-range body for the 32-bit float bilateral filter.
// Color weights come from a shared exp lookup table (expLUT) addressed by
// |intensity difference| * scale_index, with linear interpolation between
// adjacent LUT entries. SSE3 and NEON specializations process 4 window
// taps per iteration; a scalar loop handles the remainder.
// NOTE(review): sparse listing — braces, the 'int i, j, k' declarations,
// and several loop epilogues are on lines not visible in this chunk.
3979 class BilateralFilter_32f_Invoker :
3980 public ParallelLoopBody
// Stores raw pointers to caller-owned tables; the invoker does not own
// temp/dest/space_ofs/space_weight/expLUT.
3984 BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
3985 const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
3986 cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
3987 temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
// Processes destination rows [range.start, range.end).
3991 virtual void operator() (const Range& range) const
3994 Size size = dest->size();
3995 #if CV_SSE3 || CV_NEON
// Scratch buffers for moving data between vector registers and scalar
// LUT lookups; bufSignMask clears the float sign bit (i.e. fabs).
3996 int CV_DECL_ALIGNED(16) idxBuf[4];
3997 float CV_DECL_ALIGNED(16) bufSum32[4];
3998 static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
4001 bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
4003 bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
4006 for( i = range.start; i < range.end; i++ )
// sptr points at the first non-border pixel of the padded row that maps
// to destination row i.
4008 const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
4009 float* dptr = dest->ptr<float>(i);
// ---- single-channel branch (cn == 1) ----
4013 for( j = 0; j < size.width; j++ )
4015 float sum = 0, wsum = 0;
4016 float val0 = sptr[j];
// SSE3 path: 4 taps per iteration, weights w = space_weight *
// lerp(expLUT[idx], expLUT[idx+1], frac(alpha)).
4021 __m128 psum = _mm_setzero_ps();
4022 const __m128 _val0 = _mm_set1_ps(sptr[j]);
4023 const __m128 _scale_index = _mm_set1_ps(scale_index);
4024 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
4026 for( ; k <= maxk - 4 ; k += 4 )
4028 __m128 _sw = _mm_loadu_ps(space_weight + k);
4029 __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
4030 sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
// andnot with the sign mask = |val - val0|; scaled to LUT index space.
4031 __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
4033 __m128i _idx = _mm_cvtps_epi32(_alpha);
4034 _mm_store_si128((__m128i*)idxBuf, _idx);
// Fractional part used to interpolate between LUT entries.
4035 _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
4037 __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
4038 expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
4039 __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
4040 expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
4042 __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
4043 _val = _mm_mul_ps(_w, _val);
// Horizontal adds fold the lane sums so psum[0]=wsum, psum[1]=sum.
4045 _sw = _mm_hadd_ps(_w, _val);
4046 _sw = _mm_hadd_ps(_sw, _sw);
4047 psum = _mm_add_ps(_sw, psum);
4049 _mm_storel_pi((__m64*)bufSum32, psum);
// NEON path — same algorithm as the SSE3 branch above.
4057 float32x2_t psum = vdup_n_f32(0.0f);
// NOTE(review): 'volatile' on a NEON vector is unusual — presumably a
// workaround for a compiler optimization bug; confirm before touching.
4058 const volatile float32x4_t _val0 = vdupq_n_f32(sptr[j]);
4059 const float32x4_t _scale_index = vdupq_n_f32(scale_index);
4060 const uint32x4_t _signMask = vld1q_u32(bufSignMask);
4062 for( ; k <= maxk - 4 ; k += 4 )
4064 float32x4_t _sw = vld1q_f32(space_weight + k);
4065 float CV_DECL_ALIGNED(16) _data[] = {sptr[j + space_ofs[k]], sptr[j + space_ofs[k+1]],
4066 sptr[j + space_ofs[k+2]], sptr[j + space_ofs[k+3]],};
4067 float32x4_t _val = vld1q_f32(_data);
4068 float32x4_t _alpha = vsubq_f32(_val, _val0);
// vbic with the sign mask clears the sign bit, i.e. fabs.
4069 _alpha = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(_alpha), _signMask));
4070 _alpha = vmulq_f32(_alpha, _scale_index);
4071 int32x4_t _idx = vcvtq_s32_f32(_alpha);
4072 vst1q_s32(idxBuf, _idx);
4073 _alpha = vsubq_f32(_alpha, vcvtq_f32_s32(_idx));
// Gather LUT entries through the scalar buffer (no NEON gather).
4075 bufSum32[0] = expLUT[idxBuf[0]];
4076 bufSum32[1] = expLUT[idxBuf[1]];
4077 bufSum32[2] = expLUT[idxBuf[2]];
4078 bufSum32[3] = expLUT[idxBuf[3]];
4079 float32x4_t _explut = vld1q_f32(bufSum32);
4080 bufSum32[0] = expLUT[idxBuf[0]+1];
4081 bufSum32[1] = expLUT[idxBuf[1]+1];
4082 bufSum32[2] = expLUT[idxBuf[2]+1];
4083 bufSum32[3] = expLUT[idxBuf[3]+1];
4084 float32x4_t _explut1 = vld1q_f32(bufSum32);
4086 float32x4_t _w = vmulq_f32(_sw, vaddq_f32(_explut, vmulq_f32(_alpha, vsubq_f32(_explut1, _explut))));
4087 _val = vmulq_f32(_w, _val);
// Pairwise adds fold lanes: _wval[0] = sum of weights, _wval[1] = sum
// of weighted values.
4089 float32x2_t _wval = vpadd_f32(vpadd_f32(vget_low_f32(_w),vget_high_f32(_w)), vpadd_f32(vget_low_f32(_val), vget_high_f32(_val)));
4090 psum = vadd_f32(_wval, psum);
4092 sum = vget_lane_f32(psum, 1);
4093 wsum = vget_lane_f32(psum, 0);
// Scalar tail for the remaining (maxk % 4) taps.
4097 for( ; k < maxk; k++ )
4099 float val = sptr[j + space_ofs[k]];
4100 float alpha = (float)(std::abs(val - val0)*scale_index);
4101 int idx = cvFloor(alpha);
// alpha's fractional part is taken on a hidden line before this use.
4103 float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
// Normalize by total weight (wsum > 0 since the center tap contributes).
4107 dptr[j] = (float)(sum/wsum);
// ---- three-channel branch (cn == 3); interleaved b,g,r triples ----
4112 CV_Assert( cn == 3 );
4113 for( j = 0; j < size.width*3; j += 3 )
4115 float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
4116 float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
// SSE3 path: load 4 consecutive taps' bgr(+pad) and transpose to get
// per-channel vectors.
4121 __m128 sum = _mm_setzero_ps();
4122 const __m128 _b0 = _mm_set1_ps(b0);
4123 const __m128 _g0 = _mm_set1_ps(g0);
4124 const __m128 _r0 = _mm_set1_ps(r0);
4125 const __m128 _scale_index = _mm_set1_ps(scale_index);
4126 const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
4128 for( ; k <= maxk-4; k += 4 )
4130 __m128 _sw = _mm_loadu_ps(space_weight + k);
4132 const float* const sptr_k0 = sptr + j + space_ofs[k];
4133 const float* const sptr_k1 = sptr + j + space_ofs[k+1];
4134 const float* const sptr_k2 = sptr + j + space_ofs[k+2];
4135 const float* const sptr_k3 = sptr + j + space_ofs[k+3];
4137 __m128 _b = _mm_loadu_ps(sptr_k0);
4138 __m128 _g = _mm_loadu_ps(sptr_k1);
4139 __m128 _r = _mm_loadu_ps(sptr_k2);
4140 __m128 _z = _mm_loadu_ps(sptr_k3);
// After the transpose, _b/_g/_r hold the blue/green/red values of the
// four taps; _z holds the 4th float of each load and is discarded.
4141 _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
// L1 color distance: |db| + |dg| + |dr|, scaled to LUT index space.
4143 __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
4144 __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
4145 __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
4147 __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
4149 __m128i _idx = _mm_cvtps_epi32(_alpha);
4150 _mm_store_si128((__m128i*)idxBuf, _idx);
4151 _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
4153 __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
4154 __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
4156 __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
4158 _b = _mm_mul_ps(_b, _w);
4159 _g = _mm_mul_ps(_g, _w);
4160 _r = _mm_mul_ps(_r, _w);
// hadd cascade folds lanes so sum = {wsum, sum_b, sum_g, sum_r}.
4162 _w = _mm_hadd_ps(_w, _b);
4163 _g = _mm_hadd_ps(_g, _r);
4165 _w = _mm_hadd_ps(_w, _g);
4166 sum = _mm_add_ps(sum, _w);
4168 _mm_store_ps(bufSum32, sum);
4170 sum_b = bufSum32[1];
4171 sum_g = bufSum32[2];
4172 sum_r = bufSum32[3];
// NEON path — same algorithm; vtrnq/vcombine perform the 4x4 transpose.
4177 float32x4_t sum = vdupq_n_f32(0.0f);
4178 const float32x4_t _b0 = vdupq_n_f32(b0);
4179 const float32x4_t _g0 = vdupq_n_f32(g0);
4180 const float32x4_t _r0 = vdupq_n_f32(r0);
4181 const float32x4_t _scale_index = vdupq_n_f32(scale_index);
4182 const uint32x4_t _signMask = vld1q_u32(bufSignMask);
4184 for( ; k <= maxk-4; k += 4 )
4186 float32x4_t _sw = vld1q_f32(space_weight + k);
4188 const float* const sptr_k0 = sptr + j + space_ofs[k];
4189 const float* const sptr_k1 = sptr + j + space_ofs[k+1];
4190 const float* const sptr_k2 = sptr + j + space_ofs[k+2];
4191 const float* const sptr_k3 = sptr + j + space_ofs[k+3];
4193 float32x4_t _v0 = vld1q_f32(sptr_k0);
4194 float32x4_t _v1 = vld1q_f32(sptr_k1);
4195 float32x4_t _v2 = vld1q_f32(sptr_k2);
4196 float32x4_t _v3 = vld1q_f32(sptr_k3);
4198 float32x4x2_t v01 = vtrnq_f32(_v0, _v1);
4199 float32x4x2_t v23 = vtrnq_f32(_v2, _v3);
4200 float32x4_t _b = vcombine_f32(vget_low_f32(v01.val[0]), vget_low_f32(v23.val[0]));
4201 float32x4_t _g = vcombine_f32(vget_low_f32(v01.val[1]), vget_low_f32(v23.val[1]));
4202 float32x4_t _r = vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0]));
4204 float32x4_t _bt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_b, _b0)), _signMask));
4205 float32x4_t _gt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_g, _g0)), _signMask));
4206 float32x4_t _rt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_r, _r0)), _signMask));
4207 float32x4_t _alpha = vmulq_f32(_scale_index, vaddq_f32(_bt, vaddq_f32(_gt, _rt)));
4209 int32x4_t _idx = vcvtq_s32_f32(_alpha);
4210 vst1q_s32((int*)idxBuf, _idx);
// NOTE(review): the fractional 'alpha' subtraction visible in the other
// branches is not on a visible line here — confirm in the full file.
4211 bufSum32[0] = expLUT[idxBuf[0]];
4212 bufSum32[1] = expLUT[idxBuf[1]];
4213 bufSum32[2] = expLUT[idxBuf[2]];
4214 bufSum32[3] = expLUT[idxBuf[3]];
4215 float32x4_t _explut = vld1q_f32(bufSum32);
4216 bufSum32[0] = expLUT[idxBuf[0]+1];
4217 bufSum32[1] = expLUT[idxBuf[1]+1];
4218 bufSum32[2] = expLUT[idxBuf[2]+1];
4219 bufSum32[3] = expLUT[idxBuf[3]+1];
4220 float32x4_t _explut1 = vld1q_f32(bufSum32);
4222 float32x4_t _w = vmulq_f32(_sw, vaddq_f32(_explut, vmulq_f32(_alpha, vsubq_f32(_explut1, _explut))));
4224 _b = vmulq_f32(_b, _w);
4225 _g = vmulq_f32(_g, _w);
4226 _r = vmulq_f32(_r, _w);
4228 float32x2_t _wb = vpadd_f32(vpadd_f32(vget_low_f32(_w),vget_high_f32(_w)), vpadd_f32(vget_low_f32(_b), vget_high_f32(_b)));
4229 float32x2_t _gr = vpadd_f32(vpadd_f32(vget_low_f32(_g),vget_high_f32(_g)), vpadd_f32(vget_low_f32(_r), vget_high_f32(_r)));
4231 _w = vcombine_f32(_wb, _gr);
4232 sum = vaddq_f32(sum, _w);
4234 vst1q_f32(bufSum32, sum);
4236 sum_b = bufSum32[1];
4237 sum_g = bufSum32[2];
4238 sum_r = bufSum32[3];
// Scalar tail for the remaining taps of the 3-channel case.
4242 for(; k < maxk; k++ )
4244 const float* sptr_k = sptr + j + space_ofs[k];
4245 float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
4246 float alpha = (float)((std::abs(b - b0) +
4247 std::abs(g - g0) + std::abs(r - r0))*scale_index);
4248 int idx = cvFloor(alpha);
4250 float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
4251 sum_b += b*w; sum_g += g*w; sum_r += r*w;
// b0/g0/r0 are reassigned to the normalized sums on hidden lines before
// this store — TODO confirm against the full file.
4258 dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
// Non-owning pointers/tables supplied by bilateralFilter_32f.
4265 int cn, radius, maxk, *space_ofs;
4268 float scale_index, *space_weight, *expLUT;
// CPU implementation of bilateral filtering for 32-bit float images (the
// return type/'static' qualifier is on a line not visible in this chunk).
// Builds an exp lookup table sized from the actual intensity range of the
// input, then runs BilateralFilter_32f_Invoker over the rows in parallel.
4273 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
4274 double sigma_color, double sigma_space,
4277 int cn = src.channels();
4278 int i, j, maxk, radius;
4279 double minValSrc=-1, maxValSrc=1;
// 2^12 LUT bins per channel — the table resolution for the color weights.
4280 const int kExpNumBinsPerChannel = 1 << 12;
4281 int kExpNumBins = 0;
4282 float lastExpVal = 1.f;
4283 float len, scale_index;
4284 Size size = src.size();
// Only 1- or 3-channel float inputs; in-place operation is not supported.
4286 CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
// Non-positive sigmas are replaced by defaults on hidden lines.
4288 if( sigma_color <= 0 )
4290 if( sigma_space <= 0 )
4293 double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
4294 double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
// Radius from aperture d, or from sigma_space when d is not given.
4297 radius = cvRound(sigma_space*1.5);
4300 radius = MAX(radius, 1);
4302 // compute the min/max range for the input image (even if multichannel)
4304 minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
// A constant image needs no filtering (the early-out body is on hidden
// lines — presumably src is copied to dst there).
4305 if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
4311 // temporary copy of the image with borders for easy processing
4313 copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
4314 const double insteadNaNValue = -5. * sigma_color;
4315 patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
4316 // TODO: make insteadNaNValue avalible in the outside function interface to control the cases breaking the assumption
4317 // allocate lookup tables
4318 std::vector<float> _space_weight(d*d);
4319 std::vector<int> _space_ofs(d*d);
4320 float* space_weight = &_space_weight[0];
4321 int* space_ofs = &_space_ofs[0];
4323 // assign a length which is slightly more than needed
// Worst-case L1 color distance across all channels.
4324 len = (float)(maxValSrc - minValSrc) * cn;
4325 kExpNumBins = kExpNumBinsPerChannel * cn;
// +2 entries so LUT[idx+1] reads during interpolation stay in bounds.
4326 std::vector<float> _expLUT(kExpNumBins+2);
4327 float* expLUT = &_expLUT[0];
// Maps an intensity distance to a (fractional) LUT index.
4329 scale_index = kExpNumBins/len;
4331 // initialize the exp LUT
4332 for( i = 0; i < kExpNumBins+2; i++ )
// Once the Gaussian underflows to 0 the tail stays 0 (else-branch on
// hidden lines presumably writes 0 — TODO confirm).
4334 if( lastExpVal > 0.f )
4336 double val = i / scale_index;
4337 expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
4338 lastExpVal = expLUT[i];
4344 // initialize space-related bilateral filter coefficients
4345 for( i = -radius, maxk = 0; i <= radius; i++ )
4346 for( j = -radius; j <= radius; j++ )
// Offsets outside the circular radius are skipped on a hidden line.
4348 double r = std::sqrt((double)i*i + (double)j*j);
4351 space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
// Offset in float elements (temp.step is in bytes, hence /sizeof(float)).
4352 space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
4355 // parallel_for usage
4357 BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
4358 parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
// Public entry point: dispatches to the OpenCL path for UMat destinations,
// otherwise to the 8-bit or float CPU implementation by source depth.
// (The trailing 'int borderType' parameter is on a hidden line.)
4363 void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
4364 double sigmaColor, double sigmaSpace,
4367 CV_INSTRUMENT_REGION()
// Output has the same size and type as the input.
4369 _dst.create( _src.size(), _src.type() );
// Try the OpenCL implementation first; falls through to CPU on failure.
4371 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4372 ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
4374 Mat src = _src.getMat(), dst = _dst.getMat();
4376 if( src.depth() == CV_8U )
4377 bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
4378 else if( src.depth() == CV_32F )
4379 bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
// Any other depth is unsupported (the 'else' keyword is on a hidden line).
4381 CV_Error( CV_StsUnsupportedFormat,
4382 "Bilateral filtering is only implemented for 8u and 32f images" );
4385 //////////////////////////////////////////////////////////////////////////////////////////
4388 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
4389 int param1, int param2, double param3, double param4 )
4391 cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
4393 CV_Assert( dst.size() == src.size() &&
4394 (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );
4399 if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
4400 cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1),
4401 smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
4402 else if( smooth_type == CV_GAUSSIAN )
4403 cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
4404 else if( smooth_type == CV_MEDIAN )
4405 cv::medianBlur( src, dst, param1 );
4407 cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
4409 if( dst.data != dst0.data )
4410 CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );