// 5x5 gaussian blur optimization
// [platform/upstream/opencv.git] / modules / imgproc / src / smooth.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 #include "precomp.hpp"
45 #include "opencl_kernels_imgproc.hpp"
46
47 #ifdef HAVE_OPENVX
48 #define IVX_HIDE_INFO_WARNINGS
49 #define IVX_USE_OPENCV
50 #include "ivx.hpp"
51 #endif
52 /*
53  * This file includes the code, contributed by Simon Perreault
54  * (the function icvMedianBlur_8u_O1)
55  *
56  * Constant-time median filtering -- http://nomis80.org/ctmf.html
57  * Copyright (C) 2006 Simon Perreault
58  *
59  * Contact:
60  *  Laboratoire de vision et systemes numeriques
61  *  Pavillon Adrien-Pouliot
62  *  Universite Laval
63  *  Sainte-Foy, Quebec, Canada
64  *  G1K 7P4
65  *
66  *  perreaul@gel.ulaval.ca
67  */
68
69 namespace cv
70 {
71
72 /****************************************************************************************\
73                                          Box Filter
74 \****************************************************************************************/
75
76 template<typename T, typename ST>
77 struct RowSum :
78         public BaseRowFilter
79 {
80     RowSum( int _ksize, int _anchor ) :
81         BaseRowFilter()
82     {
83         ksize = _ksize;
84         anchor = _anchor;
85     }
86
87     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
88     {
89         const T* S = (const T*)src;
90         ST* D = (ST*)dst;
91         int i = 0, k, ksz_cn = ksize*cn;
92
93         width = (width - 1)*cn;
94         if( ksize == 3 )
95         {
96             for( i = 0; i < width + cn; i++ )
97             {
98                 D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2];
99             }
100         }
101         else if( ksize == 5 )
102         {
103             for( i = 0; i < width + cn; i++ )
104             {
105                 D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4];
106             }
107         }
108         else if( cn == 1 )
109         {
110             ST s = 0;
111             for( i = 0; i < ksz_cn; i++ )
112                 s += (ST)S[i];
113             D[0] = s;
114             for( i = 0; i < width; i++ )
115             {
116                 s += (ST)S[i + ksz_cn] - (ST)S[i];
117                 D[i+1] = s;
118             }
119         }
120         else if( cn == 3 )
121         {
122             ST s0 = 0, s1 = 0, s2 = 0;
123             for( i = 0; i < ksz_cn; i += 3 )
124             {
125                 s0 += (ST)S[i];
126                 s1 += (ST)S[i+1];
127                 s2 += (ST)S[i+2];
128             }
129             D[0] = s0;
130             D[1] = s1;
131             D[2] = s2;
132             for( i = 0; i < width; i += 3 )
133             {
134                 s0 += (ST)S[i + ksz_cn] - (ST)S[i];
135                 s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
136                 s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
137                 D[i+3] = s0;
138                 D[i+4] = s1;
139                 D[i+5] = s2;
140             }
141         }
142         else if( cn == 4 )
143         {
144             ST s0 = 0, s1 = 0, s2 = 0, s3 = 0;
145             for( i = 0; i < ksz_cn; i += 4 )
146             {
147                 s0 += (ST)S[i];
148                 s1 += (ST)S[i+1];
149                 s2 += (ST)S[i+2];
150                 s3 += (ST)S[i+3];
151             }
152             D[0] = s0;
153             D[1] = s1;
154             D[2] = s2;
155             D[3] = s3;
156             for( i = 0; i < width; i += 4 )
157             {
158                 s0 += (ST)S[i + ksz_cn] - (ST)S[i];
159                 s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
160                 s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
161                 s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3];
162                 D[i+4] = s0;
163                 D[i+5] = s1;
164                 D[i+6] = s2;
165                 D[i+7] = s3;
166             }
167         }
168         else
169             for( k = 0; k < cn; k++, S++, D++ )
170             {
171                 ST s = 0;
172                 for( i = 0; i < ksz_cn; i += cn )
173                     s += (ST)S[i];
174                 D[0] = s;
175                 for( i = 0; i < width; i += cn )
176                 {
177                     s += (ST)S[i + ksz_cn] - (ST)S[i];
178                     D[i+cn] = s;
179                 }
180             }
181     }
182 };
183
184
185 template<typename ST, typename T>
186 struct ColumnSum :
187         public BaseColumnFilter
188 {
189     ColumnSum( int _ksize, int _anchor, double _scale ) :
190         BaseColumnFilter()
191     {
192         ksize = _ksize;
193         anchor = _anchor;
194         scale = _scale;
195         sumCount = 0;
196     }
197
198     virtual void reset() { sumCount = 0; }
199
200     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
201     {
202         int i;
203         ST* SUM;
204         bool haveScale = scale != 1;
205         double _scale = scale;
206
207         if( width != (int)sum.size() )
208         {
209             sum.resize(width);
210             sumCount = 0;
211         }
212
213         SUM = &sum[0];
214         if( sumCount == 0 )
215         {
216             memset((void*)SUM, 0, width*sizeof(ST));
217
218             for( ; sumCount < ksize - 1; sumCount++, src++ )
219             {
220                 const ST* Sp = (const ST*)src[0];
221
222                 for( i = 0; i < width; i++ )
223                     SUM[i] += Sp[i];
224             }
225         }
226         else
227         {
228             CV_Assert( sumCount == ksize-1 );
229             src += ksize-1;
230         }
231
232         for( ; count--; src++ )
233         {
234             const ST* Sp = (const ST*)src[0];
235             const ST* Sm = (const ST*)src[1-ksize];
236             T* D = (T*)dst;
237             if( haveScale )
238             {
239                 for( i = 0; i <= width - 2; i += 2 )
240                 {
241                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
242                     D[i] = saturate_cast<T>(s0*_scale);
243                     D[i+1] = saturate_cast<T>(s1*_scale);
244                     s0 -= Sm[i]; s1 -= Sm[i+1];
245                     SUM[i] = s0; SUM[i+1] = s1;
246                 }
247
248                 for( ; i < width; i++ )
249                 {
250                     ST s0 = SUM[i] + Sp[i];
251                     D[i] = saturate_cast<T>(s0*_scale);
252                     SUM[i] = s0 - Sm[i];
253                 }
254             }
255             else
256             {
257                 for( i = 0; i <= width - 2; i += 2 )
258                 {
259                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
260                     D[i] = saturate_cast<T>(s0);
261                     D[i+1] = saturate_cast<T>(s1);
262                     s0 -= Sm[i]; s1 -= Sm[i+1];
263                     SUM[i] = s0; SUM[i+1] = s1;
264                 }
265
266                 for( ; i < width; i++ )
267                 {
268                     ST s0 = SUM[i] + Sp[i];
269                     D[i] = saturate_cast<T>(s0);
270                     SUM[i] = s0 - Sm[i];
271                 }
272             }
273             dst += dststep;
274         }
275     }
276
277     double scale;
278     int sumCount;
279     std::vector<ST> sum;
280 };
281
282
// Specialization of the vertical moving-sum filter for int accumulators and
// uchar output, with SSE2/NEON fast paths. Same algorithm as the generic
// ColumnSum: keep a running per-column sum of `ksize` rows, optionally scale,
// saturate to uchar.
template<>
struct ColumnSum<int, uchar> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    // Discard accumulated state so the next call re-primes the window.
    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

        #if CV_SSE2
            bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
        #elif CV_NEON
            bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
        #endif

        // (Re)allocate the per-column accumulator when the row width changes.
        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            // Prime the accumulators with the first ksize-1 input rows
            // (vectorized 4 ints at a time where available, scalar tail after).
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width - 4; i+=4 )
                        vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
                }
                #endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            // Already primed: skip past the rows that are part of the window.
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        // Steady state: add the entering row (Sp), emit, subtract the
        // leaving row (Sm).
        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            uchar* D = (uchar*)dst;
            if( haveScale )
            {
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    // 8 columns per iteration: int sums -> float * scale ->
                    // rounded int -> packed (with saturation) down to uchar.
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                      _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));

                        _s0T = _mm_packs_epi32(_s0T, _s0T1);

                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    float32x4_t v_scale = vdupq_n_f32((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                        int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                        // cv_vrndq_u32_f32: float->u32 rounding helper defined elsewhere.
                        uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                        uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));

                        uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
                        vst1_u8(D + i, vqmovn_u16(v_dst));

                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                        vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                    }
                }
                #endif
                // Scalar tail (and scalar fallback when no SIMD is available).
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<uchar>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    // Unscaled path: pack the int sums straight down to uchar.
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                      _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T = _mm_packs_epi32(_s0, _s01);

                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width-8; i+=8 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                        int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                        uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
                        vst1_u8(D + i, vqmovn_u16(v_dst));

                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                        vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                    }
                }
                #endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<uchar>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;            // number of rows currently folded into `sum`
    std::vector<int> sum;    // one accumulator per column
};
467
468
// Specialization with 16-bit (ushort) accumulators and uchar output. Instead
// of a floating-point multiply, scaling is done with a fixed-point trick:
// D = ((s + divDelta) * divScale) >> 16, which approximates s * scale when
// scale == 1/d for an integer d (the constructor derives divScale/divDelta
// from round(1/scale)). Using 16-bit sums halves memory traffic vs. the
// int-accumulator version; callers must ensure the window sums fit in 16 bits
// (not checked here — presumably guaranteed by the filter setup; verify at
// call sites).
template<>
struct ColumnSum<ushort, uchar> :
public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
    BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
        divDelta = 0;
        divScale = 1;
        if( scale != 1 )
        {
            // Build the fixed-point divisor: divScale ~= 2^16/d, divDelta is
            // the pre-add rounding bias; the fractional remainder decides
            // whether to bump the bias or the multiplier.
            int d = cvRound(1./scale);
            double scalef = ((double)(1 << 16))/d;
            divScale = cvFloor(scalef);
            scalef -= divScale;
            divDelta = d/2;
            if( scalef < 0.5 )
                divDelta++;
            else
                divScale++;
        }
    }

    // Discard accumulated state so the next call re-primes the window.
    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        const int ds = divScale;
        const int dd = divDelta;
        ushort* SUM;
        const bool haveScale = scale != 1;

#if CV_SSE2
        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#elif CV_NEON
        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
#endif

        // (Re)allocate the per-column accumulator when the row width changes.
        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            // Prime the accumulators with the first ksize-1 input rows
            // (8 ushorts per SIMD iteration, scalar tail after).
            memset((void*)SUM, 0, width*sizeof(SUM[0]));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const ushort* Sp = (const ushort*)src[0];
                int i = 0;
#if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi16(_sum, _sp));
                    }
                }
#elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width - 8; i+=8 )
                        vst1q_u16(SUM + i, vaddq_u16(vld1q_u16(SUM + i), vld1q_u16(Sp + i)));
                }
#endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            // Already primed: skip past the rows that are part of the window.
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        // Steady state: add the entering row (Sp), emit, subtract the
        // leaving row (Sm).
        for( ; count--; src++ )
        {
            const ushort* Sp = (const ushort*)src[0];
            const ushort* Sm = (const ushort*)src[1-ksize];
            uchar* D = (uchar*)dst;
            if( haveScale )
            {
                int i = 0;
    #if CV_SSE2
                if(haveSSE2)
                {
                    // 16 columns per iteration. _mm_mulhi_epu16 keeps the high
                    // 16 bits of the product, i.e. the >>16 of the scalar path.
                    __m128i ds8 = _mm_set1_epi16((short)ds);
                    __m128i dd8 = _mm_set1_epi16((short)dd);

                    for( ; i <= width-16; i+=16 )
                    {
                        __m128i _sm0  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+8));

                        __m128i _s0  = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s1  = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i+8)));
                        __m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8);
                        __m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8);
                        _s0 = _mm_sub_epi16(_s0, _sm0);
                        _s1 = _mm_sub_epi16(_s1, _sm1);
                        _mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3));
                        _mm_storeu_si128((__m128i*)(SUM+i), _s0);
                        _mm_storeu_si128((__m128i*)(SUM+i+8), _s1);
                    }
                }
    #endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    // Fixed-point divide: (s0 + dd)*ds >> 16 ~= s0*scale.
                    D[i] = (uchar)((s0 + dd)*ds >> 16);
                    SUM[i] = (ushort)(s0 - Sm[i]);
                }
            }
            else
            {
                // Unscaled path (scalar only in this specialization).
                int i = 0;
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<uchar>(s0);
                    SUM[i] = (ushort)(s0 - Sm[i]);
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;              // number of rows currently folded into `sum`
    int divDelta;              // fixed-point rounding bias
    int divScale;              // fixed-point multiplier (~2^16 * scale)
    std::vector<ushort> sum;   // one 16-bit accumulator per column
};
612
613
// Specialization of the vertical moving-sum filter for int accumulators and
// short output, with SSE2/NEON fast paths. Same algorithm as the generic
// ColumnSum; the pack step saturates the 32-bit sums to signed 16-bit.
template<>
struct ColumnSum<int, short> :
        public BaseColumnFilter
{
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;
    }

    // Discard accumulated state so the next call re-primes the window.
    virtual void reset() { sumCount = 0; }

    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int i;
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

        #if CV_SSE2
            bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
        #elif CV_NEON
            bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
        #endif

        // (Re)allocate the per-column accumulator when the row width changes.
        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            // Prime the accumulators with the first ksize-1 input rows
            // (4 ints per SIMD iteration, scalar tail after).
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width - 4; i+=4 )
                        vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
                }
                #endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            // Already primed: skip past the rows that are part of the window.
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        // Steady state: add the entering row (Sp), emit, subtract the
        // leaving row (Sm).
        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];
            const int* Sm = (const int*)src[1-ksize];
            short* D = (short*)dst;
            if( haveScale )
            {
                i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    // 8 columns per iteration: int sums -> float * scale ->
                    // rounded int -> saturating pack to signed 16-bit.
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                      _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));

                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));

                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    float32x4_t v_scale = vdupq_n_f32((float)_scale);
                    for( ; i <= width-8; i+=8 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                        int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                        // cv_vrndq_s32_f32: float->s32 rounding helper defined elsewhere.
                        int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                        int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
                        vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));

                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                        vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                    }
                }
                #endif
                // Scalar tail (and scalar fallback when no SIMD is available).
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<short>(s0*_scale);
                    SUM[i] = s0 - Sm[i];
                }
            }
            else
            {
                i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    // Unscaled path: saturating pack of the sums to short.
                    for( ; i <= width-8; i+=8 )
                    {

                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
                        __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
                                                      _mm_loadu_si128((const __m128i*)(Sp+i+4)));

                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));

                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                        _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width-8; i+=8 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
                        int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));

                        vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));

                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                        vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
                    }
                }
                #endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<short>(s0);
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;
    int sumCount;            // number of rows currently folded into `sum`
    std::vector<int> sum;    // one accumulator per column
};
793
794
795 template<>
796 struct ColumnSum<int, ushort> :
797         public BaseColumnFilter
798 {
799     ColumnSum( int _ksize, int _anchor, double _scale ) :
800         BaseColumnFilter()
801     {
802         ksize = _ksize;
803         anchor = _anchor;
804         scale = _scale;
805         sumCount = 0;
806     }
807
808     virtual void reset() { sumCount = 0; }
809
810     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
811     {
812         int* SUM;
813         bool haveScale = scale != 1;
814         double _scale = scale;
815
816         #if CV_SSE2
817             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
818         #elif CV_NEON
819             bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
820         #endif
821
822         if( width != (int)sum.size() )
823         {
824             sum.resize(width);
825             sumCount = 0;
826         }
827
828         SUM = &sum[0];
829         if( sumCount == 0 )
830         {
831             memset((void*)SUM, 0, width*sizeof(int));
832             for( ; sumCount < ksize - 1; sumCount++, src++ )
833             {
834                 const int* Sp = (const int*)src[0];
835                 int i = 0;
836                 #if CV_SSE2
837                 if(haveSSE2)
838                 {
839                     for( ; i <= width-4; i+=4 )
840                     {
841                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
842                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
843                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
844                     }
845                 }
846                 #elif CV_NEON
847                 if(haveNEON)
848                 {
849                     for( ; i <= width - 4; i+=4 )
850                         vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
851                 }
852                 #endif
853                 for( ; i < width; i++ )
854                     SUM[i] += Sp[i];
855             }
856         }
857         else
858         {
859             CV_Assert( sumCount == ksize-1 );
860             src += ksize-1;
861         }
862
863         for( ; count--; src++ )
864         {
865             const int* Sp = (const int*)src[0];
866             const int* Sm = (const int*)src[1-ksize];
867             ushort* D = (ushort*)dst;
868             if( haveScale )
869             {
870                 int i = 0;
871                 #if CV_SSE2
872                 if(haveSSE2)
873                 {
874                     const __m128 scale4 = _mm_set1_ps((float)_scale);
875                     const __m128i delta0 = _mm_set1_epi32(0x8000);
876                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
877
878                     for( ; i < width-4; i+=4)
879                     {
880                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
881                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
882                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
883
884                         __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
885
886                         _res = _mm_sub_epi32(_res, delta0);
887                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
888
889                         _mm_storel_epi64((__m128i*)(D+i), _res);
890                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
891                     }
892                 }
893                 #elif CV_NEON
894                 if(haveNEON)
895                 {
896                     float32x4_t v_scale = vdupq_n_f32((float)_scale);
897                     for( ; i <= width-8; i+=8 )
898                     {
899                         int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
900                         int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
901
902                         uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
903                         uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
904                         vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
905
906                         vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
907                         vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
908                     }
909                 }
910                 #endif
911                 for( ; i < width; i++ )
912                 {
913                     int s0 = SUM[i] + Sp[i];
914                     D[i] = saturate_cast<ushort>(s0*_scale);
915                     SUM[i] = s0 - Sm[i];
916                 }
917             }
918             else
919             {
920                 int i = 0;
921                 #if CV_SSE2
922                 if(haveSSE2)
923                 {
924                     const __m128i delta0 = _mm_set1_epi32(0x8000);
925                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
926
927                     for( ; i < width-4; i+=4 )
928                     {
929                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
930                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
931                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
932
933                         __m128i _res = _mm_sub_epi32(_s0, delta0);
934                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
935
936                         _mm_storel_epi64((__m128i*)(D+i), _res);
937                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
938                     }
939                 }
940                 #elif CV_NEON
941                 if(haveNEON)
942                 {
943                     for( ; i <= width-8; i+=8 )
944                     {
945                         int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
946                         int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
947
948                         vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
949
950                         vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
951                         vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
952                     }
953                 }
954                 #endif
955
956                 for( ; i < width; i++ )
957                 {
958                     int s0 = SUM[i] + Sp[i];
959                     D[i] = saturate_cast<ushort>(s0);
960                     SUM[i] = s0 - Sm[i];
961                 }
962             }
963             dst += dststep;
964         }
965     }
966
967     double scale;
968     int sumCount;
969     std::vector<int> sum;
970 };
971
template<>
struct ColumnSum<int, int> :
        public BaseColumnFilter
{
    // Vertical pass of the box filter for 32-bit integer row sums producing
    // int output. Maintains a per-column running sum of the last `ksize` row
    // sums so each output row costs one add and one subtract per pixel.
    ColumnSum( int _ksize, int _anchor, double _scale ) :
        BaseColumnFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
        scale = _scale;
        sumCount = 0;       // number of rows currently accumulated in `sum`
    }

    // Invalidate the running sums (e.g. when the filter is restarted).
    virtual void reset() { sumCount = 0; }

    // src:    pointers to `count + ksize - 1` row-sum buffers (int per pixel)
    // dst:    output rows, `dststep` bytes apart
    // width:  number of columns to process
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;

        #if CV_SSE2
            bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
        #elif CV_NEON
            bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
        #endif

        if( width != (int)sum.size() )
        {
            sum.resize(width);
            sumCount = 0;
        }

        SUM = &sum[0];
        if( sumCount == 0 )
        {
            // Cold start: accumulate the first ksize-1 rows into SUM.
            memset((void*)SUM, 0, width*sizeof(int));
            for( ; sumCount < ksize - 1; sumCount++, src++ )
            {
                const int* Sp = (const int*)src[0];
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width - 4; i+=4 )
                        vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
                }
                #endif
                for( ; i < width; i++ )
                    SUM[i] += Sp[i];
            }
        }
        else
        {
            // Warm state: SUM already holds ksize-1 rows; skip over them.
            CV_Assert( sumCount == ksize-1 );
            src += ksize-1;
        }

        for( ; count--; src++ )
        {
            const int* Sp = (const int*)src[0];        // incoming (newest) row
            const int* Sm = (const int*)src[1-ksize];  // outgoing (oldest) row
            int* D = (int*)dst;
            if( haveScale )
            {
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    const __m128 scale4 = _mm_set1_ps((float)_scale);
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));

                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));

                        // Scale in single precision, round to nearest.
                        __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));

                        _mm_storeu_si128((__m128i*)(D+i), _s0T);
                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    float32x4_t v_scale = vdupq_n_f32((float)_scale);
                    for( ; i <= width-4; i+=4 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));

                        int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
                        vst1q_s32(D + i, v_s0d);

                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    }
                }
                #endif
                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = saturate_cast<int>(s0*_scale);
                    SUM[i] = s0 - Sm[i];  // slide the window: drop the oldest row
                }
            }
            else
            {
                int i = 0;
                #if CV_SSE2
                if(haveSSE2)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));

                        _mm_storeu_si128((__m128i*)(D+i), _s0);
                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
                    }
                }
                #elif CV_NEON
                if(haveNEON)
                {
                    for( ; i <= width-4; i+=4 )
                    {
                        int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));

                        vst1q_s32(D + i, v_s0);
                        vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
                    }
                }
                #endif

                for( ; i < width; i++ )
                {
                    int s0 = SUM[i] + Sp[i];
                    D[i] = s0;
                    SUM[i] = s0 - Sm[i];
                }
            }
            dst += dststep;
        }
    }

    double scale;          // normalization factor (1 => no scaling)
    int sumCount;          // rows currently accumulated in `sum`
    std::vector<int> sum;  // per-column running sums
};
1131
1132
1133 template<>
1134 struct ColumnSum<int, float> :
1135         public BaseColumnFilter
1136 {
1137     ColumnSum( int _ksize, int _anchor, double _scale ) :
1138         BaseColumnFilter()
1139     {
1140         ksize = _ksize;
1141         anchor = _anchor;
1142         scale = _scale;
1143         sumCount = 0;
1144     }
1145
1146     virtual void reset() { sumCount = 0; }
1147
1148     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
1149     {
1150         int* SUM;
1151         bool haveScale = scale != 1;
1152         double _scale = scale;
1153
1154         #if CV_SSE2
1155             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1156         #elif CV_NEON
1157             bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
1158         #endif
1159
1160         if( width != (int)sum.size() )
1161         {
1162             sum.resize(width);
1163             sumCount = 0;
1164         }
1165
1166         SUM = &sum[0];
1167         if( sumCount == 0 )
1168         {
1169             memset((void*)SUM, 0, width*sizeof(int));
1170             for( ; sumCount < ksize - 1; sumCount++, src++ )
1171             {
1172                 const int* Sp = (const int*)src[0];
1173                 int i = 0;
1174                 #if CV_SSE2
1175                 if(haveSSE2)
1176                 {
1177                     for( ; i <= width-4; i+=4 )
1178                     {
1179                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
1180                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
1181                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
1182                     }
1183                 }
1184                 #elif CV_NEON
1185                 if(haveNEON)
1186                 {
1187                     for( ; i <= width - 4; i+=4 )
1188                         vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
1189                 }
1190                 #endif
1191
1192                 for( ; i < width; i++ )
1193                     SUM[i] += Sp[i];
1194             }
1195         }
1196         else
1197         {
1198             CV_Assert( sumCount == ksize-1 );
1199             src += ksize-1;
1200         }
1201
1202         for( ; count--; src++ )
1203         {
1204             const int * Sp = (const int*)src[0];
1205             const int * Sm = (const int*)src[1-ksize];
1206             float* D = (float*)dst;
1207             if( haveScale )
1208             {
1209                 int i = 0;
1210
1211                 #if CV_SSE2
1212                 if(haveSSE2)
1213                 {
1214                     const __m128 scale4 = _mm_set1_ps((float)_scale);
1215
1216                     for( ; i < width-4; i+=4)
1217                     {
1218                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
1219                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1220                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
1221
1222                         _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
1223                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
1224                     }
1225                 }
1226                 #elif CV_NEON
1227                 if(haveNEON)
1228                 {
1229                     float32x4_t v_scale = vdupq_n_f32((float)_scale);
1230                     for( ; i <= width-8; i+=8 )
1231                     {
1232                         int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1233                         int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
1234
1235                         vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
1236                         vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
1237
1238                         vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
1239                         vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
1240                     }
1241                 }
1242                 #endif
1243
1244                 for( ; i < width; i++ )
1245                 {
1246                     int s0 = SUM[i] + Sp[i];
1247                     D[i] = (float)(s0*_scale);
1248                     SUM[i] = s0 - Sm[i];
1249                 }
1250             }
1251             else
1252             {
1253                 int i = 0;
1254
1255                 #if CV_SSE2
1256                 if(haveSSE2)
1257                 {
1258                     for( ; i < width-4; i+=4)
1259                     {
1260                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
1261                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
1262                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
1263
1264                         _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
1265                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
1266                     }
1267                 }
1268                 #elif CV_NEON
1269                 if(haveNEON)
1270                 {
1271                     for( ; i <= width-8; i+=8 )
1272                     {
1273                         int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
1274                         int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
1275
1276                         vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
1277                         vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
1278
1279                         vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
1280                         vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
1281                     }
1282                 }
1283                 #endif
1284
1285                 for( ; i < width; i++ )
1286                 {
1287                     int s0 = SUM[i] + Sp[i];
1288                     D[i] = (float)(s0);
1289                     SUM[i] = s0 - Sm[i];
1290                 }
1291             }
1292             dst += dststep;
1293         }
1294     }
1295
1296     double scale;
1297     int sumCount;
1298     std::vector<int> sum;
1299 };
1300
1301 #ifdef HAVE_OPENCL
1302
1303 static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth,
1304                                    Size ksize, Point anchor, int borderType, bool normalize )
1305 {
1306     const ocl::Device & dev = ocl::Device::getDefault();
1307     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1308
1309     if (ddepth < 0)
1310         ddepth = sdepth;
1311
1312     if (anchor.x < 0)
1313         anchor.x = ksize.width / 2;
1314     if (anchor.y < 0)
1315         anchor.y = ksize.height / 2;
1316
1317     if ( !(dev.isIntel() && (type == CV_8UC1) &&
1318          (_src.offset() == 0) && (_src.step() % 4 == 0) &&
1319          (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) &&
1320          (anchor.x == 1) && (anchor.y == 1) &&
1321          (ksize.width == 3) && (ksize.height == 3)) )
1322         return false;
1323
1324     float alpha = 1.0f / (ksize.height * ksize.width);
1325     Size size = _src.size();
1326     size_t globalsize[2] = { 0, 0 };
1327     size_t localsize[2] = { 0, 0 };
1328     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
1329
1330     globalsize[0] = size.width / 16;
1331     globalsize[1] = size.height / 2;
1332
1333     char build_opts[1024];
1334     sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
1335
1336     ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts);
1337     if (kernel.empty())
1338         return false;
1339
1340     UMat src = _src.getUMat();
1341     _dst.create(size, CV_MAKETYPE(ddepth, cn));
1342     if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
1343         return false;
1344     UMat dst = _dst.getUMat();
1345
1346     int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
1347     idxArg = kernel.set(idxArg, (int)src.step);
1348     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
1349     idxArg = kernel.set(idxArg, (int)dst.step);
1350     idxArg = kernel.set(idxArg, (int)dst.rows);
1351     idxArg = kernel.set(idxArg, (int)dst.cols);
1352     if (normalize)
1353         idxArg = kernel.set(idxArg, (float)alpha);
1354
1355     return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
1356 }
1357
1358 #define DIVUP(total, grain) ((total + grain - 1) / (grain))
1359 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
1360
// Generic OpenCL (squared) box filter. Chooses between a specialized
// "filterSmall" kernel for small apertures on Intel GPUs and a tiled
// "boxFilter" kernel otherwise. Returns false when no suitable kernel or
// configuration is available so the caller can fall back to the CPU path.
// NOTE(review): build options are composed with sprintf into a fixed
// 1024-byte buffer; the arguments visible here stay well below that bound,
// but keep it in mind when adding options.
static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                           Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
{
    const ocl::Device & dev = ocl::Device::getDefault();
    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
    bool doubleSupport = dev.doubleFPConfig() > 0;

    // Destination depth defaults to the source depth.
    if (ddepth < 0)
        ddepth = sdepth;

    // Reject what the kernels cannot handle: >4 channels, doubles without
    // device support, or a ROI that is not element-aligned.
    if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
        _src.offset() % esz != 0 || _src.step() % esz != 0)
        return false;

    if (anchor.x < 0)
        anchor.x = ksize.width / 2;
    if (anchor.y < 0)
        anchor.y = ksize.height / 2;

    int computeUnits = ocl::Device::getDefault().maxComputeUnits();
    float alpha = 1.0f / (ksize.height * ksize.width);
    Size size = _src.size(), wholeSize;
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    borderType &= ~BORDER_ISOLATED;
    // Intermediate (working) depth: at least CV_32F.
    int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
        wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);

    // NOTE(review): index 3 (BORDER_WRAP) is a NULL entry — presumably
    // callers never pass BORDER_WRAP here; verify against boxFilter().
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
    size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
    size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;

    UMat src = _src.getUMat();
    if (!isolated)
    {
        // Border handling may read outside the ROI; find the full image extent.
        Point ofs;
        src.locateROI(wholeSize, ofs);
    }

    int h = isolated ? size.height : wholeSize.height;
    int w = isolated ? size.width : wholeSize.width;

    size_t maxWorkItemSizes[32];
    ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
    int tryWorkItems = (int)maxWorkItemSizes[0];

    ocl::Kernel kernel;

    // Specialized path: small apertures on Intel GPUs (not CPU devices).
    if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
        ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
    {
        if (w < ksize.width || h < ksize.height)
            return false;

        // Figure out what vector size to use for loading the pixels.
        int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
        int pxLoadVecSize = cn * pxLoadNumPixels;

        // Figure out how many pixels per work item to compute in X and Y
        // directions.  Too many and we run out of registers.
        int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
        {
            pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = size.height % 2 ? 1 : 2;
        }
        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
        {
            pxPerWorkItemX = size.width % 2 ? 1 : 2;
            pxPerWorkItemY = size.height % 2 ? 1 : 2;
        }
        globalsize[0] = size.width / pxPerWorkItemX;
        globalsize[1] = size.height / pxPerWorkItemY;

        // Need some padding in the private array for pixels
        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);

        // Make the global size a nice round number so the runtime can pick
        // from reasonable choices for the workgroup size
        const int wgRound = 256;
        globalsize[0] = ROUNDUP(globalsize[0], wgRound);

        char build_options[1024], cvt[2][40];
        sprintf(build_options, "-D cn=%d "
                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
                cn, anchor.x, anchor.y, ksize.width, ksize.height,
                pxLoadVecSize, pxLoadNumPixels,
                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
                normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
                ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
                );


        if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
            return false;
    }
    else
    {
        // Generic tiled path: iteratively shrink the X block size until the
        // kernel's reported work-group size can accommodate it.
        localsize = localsize_general;
        for ( ; ; )
        {
            int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);

            while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
                BLOCK_SIZE_X /= 2;
            while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
                BLOCK_SIZE_Y *= 2;

            if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
                return false;

            char cvt[2][50];
            String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
                                 " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
                                 " -D ST1=%s -D DT1=%s -D cn=%d",
                                 BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
                                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
                                 anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
                                 isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
                                 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);

            localsize[0] = BLOCK_SIZE_X;
            globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
            globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);

            kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
            if (kernel.empty())
                return false;

            size_t kernelWorkGroupSize = kernel.workGroupSize();
            if (localsize[0] <= kernelWorkGroupSize)
                break;
            if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
                return false;

            // Retry with the work-group size the kernel actually supports.
            tryWorkItems = (int)kernelWorkGroupSize;
        }
    }

    _dst.create(size, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    // Bind arguments: source pointer/step, ROI origin and end within the
    // whole image (for non-isolated border handling), destination, and the
    // optional normalization factor.
    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
    idxArg = kernel.set(idxArg, (int)src.step);
    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
    idxArg = kernel.set(idxArg, srcOffsetX);
    idxArg = kernel.set(idxArg, srcOffsetY);
    idxArg = kernel.set(idxArg, srcEndX);
    idxArg = kernel.set(idxArg, srcEndY);
    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
    if (normalize)
        idxArg = kernel.set(idxArg, (float)alpha);

    return kernel.run(2, globalsize, localsize, false);
}
1533
1534 #undef ROUNDUP
1535
1536 #endif
1537
1538 }
1539
1540
1541 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1542 {
1543     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1544     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
1545
1546     if( anchor < 0 )
1547         anchor = ksize/2;
1548
1549     if( sdepth == CV_8U && ddepth == CV_32S )
1550         return makePtr<RowSum<uchar, int> >(ksize, anchor);
1551     if( sdepth == CV_8U && ddepth == CV_16U )
1552         return makePtr<RowSum<uchar, ushort> >(ksize, anchor);
1553     if( sdepth == CV_8U && ddepth == CV_64F )
1554         return makePtr<RowSum<uchar, double> >(ksize, anchor);
1555     if( sdepth == CV_16U && ddepth == CV_32S )
1556         return makePtr<RowSum<ushort, int> >(ksize, anchor);
1557     if( sdepth == CV_16U && ddepth == CV_64F )
1558         return makePtr<RowSum<ushort, double> >(ksize, anchor);
1559     if( sdepth == CV_16S && ddepth == CV_32S )
1560         return makePtr<RowSum<short, int> >(ksize, anchor);
1561     if( sdepth == CV_32S && ddepth == CV_32S )
1562         return makePtr<RowSum<int, int> >(ksize, anchor);
1563     if( sdepth == CV_16S && ddepth == CV_64F )
1564         return makePtr<RowSum<short, double> >(ksize, anchor);
1565     if( sdepth == CV_32F && ddepth == CV_64F )
1566         return makePtr<RowSum<float, double> >(ksize, anchor);
1567     if( sdepth == CV_64F && ddepth == CV_64F )
1568         return makePtr<RowSum<double, double> >(ksize, anchor);
1569
1570     CV_Error_( CV_StsNotImplemented,
1571         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
1572         srcType, sumType));
1573
1574     return Ptr<BaseRowFilter>();
1575 }
1576
1577
1578 cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
1579                                                      int anchor, double scale)
1580 {
1581     int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
1582     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
1583
1584     if( anchor < 0 )
1585         anchor = ksize/2;
1586
1587     if( ddepth == CV_8U && sdepth == CV_32S )
1588         return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
1589     if( ddepth == CV_8U && sdepth == CV_16U )
1590         return makePtr<ColumnSum<ushort, uchar> >(ksize, anchor, scale);
1591     if( ddepth == CV_8U && sdepth == CV_64F )
1592         return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
1593     if( ddepth == CV_16U && sdepth == CV_32S )
1594         return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
1595     if( ddepth == CV_16U && sdepth == CV_64F )
1596         return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
1597     if( ddepth == CV_16S && sdepth == CV_32S )
1598         return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
1599     if( ddepth == CV_16S && sdepth == CV_64F )
1600         return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
1601     if( ddepth == CV_32S && sdepth == CV_32S )
1602         return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
1603     if( ddepth == CV_32F && sdepth == CV_32S )
1604         return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
1605     if( ddepth == CV_32F && sdepth == CV_64F )
1606         return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
1607     if( ddepth == CV_64F && sdepth == CV_32S )
1608         return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
1609     if( ddepth == CV_64F && sdepth == CV_64F )
1610         return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
1611
1612     CV_Error_( CV_StsNotImplemented,
1613         ("Unsupported combination of sum format (=%d), and destination format (=%d)",
1614         sumType, dstType));
1615
1616     return Ptr<BaseColumnFilter>();
1617 }
1618
1619
1620 cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
1621                     Point anchor, bool normalize, int borderType )
1622 {
1623     int sdepth = CV_MAT_DEPTH(srcType);
1624     int cn = CV_MAT_CN(srcType), sumType = CV_64F;
1625     if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U &&
1626         ksize.width*ksize.height <= 256 )
1627         sumType = CV_16U;
1628     else if( sdepth <= CV_32S && (!normalize ||
1629         ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
1630             sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
1631         sumType = CV_32S;
1632     sumType = CV_MAKETYPE( sumType, cn );
1633
1634     Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
1635     Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
1636         dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
1637
1638     return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
1639            srcType, dstType, sumType, borderType );
1640 }
1641
#ifdef HAVE_OPENVX
namespace cv
{
    // OpenVX-accelerated box filter for 8UC1 sources (8U or 16S destination).
    // Returns false whenever the configuration is outside what the OpenVX
    // kernels support, so the caller can fall back to the generic path.
    static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth,
                                 Size ksize, Point anchor,
                                 bool normalize, int borderType)
    {
        int stype = _src.type();
        if (ddepth < 0)
            ddepth = CV_8UC1;
        // Only centered anchors and odd kernel sizes >= 3 are supported.
        if (stype != CV_8UC1 || (ddepth != CV_8U && ddepth != CV_16S) ||
            (anchor.x >= 0 && anchor.x != ksize.width / 2) ||
            (anchor.y >= 0 && anchor.y != ksize.height / 2) ||
            ksize.width % 2 != 1 || ksize.height % 2 != 1 ||
            ksize.width < 3 || ksize.height < 3)
            return false;

        Mat src = _src.getMat();
        _dst.create(src.size(), CV_MAKETYPE(ddepth, 1));
        Mat dst = _dst.getMat();

        if (src.cols < ksize.width || src.rows < ksize.height)
            return false;

        if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
            return false; //Process isolated borders only
        // Translate the OpenCV border mode into its OpenVX equivalent; only
        // constant (zero) and replicate borders are representable.
        vx_border_t border;
        switch (borderType & ~BORDER_ISOLATED)
        {
        case BORDER_CONSTANT:
            border.mode = VX_BORDER_CONSTANT;
#if VX_VERSION > VX_VERSION_1_0
            border.constant_value.U8 = (vx_uint8)(0);
#else
            border.constant_value = (vx_uint32)(0);
#endif
            break;
        case BORDER_REPLICATE:
            border.mode = VX_BORDER_REPLICATE;
            break;
        default:
            return false;
        }

        try
        {
            ivx::Context ctx = ivx::Context::create();
            if ((vx_size)(ksize.width) > ctx.convolutionMaxDimension() || (vx_size)(ksize.height) > ctx.convolutionMaxDimension())
                return false;

            // In-place calls need a copy of the source, since OpenVX reads and
            // writes through separately created image handles.
            Mat a;
            if (dst.data != src.data)
                a = src;
            else
                src.copyTo(a);

            ivx::Image
                ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                                                  ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
                ib = ivx::Image::createFromHandle(ctx, ddepth == CV_16S ? VX_DF_IMAGE_S16 : VX_DF_IMAGE_U8,
                                                  ivx::Image::createAddressing(dst.cols, dst.rows, ddepth == CV_16S ? 2 : 1, (vx_int32)(dst.step)), dst.data);

            //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
            //since OpenVX standart says nothing about thread-safety for now
            vx_border_t prevBorder = ctx.borderMode();
            ctx.setBorderMode(border);
            // Use the dedicated 3x3 box kernel when possible; otherwise emulate
            // the box filter with a generic Q15 fixed-point convolution.
            if (ddepth == CV_8U && ksize.width == 3 && ksize.height == 3 && normalize)
            {
                ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib));
            }
            else
            {
#if VX_VERSION <= VX_VERSION_1_0
                if (ctx.vendorID() == VX_ID_KHRONOS && ((vx_size)(src.cols) <= ctx.convolutionMaxDimension() || (vx_size)(src.rows) <= ctx.convolutionMaxDimension()))
                {
                    ctx.setBorderMode(prevBorder);
                    return false;
                }
#endif
                Mat convData(ksize, CV_16SC1);
                convData = normalize ? (1 << 15) / (ksize.width * ksize.height) : 1;
                ivx::Convolution cnv = ivx::Convolution::create(ctx, convData.cols, convData.rows);
                cnv.copyFrom(convData);
                if (normalize)
                    cnv.setScale(1 << 15);
                ivx::IVX_CHECK_STATUS(vxuConvolve(ctx, ia, cnv, ib));
            }
            ctx.setBorderMode(prevBorder);
        }
        catch (ivx::RuntimeError & e)
        {
            // CV_Error throws, so the returns below are formally unreachable;
            // kept to satisfy compilers that require a value.
            CV_Error(CV_StsInternal, e.what());
            return false;
        }
        catch (ivx::WrapperError & e)
        {
            CV_Error(CV_StsInternal, e.what());
            return false;
        }

        return true;
    }
}
#endif
1746
// TODO: IPP performance regression
#if defined(HAVE_IPP) && IPP_DISABLE_BLOCK
namespace cv
{
// IPP-accelerated box filter (currently compiled out via IPP_DISABLE_BLOCK,
// see the TODO above). Returns false when IPP cannot handle the configuration
// so the caller falls back to the generic implementation.
static bool ipp_boxfilter( InputArray _src, OutputArray _dst, int ddepth,
                Size ksize, Point anchor,
                bool normalize, int borderType )
{
    CV_INSTRUMENT_REGION_IPP()

    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
    if( ddepth < 0 )
        ddepth = sdepth;
    int ippBorderType = borderType & ~BORDER_ISOLATED;
    // IPP places the anchor of an even-sized kernel one pixel up/left of
    // OpenCV's convention; the IPP path is only taken when they coincide.
    Point ocvAnchor, ippAnchor;
    ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
    ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
    ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
    ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);

    Mat src = _src.getMat();
    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
    Mat dst = _dst.getMat();
    // Single-row/column images: shrink the kernel along the missing dimension.
    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( src.rows == 1 )
            ksize.height = 1;
        if( src.cols == 1 )
            ksize.width = 1;
    }

    {
        if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
            (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
             ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
             dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
        {
            Ipp32s bufSize = 0;
            IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };

            // Dispatches to the typed ippiFilterBoxBorder flavor; on success the
            // result is complete and the function returns true immediately.
#define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
            do \
            { \
                if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
                { \
                    Ipp8u * buffer = ippsMalloc_8u(bufSize); \
                    ippType borderValue[4] = { 0, 0, 0, 0 }; \
                    ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
                    IppStatus status = CV_INSTRUMENT_FUN_IPP(ippiFilterBoxBorder_##flavor, src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
                                                                    (int)dst.step, roiSize, maskSize, \
                                                                    (IppiBorderType)ippBorderType, borderValue, buffer); \
                    ippsFree(buffer); \
                    if (status >= 0) \
                    { \
                        CV_IMPL_ADD(CV_IMPL_IPP); \
                        return true; \
                    } \
                } \
            } while ((void)0, 0)

            if (stype == CV_8UC1)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
            else if (stype == CV_8UC3)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
            else if (stype == CV_8UC4)
                IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);

            // Oct 2014: performance with BORDER_CONSTANT
            //else if (stype == CV_16UC1)
            //    IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
            else if (stype == CV_16UC3)
                IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
            else if (stype == CV_16UC4)
                IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);

            // Oct 2014: performance with BORDER_CONSTANT
            //else if (stype == CV_16SC1)
            //    IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
            else if (stype == CV_16SC3)
                IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
            else if (stype == CV_16SC4)
                IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);

            else if (stype == CV_32FC1)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
            else if (stype == CV_32FC3)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
            else if (stype == CV_32FC4)
                IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
        }
#undef IPP_FILTER_BOX_BORDER
    }
    return false;
}
}
#endif
1843
1844
// Top-level box filter: tries the accelerated back ends (OpenCL, OpenVX,
// Tegra, IPP) in order and falls back to the generic separable FilterEngine.
void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                Size ksize, Point anchor,
                bool normalize, int borderType )
{
    CV_INSTRUMENT_REGION()

    // Specialized OpenCL 3x3 8UC1 kernel first, then the generic OpenCL path.
    CV_OCL_RUN(_dst.isUMat() &&
               (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT ||
                borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101),
               ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize))

    CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))

#ifdef HAVE_OPENVX
    if (openvx_boxfilter(_src, _dst, ddepth, ksize, anchor, normalize, borderType))
        return;
#endif

    Mat src = _src.getMat();
    int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
    // ddepth < 0 means "same depth as the source".
    if( ddepth < 0 )
        ddepth = sdepth;
    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
    Mat dst = _dst.getMat();
    // Single-row/column images with isolated non-constant borders: shrink the
    // kernel along the missing dimension so no extrapolation is needed there.
    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( src.rows == 1 )
            ksize.height = 1;
        if( src.cols == 1 )
            ksize.width = 1;
    }
#ifdef HAVE_TEGRA_OPTIMIZATION
    if ( tegra::useTegra() && tegra::box(src, dst, ksize, anchor, normalize, borderType) )
        return;
#endif

    // IPP path is currently compiled out (IPP_DISABLE_BLOCK) — see the
    // regression note on ipp_boxfilter above.
#if defined HAVE_IPP && IPP_DISABLE_BLOCK
    int ippBorderType = borderType & ~BORDER_ISOLATED;
    Point ocvAnchor, ippAnchor;
    ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
    ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
    // IPP anchors an even-sized kernel one pixel up/left of OpenCV's convention.
    ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
    ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
    CV_IPP_RUN((normalize && !_src.isSubmatrix() && ddepth == sdepth &&
            (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
             ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
             _dst.cols() != ksize.width && _dst.rows() != ksize.height),
             ipp_boxfilter( _src,  _dst,  ddepth, ksize,  anchor, normalize,  borderType));
#endif

    // Unless borders are isolated, extrapolate using the pixels of the whole
    // parent image around the ROI.
    Point ofs;
    Size wsz(src.cols, src.rows);
    if(!(borderType&BORDER_ISOLATED))
        src.locateROI( wsz, ofs );
    borderType = (borderType&~BORDER_ISOLATED);

    Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
                        ksize, anchor, normalize, borderType );

    f->apply( src, dst, wsz, ofs );
}
1906
1907
// blur() is a thin convenience wrapper: a normalized box filter that keeps
// the source depth (ddepth = -1).
void cv::blur( InputArray src, OutputArray dst,
           Size ksize, Point anchor, int borderType )
{
    CV_INSTRUMENT_REGION()

    boxFilter( src, dst, -1, ksize, anchor, true, borderType );
}
1915
1916
1917 /****************************************************************************************\
1918                                     Squared Box Filter
1919 \****************************************************************************************/
1920
1921 namespace cv
1922 {
1923
// Horizontal pass of sqrBoxFilter: a sliding-window sum of squared samples
// along each row, maintained independently per interleaved channel.
// T is the source element type, ST the (wider) accumulator type.
template<typename T, typename ST>
struct SqrRowSum :
        public BaseRowFilter
{
    SqrRowSum( int _ksize, int _anchor ) :
        BaseRowFilter()
    {
        ksize = _ksize;
        anchor = _anchor;
    }

    // src: one (border-extended) input row; dst: the row of per-pixel window
    // sums; width: number of output pixels; cn: number of channels.
    virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        const T* S = (const T*)src;
        ST* D = (ST*)dst;
        int i = 0, k, ksz_cn = ksize*cn; // window length in interleaved elements

        width = (width - 1)*cn;
        for( k = 0; k < cn; k++, S++, D++ )
        {
            // Seed the window with the squares of the first `ksize` samples.
            ST s = 0;
            for( i = 0; i < ksz_cn; i += cn )
            {
                ST val = (ST)S[i];
                s += val*val;
            }
            D[0] = s;
            // Slide by one pixel at a time: add the incoming squared sample,
            // subtract the outgoing one.
            for( i = 0; i < width; i += cn )
            {
                ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
                s += val1*val1 - val0*val0;
                D[i+cn] = s;
            }
        }
    }
};
1960
1961 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
1962 {
1963     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
1964     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
1965
1966     if( anchor < 0 )
1967         anchor = ksize/2;
1968
1969     if( sdepth == CV_8U && ddepth == CV_32S )
1970         return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
1971     if( sdepth == CV_8U && ddepth == CV_64F )
1972         return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
1973     if( sdepth == CV_16U && ddepth == CV_64F )
1974         return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
1975     if( sdepth == CV_16S && ddepth == CV_64F )
1976         return makePtr<SqrRowSum<short, double> >(ksize, anchor);
1977     if( sdepth == CV_32F && ddepth == CV_64F )
1978         return makePtr<SqrRowSum<float, double> >(ksize, anchor);
1979     if( sdepth == CV_64F && ddepth == CV_64F )
1980         return makePtr<SqrRowSum<double, double> >(ksize, anchor);
1981
1982     CV_Error_( CV_StsNotImplemented,
1983               ("Unsupported combination of source format (=%d), and buffer format (=%d)",
1984                srcType, sumType));
1985
1986     return Ptr<BaseRowFilter>();
1987 }
1988
1989 }
1990
// Sum (or mean, when normalize=true) of squared pixel values over each
// ksize window — the building block for local-variance computations.
void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
                       Size ksize, Point anchor,
                       bool normalize, int borderType )
{
    CV_INSTRUMENT_REGION()

    int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
    Size size = _src.size();

    // Squared sums overflow narrow types quickly, so default to a float result.
    if( ddepth < 0 )
        ddepth = sdepth < CV_32F ? CV_32F : CV_64F;

    // Single-row/column images: shrink the kernel along the missing dimension
    // so no border extrapolation is needed there.
    if( borderType != BORDER_CONSTANT && normalize )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))

    // 8-bit squares fit a 32-bit integer accumulator; everything else
    // accumulates in double.
    int sumDepth = CV_64F;
    if( sdepth == CV_8U )
        sumDepth = CV_32S;
    int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);

    Mat src = _src.getMat();
    _dst.create( size, dstType );
    Mat dst = _dst.getMat();

    // Separable implementation: squared row sums followed by a plain column sum.
    Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
    Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
                                                            dstType, ksize.height, anchor.y,
                                                            normalize ? 1./(ksize.width*ksize.height) : 1);

    Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
                                                srcType, dstType, sumType, borderType );
    // Account for the ROI position inside the parent image when extrapolating.
    Point ofs;
    Size wsz(src.cols, src.rows);
    src.locateROI( wsz, ofs );

    f->apply( src, dst, wsz, ofs );
}
2036
2037
2038 /****************************************************************************************\
2039                                      Gaussian Blur
2040 \****************************************************************************************/
2041
2042 cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
2043 {
2044     const int SMALL_GAUSSIAN_SIZE = 7;
2045     static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
2046     {
2047         {1.f},
2048         {0.25f, 0.5f, 0.25f},
2049         {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
2050         {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
2051     };
2052
2053     const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
2054         small_gaussian_tab[n>>1] : 0;
2055
2056     CV_Assert( ktype == CV_32F || ktype == CV_64F );
2057     Mat kernel(n, 1, ktype);
2058     float* cf = kernel.ptr<float>();
2059     double* cd = kernel.ptr<double>();
2060
2061     double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
2062     double scale2X = -0.5/(sigmaX*sigmaX);
2063     double sum = 0;
2064
2065     int i;
2066     for( i = 0; i < n; i++ )
2067     {
2068         double x = i - (n-1)*0.5;
2069         double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
2070         if( ktype == CV_32F )
2071         {
2072             cf[i] = (float)t;
2073             sum += cf[i];
2074         }
2075         else
2076         {
2077             cd[i] = t;
2078             sum += cd[i];
2079         }
2080     }
2081
2082     sum = 1./sum;
2083     for( i = 0; i < n; i++ )
2084     {
2085         if( ktype == CV_32F )
2086             cf[i] = (float)(cf[i]*sum);
2087         else
2088             cd[i] *= sum;
2089     }
2090
2091     return kernel;
2092 }
2093
2094 namespace cv {
2095
2096 static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize,
2097                                    double sigma1, double sigma2 )
2098 {
2099     int depth = CV_MAT_DEPTH(type);
2100     if( sigma2 <= 0 )
2101         sigma2 = sigma1;
2102
2103     // automatic detection of kernel size from sigma
2104     if( ksize.width <= 0 && sigma1 > 0 )
2105         ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
2106     if( ksize.height <= 0 && sigma2 > 0 )
2107         ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
2108
2109     CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
2110         ksize.height > 0 && ksize.height % 2 == 1 );
2111
2112     sigma1 = std::max( sigma1, 0. );
2113     sigma2 = std::max( sigma2, 0. );
2114
2115     kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
2116     if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
2117         ky = kx;
2118     else
2119         ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
2120 }
2121
2122 }
2123
2124 cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
2125                                         double sigma1, double sigma2,
2126                                         int borderType )
2127 {
2128     Mat kx, ky;
2129     createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
2130
2131     return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType );
2132 }
2133
2134 namespace cv
2135 {
2136 #ifdef HAVE_OPENCL
2137
2138 static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth,
2139                                   InputArray _kernelX, InputArray _kernelY, int borderType)
2140 {
2141     const ocl::Device & dev = ocl::Device::getDefault();
2142     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2143
2144     if ( !(dev.isIntel() && (type == CV_8UC1) &&
2145          (_src.offset() == 0) && (_src.step() % 4 == 0) &&
2146          ((ksize.width == 5 && (_src.cols() % 4 == 0)) ||
2147          (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) )
2148         return false;
2149
2150     Mat kernelX = _kernelX.getMat().reshape(1, 1);
2151     if (kernelX.cols % 2 != 1)
2152         return false;
2153     Mat kernelY = _kernelY.getMat().reshape(1, 1);
2154     if (kernelY.cols % 2 != 1)
2155         return false;
2156
2157     if (ddepth < 0)
2158         ddepth = sdepth;
2159
2160     Size size = _src.size();
2161     size_t globalsize[2] = { 0, 0 };
2162     size_t localsize[2] = { 0, 0 };
2163
2164     if (ksize.width == 3)
2165     {
2166         globalsize[0] = size.width / 16;
2167         globalsize[1] = size.height / 2;
2168     }
2169     else if (ksize.width == 5)
2170     {
2171         globalsize[0] = size.width / 4;
2172         globalsize[1] = size.height / 1;
2173     }
2174
2175     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
2176     char build_opts[1024];
2177     sprintf(build_opts, "-D %s %s%s", borderMap[borderType],
2178             ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
2179             ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
2180
2181     ocl::Kernel kernel;
2182
2183     if (ksize.width == 3)
2184         kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
2185     else if (ksize.width == 5)
2186         kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts);
2187
2188     if (kernel.empty())
2189         return false;
2190
2191     UMat src = _src.getUMat();
2192     _dst.create(size, CV_MAKETYPE(ddepth, cn));
2193     if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
2194         return false;
2195     UMat dst = _dst.getUMat();
2196
2197     int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
2198     idxArg = kernel.set(idxArg, (int)src.step);
2199     idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
2200     idxArg = kernel.set(idxArg, (int)dst.step);
2201     idxArg = kernel.set(idxArg, (int)dst.rows);
2202     idxArg = kernel.set(idxArg, (int)dst.cols);
2203
2204     return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
2205 }
2206
2207 #endif
2208
2209 #ifdef HAVE_OPENVX
2210
// OpenVX-accelerated Gaussian blur for 8UC1 images. Returns false whenever the
// configuration is outside what the OpenVX kernels support, so the caller can
// fall back to the default implementation.
static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                                double sigma1, double sigma2, int borderType)
{
    int stype = _src.type();
    if (sigma2 <= 0)
        sigma2 = sigma1;
    // automatic detection of kernel size from sigma (forced to be odd)
    if (ksize.width <= 0 && sigma1 > 0)
        ksize.width = cvRound(sigma1*6 + 1) | 1;
    if (ksize.height <= 0 && sigma2 > 0)
        ksize.height = cvRound(sigma2*6 + 1) | 1;

    if (stype != CV_8UC1 ||
        ksize.width < 3 || ksize.height < 3 ||
        ksize.width % 2 != 1 || ksize.height % 2 != 1)
        return false;

    sigma1 = std::max(sigma1, 0.);
    sigma2 = std::max(sigma2, 0.);

    Mat src = _src.getMat();
    Mat dst = _dst.getMat();

    if (src.cols < ksize.width || src.rows < ksize.height)
        return false;

    if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
        return false; //Process isolated borders only
    // Translate the OpenCV border mode into its OpenVX equivalent; only
    // constant (zero) and replicate borders are representable.
    vx_border_t border;
    switch (borderType & ~BORDER_ISOLATED)
    {
    case BORDER_CONSTANT:
        border.mode = VX_BORDER_CONSTANT;
#if VX_VERSION > VX_VERSION_1_0
        border.constant_value.U8 = (vx_uint8)(0);
#else
        border.constant_value = (vx_uint32)(0);
#endif
        break;
    case BORDER_REPLICATE:
        border.mode = VX_BORDER_REPLICATE;
        break;
    default:
        return false;
    }

    try
    {
        ivx::Context ctx = ivx::Context::create();
        if ((vx_size)(ksize.width) > ctx.convolutionMaxDimension() || (vx_size)(ksize.height) > ctx.convolutionMaxDimension())
            return false;

        // In-place calls need a copy of the source, since OpenVX reads and
        // writes through separately created image handles.
        Mat a;
        if (dst.data != src.data)
            a = src;
        else
            src.copyTo(a);

        ivx::Image
            ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
            ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);

        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
        //since OpenVX standart says nothing about thread-safety for now
        vx_border_t prevBorder = ctx.borderMode();
        ctx.setBorderMode(border);
        // The dedicated vxuGaussian3x3 kernel is used only for a 3x3 size with
        // sigma 0 or ~0.8 (the value this code treats as matching the fixed
        // 3x3 kernel); everything else goes through a generic Q15 convolution.
        if (ksize.width == 3 && ksize.height == 3 && (sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) && (sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON))
        {
            ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib));
        }
        else
        {
#if VX_VERSION <= VX_VERSION_1_0
            if (ctx.vendorID() == VX_ID_KHRONOS && ((vx_size)(a.cols) <= ctx.convolutionMaxDimension() || (vx_size)(a.rows) <= ctx.convolutionMaxDimension()))
            {
                ctx.setBorderMode(prevBorder);
                return false;
            }
#endif
            // Full 2D kernel as the outer product of the 1D Gaussian kernels,
            // scaled to Q15 fixed point as required by vx_convolution.
            Mat convData;
            cv::Mat(cv::getGaussianKernel(ksize.height, sigma2)*cv::getGaussianKernel(ksize.width, sigma1).t()).convertTo(convData, CV_16SC1, (1 << 15));
            ivx::Convolution cnv = ivx::Convolution::create(ctx, convData.cols, convData.rows);
            cnv.copyFrom(convData);
            cnv.setScale(1 << 15);
            ivx::IVX_CHECK_STATUS(vxuConvolve(ctx, ia, cnv, ib));
        }
        ctx.setBorderMode(prevBorder);
    }
    catch (ivx::RuntimeError & e)
    {
        // CV_Error throws, so the returns below are formally unreachable;
        // kept to satisfy compilers that require a value.
        CV_Error(CV_StsInternal, e.what());
        return false;
    }
    catch (ivx::WrapperError & e)
    {
        CV_Error(CV_StsInternal, e.what());
        return false;
    }
    return true;
}
2313
2314 #endif
2315
2316 #ifdef HAVE_IPP
2317
// IPP-accelerated Gaussian blur. Returns false when IPP cannot handle the
// configuration so the caller can fall back to the generic implementation.
static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
                   double sigma1, double sigma2,
                   int borderType )
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 >= 810
    // Non-isolated submatrices would need pixels outside the ROI; skip them.
    if ((borderType & BORDER_ISOLATED) == 0 && _src.isSubmatrix())
        return false;

    int type = _src.type();
    Size size = _src.size();

    // Single-row/column images: shrink the kernel along the missing dimension.
    if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    // The IPP Gaussian supports only square kernels with a single sigma.
    if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
            sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
    {
        IppiBorderType ippBorder = ippiGetBorderType(borderType);
        if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
        {
            Mat src = _src.getMat(), dst = _dst.getMat();
            IppiSize roiSize = { src.cols, src.rows };
            IppDataType dataType = ippiGetDataType(depth);
            Ipp32s specSize = 0, bufferSize = 0;

            // Two-phase IPP setup: query spec/buffer sizes, then initialize the
            // filter spec before invoking the typed entry point.
            if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
            {
                IppAutoBuffer<IppFilterGaussianSpec> spec(specSize);
                IppAutoBuffer<Ipp8u> buffer(bufferSize);

                if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, cn, spec, buffer) >= 0)
                {
                    // Helper macros dispatching to the typed IPP entry points
                    // (C1 = single-channel, CN = multi-channel flavors).
#define IPP_FILTER_GAUSS_C1(ippfavor) \
                    { \
                        Ipp##ippfavor borderValues = 0; \
                        status = CV_INSTRUMENT_FUN_IPP(ippiFilterGaussianBorder_##ippfavor##_C1R, src.ptr<Ipp##ippfavor>(), (int)src.step, \
                                dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
                    }

#define IPP_FILTER_GAUSS_CN(ippfavor, ippcn) \
                    { \
                        Ipp##ippfavor borderValues[] = { 0, 0, 0 }; \
                        status = CV_INSTRUMENT_FUN_IPP(ippiFilterGaussianBorder_##ippfavor##_C##ippcn##R, src.ptr<Ipp##ippfavor>(), (int)src.step, \
                                dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
                    }

                    IppStatus status = ippStsErr;
#if IPP_VERSION_X100 > 900 // Buffer overflow may happen in IPP 9.0.0 and less
                    if (type == CV_8UC1)
                        IPP_FILTER_GAUSS_C1(8u)
                    else
#endif
                    if (type == CV_8UC3)
                        IPP_FILTER_GAUSS_CN(8u, 3)
                    else if (type == CV_16UC1)
                        IPP_FILTER_GAUSS_C1(16u)
                    else if (type == CV_16UC3)
                        IPP_FILTER_GAUSS_CN(16u, 3)
                    else if (type == CV_16SC1)
                        IPP_FILTER_GAUSS_C1(16s)
                    else if (type == CV_16SC3)
                        IPP_FILTER_GAUSS_CN(16s, 3)
                    else if (type == CV_32FC3)
                        IPP_FILTER_GAUSS_CN(32f, 3)
                    else if (type == CV_32FC1)
                        IPP_FILTER_GAUSS_C1(32f)

                    if(status >= 0)
                        return true;

#undef IPP_FILTER_GAUSS_C1
#undef IPP_FILTER_GAUSS_CN
                }
            }
        }
    }
#else
    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
#endif
    return false;
}
2408 #endif
2409 }
2410
2411
/**
 * @brief Blurs an image with a Gaussian kernel of the given aperture and sigmas.
 *
 * After handling degenerate apertures, the call is dispatched in order to the
 * optional OpenVX, Tegra, IPP and OpenCL back-ends (each of the CV_*_RUN
 * macros returns from this function on success); if none applies, the generic
 * separable filter is used.
 */
void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
                   double sigma1, double sigma2,
                   int borderType )
{
    CV_INSTRUMENT_REGION()

    int type = _src.type();
    Size size = _src.size();
    _dst.create( size, type );

    // With an isolated, non-constant border a 1-pixel dimension has no
    // neighbours to read from, so the kernel collapses to 1 along it.
    if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
    {
        if( size.height == 1 )
            ksize.height = 1;
        if( size.width == 1 )
            ksize.width = 1;
    }

    // A 1x1 kernel is the identity transform.
    if( ksize.width == 1 && ksize.height == 1 )
    {
        _src.copyTo(_dst);
        return;
    }

#ifdef HAVE_OPENVX
    if (openvx_gaussianBlur(_src, _dst, ksize, sigma1, sigma2, borderType))
        return;
#endif

#ifdef HAVE_TEGRA_OPTIMIZATION
    // Tegra path only handles the "derive sigma from kernel size" case.
    Mat src = _src.getMat();
    Mat dst = _dst.getMat();
    if(sigma1 == 0 && sigma2 == 0 && tegra::useTegra() && tegra::gaussian(src, dst, ksize, borderType))
        return;
#endif

    // IPP path — skipped when the destination is a UMat and OpenCL is active,
    // so the OpenCL path below gets the first chance in that case.
    CV_IPP_RUN(!(ocl::useOpenCL() && _dst.isUMat()), ipp_GaussianBlur( _src,  _dst,  ksize, sigma1,  sigma2, borderType));

    Mat kx, ky;
    createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);

    // Specialized OpenCL kernel for small (3x3 / 5x5) apertures on 2-D images
    // strictly larger than the kernel.
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 &&
               ((ksize.width == 3 && ksize.height == 3) ||
               (ksize.width == 5 && ksize.height == 5)) &&
               (size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(),
               ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));

    // Generic fallback: separable filtering with the computed 1-D kernels.
    sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
}
2461
2462 /****************************************************************************************\
2463                                       Median Filter
2464 \****************************************************************************************/
2465
2466 namespace cv
2467 {
typedef ushort HT;  // histogram bucket counter (16-bit; see overflow note below)

/**
 * This structure represents a two-tier histogram. The first tier (known as the
 * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
 * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
 * coarse bucket designated by the 4 MSBs of the fine bucket value.
 *
 * Instances are expected to be 16-byte aligned (declaration sites use
 * CV_DECL_ALIGNED(16)), which is a prerequisite for the aligned SIMD
 * loads/stores used on them. Each bucket is 16 bit wide, which means that
 * extra care must be taken to prevent overflow.
 */
typedef struct
{
    HT coarse[16];      // one bucket per group of 16 intensity levels
    HT fine[16][16];    // fine[c] holds the 16 sub-buckets of coarse bucket c
} Histogram;
2485
2486
#if CV_SSE2
#define MEDIAN_HAVE_SIMD 1

// SSE2: y[i] += x[i] over the 16 ushort bins of one histogram row.
// Both pointers must be 16-byte aligned (_mm_load_si128/_mm_store_si128).
static inline void histogram_add_simd( const HT x[16], HT y[16] )
{
    const __m128i* rx = (const __m128i*)x;
    __m128i* ry = (__m128i*)y;
    __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
    __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
    _mm_store_si128(ry+0, r0);
    _mm_store_si128(ry+1, r1);
}

// SSE2: y[i] -= x[i] over the 16 ushort bins of one histogram row.
static inline void histogram_sub_simd( const HT x[16], HT y[16] )
{
    const __m128i* rx = (const __m128i*)x;
    __m128i* ry = (__m128i*)y;
    __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
    __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
    _mm_store_si128(ry+0, r0);
    _mm_store_si128(ry+1, r1);
}

#elif CV_NEON
#define MEDIAN_HAVE_SIMD 1

// NEON: y[i] += x[i] over the 16 ushort bins of one histogram row.
static inline void histogram_add_simd( const HT x[16], HT y[16] )
{
    vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
    vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
}

// NEON: y[i] -= x[i] over the 16 ushort bins of one histogram row.
static inline void histogram_sub_simd( const HT x[16], HT y[16] )
{
    vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
    vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
}

#else
#define MEDIAN_HAVE_SIMD 0  // fall back to the scalar histogram_add/sub below
#endif
2528
2529
2530 static inline void histogram_add( const HT x[16], HT y[16] )
2531 {
2532     int i;
2533     for( i = 0; i < 16; ++i )
2534         y[i] = (HT)(y[i] + x[i]);
2535 }
2536
2537 static inline void histogram_sub( const HT x[16], HT y[16] )
2538 {
2539     int i;
2540     for( i = 0; i < 16; ++i )
2541         y[i] = (HT)(y[i] - x[i]);
2542 }
2543
2544 static inline void histogram_muladd( int a, const HT x[16],
2545         HT y[16] )
2546 {
2547     for( int i = 0; i < 16; ++i )
2548         y[i] = (HT)(y[i] + a * x[i]);
2549 }
2550
/*
 * Median filter for 8-bit images whose per-pixel cost is independent of the
 * aperture size ("O(1)" median, cf. Perreault & Hebert, "Median Filtering in
 * Constant Time"). One two-tier column histogram (h_coarse/h_fine) is kept per
 * image column; 2r+1 of them are merged into the running kernel histogram
 * H[c] as the window slides along a row.
 *
 * The image is processed in vertical stripes of at most STRIPE_SIZE columns so
 * the per-stripe column histograms stay compact; cn may be 1..4 (H and luc are
 * dimensioned [4]).
 */
static void
medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
{
/**
 * HOP is short for Histogram OPeration. This macro makes an operation \a op on
 * histogram \a h for pixel value \a x. It takes care of handling both levels.
 */
#define HOP(h,x,op) \
    h.coarse[x>>4] op, \
    *((HT*)h.fine + x) op

// COP applies \a op to both tiers of the column histogram of column \a j,
// channel \a c, for pixel value \a x.
#define COP(c,j,x,op) \
    h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
    h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op

    int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;  // r = kernel radius
    size_t sstep = _src.step, dstep = _dst.step;
    Histogram CV_DECL_ALIGNED(16) H[4];   // running kernel histogram, one per channel
    HT CV_DECL_ALIGNED(16) luc[4][16];    // per coarse bucket: column up to which its fine row is current

    int STRIPE_SIZE = std::min( _dst.cols, 512/cn );

    // Column histograms for one stripe (+2*r border columns); the extra 16
    // elements leave room for the 16-byte alignment fixup below.
    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
    HT* h_coarse = alignPtr(&_h_coarse[0], 16);
    HT* h_fine = alignPtr(&_h_fine[0], 16);
#if MEDIAN_HAVE_SIMD
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
#endif

    for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
    {
        // n = stripe width including the r extra columns on each side.
        int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
        const uchar* src = _src.ptr() + x*cn;
        uchar* dst = _dst.ptr() + (x - r)*cn;

        memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
        memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );

        // First row initialization
        for( c = 0; c < cn; c++ )
        {
            // Row 0 is pre-counted r+2 times: it stands in for the replicated
            // top-border rows and for the p0 decrement of the first iteration.
            for( j = 0; j < n; j++ )
                COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );

            for( i = 1; i < r; i++ )
            {
                const uchar* p = src + sstep*std::min(i, m-1);
                for ( j = 0; j < n; j++ )
                    COP( c, j, p[cn*j+c], ++ );
            }
        }

        for( i = 0; i < m; i++ )
        {
            // Row leaving the window (clamped at the top) and row entering it
            // (clamped at the bottom).
            const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
            const uchar* p1 = src + sstep * std::min( m-1, i+r );

            memset( H, 0, cn*sizeof(H[0]) );
            memset( luc, 0, cn*sizeof(luc[0]) );
            for( c = 0; c < cn; c++ )
            {
                // Update column histograms for the entire row.
                for( j = 0; j < n; j++ )
                {
                    COP( c, j, p0[j*cn + c], -- );
                    COP( c, j, p1[j*cn + c], ++ );
                }

                // First column initialization
                for( k = 0; k < 16; ++k )
                    histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );

            #if MEDIAN_HAVE_SIMD
                if( useSIMD )
                {
                    // Merge the coarse tiers of the leftmost 2r columns; the
                    // (2r+1)-th is added inside the loop below.
                    for( j = 0; j < 2*r; ++j )
                        histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );

                    for( j = r; j < n-r; j++ )
                    {
                        // t = rank of the median: ((2r+1)^2 - 1) / 2.
                        int t = 2*r*r + 2*r, b, sum = 0;
                        HT* segment;

                        histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );

                        // Find median at coarse level
                        for ( k = 0; k < 16 ; ++k )
                        {
                            sum += H[c].coarse[k];
                            if ( sum > t )
                            {
                                sum -= H[c].coarse[k];
                                break;
                            }
                        }
                        assert( k < 16 );

                        /* Update corresponding histogram segment */
                        if ( luc[c][k] <= j-r )
                        {
                            // Fine row k is too stale: rebuild it from scratch.
                            memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
                            for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
                                histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );

                            if ( luc[c][k] < j+r+1 )
                            {
                                // Window sticks out past the stripe: replicate the last column.
                                histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
                                luc[c][k] = (HT)(j+r+1);
                            }
                        }
                        else
                        {
                            // Incremental update: slide fine row k forward column by column.
                            for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                            {
                                histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
                                histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
                            }
                        }

                        histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );

                        /* Find median in segment */
                        segment = H[c].fine[k];
                        for ( b = 0; b < 16 ; b++ )
                        {
                            sum += segment[b];
                            if ( sum > t )
                            {
                                dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
                                break;
                            }
                        }
                        assert( b < 16 );
                    }
                }
                else
            #endif
                {
                    // Scalar fallback: logic identical to the SIMD branch above,
                    // using the plain histogram_add/sub helpers.
                    for( j = 0; j < 2*r; ++j )
                        histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );

                    for( j = r; j < n-r; j++ )
                    {
                        int t = 2*r*r + 2*r, b, sum = 0;
                        HT* segment;

                        histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );

                        // Find median at coarse level
                        for ( k = 0; k < 16 ; ++k )
                        {
                            sum += H[c].coarse[k];
                            if ( sum > t )
                            {
                                sum -= H[c].coarse[k];
                                break;
                            }
                        }
                        assert( k < 16 );

                        /* Update corresponding histogram segment */
                        if ( luc[c][k] <= j-r )
                        {
                            memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
                            for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
                                histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );

                            if ( luc[c][k] < j+r+1 )
                            {
                                histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
                                luc[c][k] = (HT)(j+r+1);
                            }
                        }
                        else
                        {
                            for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                            {
                                histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
                                histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
                            }
                        }

                        histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );

                        /* Find median in segment */
                        segment = H[c].fine[k];
                        for ( b = 0; b < 16 ; b++ )
                        {
                            sum += segment[b];
                            if ( sum > t )
                            {
                                dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
                                break;
                            }
                        }
                        assert( b < 16 );
                    }
                }
            }
        }
    }

#undef HOP
#undef COP
}
2757
2758 static void
2759 medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
2760 {
2761     #define N  16
2762     int     zone0[4][N];
2763     int     zone1[4][N*N];
2764     int     x, y;
2765     int     n2 = m*m/2;
2766     Size    size = _dst.size();
2767     const uchar* src = _src.ptr();
2768     uchar*  dst = _dst.ptr();
2769     int     src_step = (int)_src.step, dst_step = (int)_dst.step;
2770     int     cn = _src.channels();
2771     const uchar*  src_max = src + size.height*src_step;
2772
2773     #define UPDATE_ACC01( pix, cn, op ) \
2774     {                                   \
2775         int p = (pix);                  \
2776         zone1[cn][p] op;                \
2777         zone0[cn][p >> 4] op;           \
2778     }
2779
2780     //CV_Assert( size.height >= nx && size.width >= nx );
2781     for( x = 0; x < size.width; x++, src += cn, dst += cn )
2782     {
2783         uchar* dst_cur = dst;
2784         const uchar* src_top = src;
2785         const uchar* src_bottom = src;
2786         int k, c;
2787         int src_step1 = src_step, dst_step1 = dst_step;
2788
2789         if( x % 2 != 0 )
2790         {
2791             src_bottom = src_top += src_step*(size.height-1);
2792             dst_cur += dst_step*(size.height-1);
2793             src_step1 = -src_step1;
2794             dst_step1 = -dst_step1;
2795         }
2796
2797         // init accumulator
2798         memset( zone0, 0, sizeof(zone0[0])*cn );
2799         memset( zone1, 0, sizeof(zone1[0])*cn );
2800
2801         for( y = 0; y <= m/2; y++ )
2802         {
2803             for( c = 0; c < cn; c++ )
2804             {
2805                 if( y > 0 )
2806                 {
2807                     for( k = 0; k < m*cn; k += cn )
2808                         UPDATE_ACC01( src_bottom[k+c], c, ++ );
2809                 }
2810                 else
2811                 {
2812                     for( k = 0; k < m*cn; k += cn )
2813                         UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
2814                 }
2815             }
2816
2817             if( (src_step1 > 0 && y < size.height-1) ||
2818                 (src_step1 < 0 && size.height-y-1 > 0) )
2819                 src_bottom += src_step1;
2820         }
2821
2822         for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
2823         {
2824             // find median
2825             for( c = 0; c < cn; c++ )
2826             {
2827                 int s = 0;
2828                 for( k = 0; ; k++ )
2829                 {
2830                     int t = s + zone0[c][k];
2831                     if( t > n2 ) break;
2832                     s = t;
2833                 }
2834
2835                 for( k *= N; ;k++ )
2836                 {
2837                     s += zone1[c][k];
2838                     if( s > n2 ) break;
2839                 }
2840
2841                 dst_cur[c] = (uchar)k;
2842             }
2843
2844             if( y+1 == size.height )
2845                 break;
2846
2847             if( cn == 1 )
2848             {
2849                 for( k = 0; k < m; k++ )
2850                 {
2851                     int p = src_top[k];
2852                     int q = src_bottom[k];
2853                     zone1[0][p]--;
2854                     zone0[0][p>>4]--;
2855                     zone1[0][q]++;
2856                     zone0[0][q>>4]++;
2857                 }
2858             }
2859             else if( cn == 3 )
2860             {
2861                 for( k = 0; k < m*3; k += 3 )
2862                 {
2863                     UPDATE_ACC01( src_top[k], 0, -- );
2864                     UPDATE_ACC01( src_top[k+1], 1, -- );
2865                     UPDATE_ACC01( src_top[k+2], 2, -- );
2866
2867                     UPDATE_ACC01( src_bottom[k], 0, ++ );
2868                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
2869                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
2870                 }
2871             }
2872             else
2873             {
2874                 assert( cn == 4 );
2875                 for( k = 0; k < m*4; k += 4 )
2876                 {
2877                     UPDATE_ACC01( src_top[k], 0, -- );
2878                     UPDATE_ACC01( src_top[k+1], 1, -- );
2879                     UPDATE_ACC01( src_top[k+2], 2, -- );
2880                     UPDATE_ACC01( src_top[k+3], 3, -- );
2881
2882                     UPDATE_ACC01( src_bottom[k], 0, ++ );
2883                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
2884                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
2885                     UPDATE_ACC01( src_bottom[k+3], 3, ++ );
2886                 }
2887             }
2888
2889             if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
2890                 (src_step1 < 0 && src_bottom + src_step1 >= src) )
2891                 src_bottom += src_step1;
2892
2893             if( y >= m/2 )
2894                 src_top += src_step1;
2895         }
2896     }
2897 #undef N
2898 #undef UPDATE_ACC
2899 }
2900
2901
// Scalar compare-exchange for 8-bit values used by the sorting-network median
// (see medianBlur_SortNet); also serves as the SIMD fallback via MinMaxVec8u.
struct MinMax8u
{
    typedef uchar value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const uchar* ptr) { return *ptr; }
    void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
    // Branch-free exchange: assumes CV_FAST_CAST_8U saturates a-b to [0,255],
    // i.e. t == max(a-b, 0) for byte inputs (a, b come from load()); then
    // a -= t gives min(a,b) and b += t gives max(a,b).
    void operator()(arg_type& a, arg_type& b) const
    {
        int t = CV_FAST_CAST_8U(a - b);
        b += t; a -= t;
    }
};
2915
// Scalar compare-exchange for 16-bit unsigned values (promoted to int):
// after the call a holds min(a, b) and b holds max(a, b).
struct MinMax16u
{
    typedef ushort value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const ushort* ptr) { return *ptr; }
    void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        if( b < a )
            std::swap(a, b);
    }
};
2930
// Scalar compare-exchange for 16-bit signed values (promoted to int):
// after the call a holds min(a, b) and b holds max(a, b).
struct MinMax16s
{
    typedef short value_type;
    typedef int arg_type;
    enum { SIZE = 1 };
    arg_type load(const short* ptr) { return *ptr; }
    void store(short* ptr, arg_type val) { *ptr = (short)val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        if( b < a )
            std::swap(a, b);
    }
};
2945
// Scalar compare-exchange for float values: orders the pair so that a <= b.
// A NaN operand never satisfies b < a, so NaNs are left in place — the same
// result as the min/max formulation used by the sibling structs.
struct MinMax32f
{
    typedef float value_type;
    typedef float arg_type;
    enum { SIZE = 1 };
    arg_type load(const float* ptr) { return *ptr; }
    void store(float* ptr, arg_type val) { *ptr = val; }
    void operator()(arg_type& a, arg_type& b) const
    {
        if( b < a )
            std::swap(a, b);
    }
};
2960
#if CV_SSE2

// SSE2 compare-exchange primitives for the sorting-network median filter.
// Each call orders SIZE lanes in parallel: a = min(a, b), b = max(a, b).
struct MinMaxVec8u
{
    typedef uchar value_type;
    typedef __m128i arg_type;
    enum { SIZE = 16 };
    arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_epu8(a, b);
        b = _mm_max_epu8(b, t);
    }
};


struct MinMaxVec16u
{
    typedef ushort value_type;
    typedef __m128i arg_type;
    enum { SIZE = 8 };
    arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        // SSE2 has no unsigned 16-bit min/max: t = saturate(a - b), so
        // a - t == min(a, b) and b + t == max(a, b) per lane.
        arg_type t = _mm_subs_epu16(a, b);
        a = _mm_subs_epu16(a, t);
        b = _mm_adds_epu16(b, t);
    }
};


struct MinMaxVec16s
{
    typedef short value_type;
    typedef __m128i arg_type;
    enum { SIZE = 8 };
    arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_epi16(a, b);
        b = _mm_max_epi16(b, t);
    }
};


struct MinMaxVec32f
{
    typedef float value_type;
    typedef __m128 arg_type;
    enum { SIZE = 4 };
    arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
    void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = _mm_min_ps(a, b);
        b = _mm_max_ps(b, t);
    }
};

#elif CV_NEON

// NEON counterparts of the SSE2 structs above: lane-wise min into a,
// max into b.
struct MinMaxVec8u
{
    typedef uchar value_type;
    typedef uint8x16_t arg_type;
    enum { SIZE = 16 };
    arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
    void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_u8(a, b);
        b = vmaxq_u8(b, t);
    }
};


struct MinMaxVec16u
{
    typedef ushort value_type;
    typedef uint16x8_t arg_type;
    enum { SIZE = 8 };
    arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
    void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_u16(a, b);
        b = vmaxq_u16(b, t);
    }
};


struct MinMaxVec16s
{
    typedef short value_type;
    typedef int16x8_t arg_type;
    enum { SIZE = 8 };
    arg_type load(const short* ptr) { return vld1q_s16(ptr); }
    void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_s16(a, b);
        b = vmaxq_s16(b, t);
    }
};


struct MinMaxVec32f
{
    typedef float value_type;
    typedef float32x4_t arg_type;
    enum { SIZE = 4 };
    arg_type load(const float* ptr) { return vld1q_f32(ptr); }
    void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
    void operator()(arg_type& a, arg_type& b) const
    {
        arg_type t = a;
        a = vminq_f32(a, b);
        b = vmaxq_f32(b, t);
    }
};


#else

// No SIMD available: the vector types fall back to the scalar (SIZE == 1)
// implementations.
typedef MinMax8u MinMaxVec8u;
typedef MinMax16u MinMaxVec16u;
typedef MinMax16s MinMaxVec16s;
typedef MinMax32f MinMaxVec32f;

#endif
3100
3101 template<class Op, class VecOp>
3102 static void
3103 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
3104 {
3105     typedef typename Op::value_type T;
3106     typedef typename Op::arg_type WT;
3107     typedef typename VecOp::arg_type VT;
3108
3109     const T* src = _src.ptr<T>();
3110     T* dst = _dst.ptr<T>();
3111     int sstep = (int)(_src.step/sizeof(T));
3112     int dstep = (int)(_dst.step/sizeof(T));
3113     Size size = _dst.size();
3114     int i, j, k, cn = _src.channels();
3115     Op op;
3116     VecOp vop;
3117     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
3118
3119     if( m == 3 )
3120     {
3121         if( size.width == 1 || size.height == 1 )
3122         {
3123             int len = size.width + size.height - 1;
3124             int sdelta = size.height == 1 ? cn : sstep;
3125             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
3126             int ddelta = size.height == 1 ? cn : dstep;
3127
3128             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
3129                 for( j = 0; j < cn; j++, src++ )
3130                 {
3131                     WT p0 = src[i > 0 ? -sdelta : 0];
3132                     WT p1 = src[0];
3133                     WT p2 = src[i < len - 1 ? sdelta : 0];
3134
3135                     op(p0, p1); op(p1, p2); op(p0, p1);
3136                     dst[j] = (T)p1;
3137                 }
3138             return;
3139         }
3140
3141         size.width *= cn;
3142         for( i = 0; i < size.height; i++, dst += dstep )
3143         {
3144             const T* row0 = src + std::max(i - 1, 0)*sstep;
3145             const T* row1 = src + i*sstep;
3146             const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
3147             int limit = useSIMD ? cn : size.width;
3148
3149             for(j = 0;; )
3150             {
3151                 for( ; j < limit; j++ )
3152                 {
3153                     int j0 = j >= cn ? j - cn : j;
3154                     int j2 = j < size.width - cn ? j + cn : j;
3155                     WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
3156                     WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
3157                     WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
3158
3159                     op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
3160                     op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
3161                     op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
3162                     op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
3163                     op(p4, p2); op(p6, p4); op(p4, p2);
3164                     dst[j] = (T)p4;
3165                 }
3166
3167                 if( limit == size.width )
3168                     break;
3169
3170                 for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
3171                 {
3172                     VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
3173                     VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
3174                     VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);
3175
3176                     vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
3177                     vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
3178                     vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
3179                     vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
3180                     vop(p4, p2); vop(p6, p4); vop(p4, p2);
3181                     vop.store(dst+j, p4);
3182                 }
3183
3184                 limit = size.width;
3185             }
3186         }
3187     }
3188     else if( m == 5 )
3189     {
3190         if( size.width == 1 || size.height == 1 )
3191         {
3192             int len = size.width + size.height - 1;
3193             int sdelta = size.height == 1 ? cn : sstep;
3194             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
3195             int ddelta = size.height == 1 ? cn : dstep;
3196
3197             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
3198                 for( j = 0; j < cn; j++, src++ )
3199                 {
3200                     int i1 = i > 0 ? -sdelta : 0;
3201                     int i0 = i > 1 ? -sdelta*2 : i1;
3202                     int i3 = i < len-1 ? sdelta : 0;
3203                     int i4 = i < len-2 ? sdelta*2 : i3;
3204                     WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
3205
3206                     op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
3207                     op(p2, p4); op(p1, p3); op(p1, p2);
3208                     dst[j] = (T)p2;
3209                 }
3210             return;
3211         }
3212
3213         size.width *= cn;
3214         for( i = 0; i < size.height; i++, dst += dstep )
3215         {
3216             const T* row[5];
3217             row[0] = src + std::max(i - 2, 0)*sstep;
3218             row[1] = src + std::max(i - 1, 0)*sstep;
3219             row[2] = src + i*sstep;
3220             row[3] = src + std::min(i + 1, size.height-1)*sstep;
3221             row[4] = src + std::min(i + 2, size.height-1)*sstep;
3222             int limit = useSIMD ? cn*2 : size.width;
3223
3224             for(j = 0;; )
3225             {
3226                 for( ; j < limit; j++ )
3227                 {
3228                     WT p[25];
3229                     int j1 = j >= cn ? j - cn : j;
3230                     int j0 = j >= cn*2 ? j - cn*2 : j1;
3231                     int j3 = j < size.width - cn ? j + cn : j;
3232                     int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
3233                     for( k = 0; k < 5; k++ )
3234                     {
3235                         const T* rowk = row[k];
3236                         p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
3237                         p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
3238                         p[k*5+4] = rowk[j4];
3239                     }
3240
3241                     op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
3242                     op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
3243                     op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
3244                     op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
3245                     op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
3246                     op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
3247                     op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
3248                     op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
3249                     op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
3250                     op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
3251                     op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
3252                     op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
3253                     op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
3254                     op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
3255                     op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
3256                     op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
3257                     op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
3258                     op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
3259                     op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
3260                     op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
3261                     op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
3262                     op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
3263                     op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
3264                     dst[j] = (T)p[12];
3265                 }
3266
3267                 if( limit == size.width )
3268                     break;
3269
3270                 for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
3271                 {
3272                     VT p[25];
3273                     for( k = 0; k < 5; k++ )
3274                     {
3275                         const T* rowk = row[k];
3276                         p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
3277                         p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
3278                         p[k*5+4] = vop.load(rowk+j+cn*2);
3279                     }
3280
3281                     vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
3282                     vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
3283                     vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
3284                     vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
3285                     vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
3286                     vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
3287                     vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
3288                     vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
3289                     vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
3290                     vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
3291                     vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
3292                     vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
3293                     vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
3294                     vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
3295                     vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
3296                     vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
3297                     vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
3298                     vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
3299                     vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
3300                     vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
3301                     vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
3302                     vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
3303                     vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
3304                     vop.store(dst+j, p[12]);
3305                 }
3306
3307                 limit = size.width;
3308             }
3309         }
3310     }
3311 }
3312
3313 #ifdef HAVE_OPENCL
3314
3315 static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
3316 {
3317     size_t localsize[2] = { 16, 16 };
3318     size_t globalsize[2];
3319     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3320
3321     if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
3322         return false;
3323
3324     Size imgSize = _src.size();
3325     bool useOptimized = (1 == cn) &&
3326                         (size_t)imgSize.width >= localsize[0] * 8  &&
3327                         (size_t)imgSize.height >= localsize[1] * 8 &&
3328                         imgSize.width % 4 == 0 &&
3329                         imgSize.height % 4 == 0 &&
3330                         (ocl::Device::getDefault().isIntel());
3331
3332     cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ;
3333     cv::String kdefs = useOptimized ?
3334                          format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
3335                          ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
3336                          :
3337                          format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ;
3338
3339     ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );
3340
3341     if (k.empty())
3342         return false;
3343
3344     UMat src = _src.getUMat();
3345     _dst.create(src.size(), type);
3346     UMat dst = _dst.getUMat();
3347
3348     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
3349
3350     if( useOptimized )
3351     {
3352         globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
3353         globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
3354     }
3355     else
3356     {
3357         globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
3358         globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
3359     }
3360
3361     return k.run(2, globalsize, localsize, false);
3362 }
3363
3364 #endif
3365
3366 }
3367
3368 #ifdef HAVE_OPENVX
3369 namespace cv
3370 {
    // Median filtering through the OpenVX immediate-mode API (8UC1 only).
    // Returns false when this backend cannot service the request so the
    // caller can fall back to the other implementations.
    static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize)
    {
        // Pre-1.1 OpenVX provides only the fixed 3x3 median kernel.
        if (_src.type() != CV_8UC1 || _dst.type() != CV_8U
#ifndef VX_VERSION_1_1
            || ksize != 3
#endif
            )
            return false;

        Mat src = _src.getMat();
        Mat dst = _dst.getMat();

        vx_border_t border;
        border.mode = VX_BORDER_REPLICATE;

        try
        {
            ivx::Context ctx = ivx::Context::create();
#ifdef VX_VERSION_1_1
            // The implementation advertises its maximum nonlinear-filter size.
            if ((vx_size)ksize > ctx.nonlinearMaxDimension())
                return false;
#endif

            // In-place call: copy the source so the filter reads unmodified data.
            Mat a;
            if (dst.data != src.data)
                a = src;
            else
                src.copyTo(a);

            // Wrap the cv::Mat buffers as OpenVX images without copying pixels.
            ivx::Image
                ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                    ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
                ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                    ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);

            //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
            //since OpenVX standard says nothing about thread-safety for now
            vx_border_t prevBorder = ctx.borderMode();
            ctx.setBorderMode(border);
#ifdef VX_VERSION_1_1
            if (ksize == 3)
#endif
            {
                ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib));
            }
#ifdef VX_VERSION_1_1
            else
            {
                // Apertures larger than 3x3 go through the generic nonlinear
                // filter with a box-shaped (all-255) mask.
                ivx::Matrix mtx;
                if(ksize == 5)
                    mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize);
                else
                {
                    vx_size supportedSize;
                    ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize)));
                    if ((vx_size)ksize > supportedSize)
                    {
                        // Restore the border mode before bailing out.
                        ctx.setBorderMode(prevBorder);
                        return false;
                    }
                    Mat mask(ksize, ksize, CV_8UC1, Scalar(255));
                    mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize);
                    mtx.copyFrom(mask);
                }
                ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib));
            }
#endif
            ctx.setBorderMode(prevBorder);
        }
        catch (ivx::RuntimeError & e)
        {
            // NOTE(review): CV_Error throws, so the `return false` below looks
            // unreachable unless exceptions are disabled — confirm intent.
            CV_Error(CV_StsInternal, e.what());
            return false;
        }
        catch (ivx::WrapperError & e)
        {
            CV_Error(CV_StsInternal, e.what());
            return false;
        }

        return true;
    }
3453 }
3454 #endif
3455
3456 #ifdef HAVE_IPP
3457 namespace cv
3458 {
// IPP-accelerated median filtering for single-channel 8U/16U/16S/32F images
// with apertures up to 5x5. Returns false when IPP cannot handle the request
// (or any IPP call fails), letting the caller fall back to the generic paths.
static bool ipp_medianFilter( InputArray _src0, OutputArray _dst, int ksize )
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 >= 810
    Mat src0 = _src0.getMat();
    _dst.create( src0.size(), src0.type() );
    Mat dst = _dst.getMat();

// Queries the required scratch-buffer size, runs ippiFilterMedianBorder_<flavor>
// with replicated borders, and returns true from the enclosing function on success.
#define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
    do \
    { \
        if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
        ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
        { \
            Ipp8u * buffer = ippsMalloc_8u(bufSize); \
            IppStatus status = CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_##flavor, src.ptr<ippType>(), (int)src.step, \
            dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
            ippBorderRepl, (ippType)0, buffer); \
            ippsFree(buffer); \
            if (status >= 0) \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return true; \
            } \
        } \
    } \
    while ((void)0, 0)

    if( ksize <= 5 )
    {
        Ipp32s bufSize;
        IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
        // In-place call: copy the source so the filter reads unmodified data.
        Mat src;
        if( dst.data != src0.data )
            src = src0;
        else
            src0.copyTo(src);

        int type = src0.type();
        if (type == CV_8UC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
        else if (type == CV_16UC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
        else if (type == CV_16SC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
        else if (type == CV_32FC1)
            IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
    }
#undef IPP_FILTER_MEDIAN_BORDER
#else
    CV_UNUSED(_src0); CV_UNUSED(_dst); CV_UNUSED(ksize);
#endif
    // Reached when IPP is too old, the type/ksize is unsupported, or a call failed.
    return false;
}
3514 }
3515 #endif
3516
void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
{
    CV_INSTRUMENT_REGION()

    // Aperture must be odd; only 1D/2D arrays are supported.
    CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));

    // A 1x1 median is the identity transform.
    if( ksize <= 1 )
    {
        _src0.copyTo(_dst);
        return;
    }

    // Try the OpenCL path first when the destination is a UMat.
    CV_OCL_RUN(_dst.isUMat(),
               ocl_medianFilter(_src0,_dst, ksize))

    Mat src0 = _src0.getMat();
    _dst.create( src0.size(), src0.type() );
    Mat dst = _dst.getMat();

#ifdef HAVE_OPENVX
    if (openvx_medianFilter(_src0, _dst, ksize))
        return;
#endif

    CV_IPP_RUN(IPP_VERSION_X100 >= 810 && ksize <= 5, ipp_medianFilter(_src0,_dst, ksize));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
        return;
#endif

    // The sorting-network implementation covers 3x3 always; 5x5 is routed to
    // it either unconditionally (when SSE2/NEON is compiled in) or only for
    // formats the 8-bit histogram implementations below cannot handle.
    bool useSortNet = ksize == 3 || (ksize == 5
#if !(CV_SSE2 || CV_NEON)
            && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 )
#endif
        );

    Mat src;
    if( useSortNet )
    {
        // In-place call: copy the source so the filter reads unmodified data.
        if( dst.data != src0.data )
            src = src0;
        else
            src0.copyTo(src);

        if( src.depth() == CV_8U )
            medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
        else if( src.depth() == CV_16U )
            medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
        else if( src.depth() == CV_16S )
            medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
        else if( src.depth() == CV_32F )
            medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
        else
            CV_Error(CV_StsUnsupportedFormat, "");

        return;
    }
    else
    {
        // Histogram-based 8-bit paths: pad horizontally so the per-column
        // histograms can be maintained across the whole row.
        cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );

        int cn = src0.channels();
        CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );

        // Heuristic choice between medianBlur_8u_Om and medianBlur_8u_O1,
        // driven by aperture size, image area (in megapixels) and whether
        // SSE2/NEON is available at runtime.
        double img_size_mp = (double)(src0.total())/(1 << 20);
        if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
            (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
            medianBlur_8u_Om( src, dst, ksize );
        else
            medianBlur_8u_O1( src, dst, ksize );
    }
}
3590
3591 /****************************************************************************************\
3592                                    Bilateral Filtering
3593 \****************************************************************************************/
3594
3595 namespace cv
3596 {
3597
// Row-parallel body for 8-bit bilateral filtering (1- and 3-channel).
// `temp` is the border-padded source; `dest` is the unpadded destination.
// `space_ofs`/`space_weight` hold, for each of the `maxk` neighborhood taps,
// the byte offset into `temp` and the spatial Gaussian weight; `color_weight`
// is a LUT of range weights indexed by the absolute intensity difference
// (summed over channels in the 3-channel case).
class BilateralFilter_8u_Invoker :
    public ParallelLoopBody
{
public:
    BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
        int* _space_ofs, float *_space_weight, float *_color_weight) :
        temp(&_temp), dest(&_dest), radius(_radius),
        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
    {
    }

    virtual void operator() (const Range& range) const
    {
        int i, j, cn = dest->channels(), k;
        Size size = dest->size();
        #if CV_SSE3
        // Aligned scratch buffers for moving data between SSE registers and
        // scalar code; bufSignMask clears the sign bit (absolute value).
        int CV_DECL_ALIGNED(16) buf[4];
        float CV_DECL_ALIGNED(16) bufSum[4];
        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
        bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
        #endif

        for( i = range.start; i < range.end; i++ )
        {
            // Skip the replicated border to reach the pixel matching dest row i.
            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
            uchar* dptr = dest->ptr(i);

            if( cn == 1 )
            {
                for( j = 0; j < size.width; j++ )
                {
                    float sum = 0, wsum = 0;
                    int val0 = sptr[j];
                    k = 0;
                    #if CV_SSE3
                    if( haveSSE3 )
                    {
                        // Vectorized accumulation: process 4 neighborhood taps
                        // per iteration, horizontally adding into sum/wsum.
                        __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4; k += 4 )
                        {
                            __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
                                                      sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);

                            // |val - val0| used as index into the range-weight LUT.
                            __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
                            _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));

                            __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
                                                    color_weight[buf[1]],color_weight[buf[0]]);
                            __m128 _sw = _mm_loadu_ps(space_weight+k);
                            __m128 _w = _mm_mul_ps(_cw, _sw);
                             _cw = _mm_mul_ps(_w, _valF);

                             // Horizontal add packs [wsum, sum] into the low lanes.
                             _sw = _mm_hadd_ps(_w, _cw);
                             _sw = _mm_hadd_ps(_sw, _sw);
                             _mm_storel_pi((__m64*)bufSum, _sw);

                             sum += bufSum[1];
                             wsum += bufSum[0];
                        }
                    }
                    #endif
                    // Scalar tail (and full loop when SSE3 is unavailable).
                    for( ; k < maxk; k++ )
                    {
                        int val = sptr[j + space_ofs[k]];
                        float w = space_weight[k]*color_weight[std::abs(val - val0)];
                        sum += val*w;
                        wsum += w;
                    }
                    // overflow is not possible here => there is no need to use cv::saturate_cast
                    dptr[j] = (uchar)cvRound(sum/wsum);
                }
            }
            else
            {
                assert( cn == 3 );
                for( j = 0; j < size.width*3; j += 3 )
                {
                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
                    int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
                    k = 0;
                    #if CV_SSE3
                    if( haveSSE3 )
                    {
                        const __m128i izero = _mm_setzero_si128();
                        const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
                        const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
                        const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4; k += 4 )
                        {
                            // Each pointer addresses the 4 bytes (b,g,r,+1) of
                            // one of the 4 neighbor pixels being processed.
                            const int* const sptr_k0  = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
                            const int* const sptr_k1  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
                            const int* const sptr_k2  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
                            const int* const sptr_k3  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);

                            // Zero-extend bytes to floats, then transpose so that
                            // _b/_g/_r each hold one channel of all 4 neighbors.
                            __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
                            __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
                            __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
                            __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));

                            _MM_TRANSPOSE4_PS(_b, _g, _r, _z);

                            // Range-weight index: |b-b0| + |g-g0| + |r-r0|.
                            __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
                            __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
                            __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));

                            bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
                            _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));

                            __m128 _w  = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
                                                    color_weight[buf[1]],color_weight[buf[0]]);
                            __m128 _sw = _mm_loadu_ps(space_weight+k);

                            _w = _mm_mul_ps(_w,_sw);
                            _b = _mm_mul_ps(_b, _w);
                            _g = _mm_mul_ps(_g, _w);
                            _r = _mm_mul_ps(_r, _w);

                            // Horizontal reduction: lanes end up as
                            // [wsum, sum_b, sum_g, sum_r].
                            _w = _mm_hadd_ps(_w, _b);
                            _g = _mm_hadd_ps(_g, _r);

                            _w = _mm_hadd_ps(_w, _g);
                            _mm_store_ps(bufSum, _w);

                            wsum  += bufSum[0];
                            sum_b += bufSum[1];
                            sum_g += bufSum[2];
                            sum_r += bufSum[3];
                         }
                    }
                    #endif

                    // Scalar tail (and full loop when SSE3 is unavailable).
                    for( ; k < maxk; k++ )
                    {
                        const uchar* sptr_k = sptr + j + space_ofs[k];
                        int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
                        float w = space_weight[k]*color_weight[std::abs(b - b0) +
                                                               std::abs(g - g0) + std::abs(r - r0)];
                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
                        wsum += w;
                    }
                    wsum = 1.f/wsum;
                    b0 = cvRound(sum_b*wsum);
                    g0 = cvRound(sum_g*wsum);
                    r0 = cvRound(sum_r*wsum);
                    dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
                }
            }
        }
    }

private:
    const Mat *temp;
    Mat *dest;
    int radius, maxk, *space_ofs;
    float *space_weight, *color_weight;
};
3758
3759 #if defined (HAVE_IPP) && IPP_DISABLE_BLOCK
// Row-parallel wrapper around IPP's 8-bit single-channel bilateral filter.
// `src` is the border-padded input; each invocation filters its own stripe
// of rows. On any IPP failure *ok is set to false so the caller can fall
// back to the generic implementation.
// NOTE(review): *ok may be written from multiple worker threads without
// synchronization — appears benign since it only transitions to false; confirm.
class IPPBilateralFilter_8u_Invoker :
    public ParallelLoopBody
{
public:
    IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
      ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
      {
          *ok = true;
      }

      virtual void operator() (const Range& range) const
      {
          int d = radius * 2 + 1;
          IppiSize kernel = {d, d};
          // ROI covers only this invocation's stripe of rows.
          IppiSize roi={dst.cols, range.end - range.start};
          int bufsize=0;
          if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
          {
              *ok = false;
              return;
          }
          AutoBuffer<uchar> buf(bufsize);
          IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
          if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
          {
              *ok = false;
              return;
          }
          // Offset past the replicated padding to the first real pixel of the stripe.
          if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
              *ok = false;
          else
          {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
          }
      }
private:
    Mat &src;
    Mat &dst;
    double sigma_color;
    double sigma_space;
    int radius;
    bool *ok;
    // Non-copyable: reference members make assignment meaningless.
    const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
};
3804 #endif
3805
3806 #ifdef HAVE_OPENCL
3807
3808 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
3809                                    double sigma_color, double sigma_space,
3810                                    int borderType)
3811 {
3812 #ifdef ANDROID
3813     if (ocl::Device::getDefault().isNVidia())
3814         return false;
3815 #endif
3816
3817     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3818     int i, j, maxk, radius;
3819
3820     if (depth != CV_8U || cn > 4)
3821         return false;
3822
3823     if (sigma_color <= 0)
3824         sigma_color = 1;
3825     if (sigma_space <= 0)
3826         sigma_space = 1;
3827
3828     double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
3829     double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
3830
3831     if ( d <= 0 )
3832         radius = cvRound(sigma_space * 1.5);
3833     else
3834         radius = d / 2;
3835     radius = MAX(radius, 1);
3836     d = radius * 2 + 1;
3837
3838     UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
3839     if (src.u == dst.u)
3840         return false;
3841
3842     copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
3843     std::vector<float> _space_weight(d * d);
3844     std::vector<int> _space_ofs(d * d);
3845     float * const space_weight = &_space_weight[0];
3846     int * const space_ofs = &_space_ofs[0];
3847
3848     // initialize space-related bilateral filter coefficients
3849     for( i = -radius, maxk = 0; i <= radius; i++ )
3850         for( j = -radius; j <= radius; j++ )
3851         {
3852             double r = std::sqrt((double)i * i + (double)j * j);
3853             if ( r > radius )
3854                 continue;
3855             space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
3856             space_ofs[maxk++] = (int)(i * temp.step + j * cn);
3857         }
3858
3859     char cvt[3][40];
3860     String cnstr = cn > 1 ? format("%d", cn) : "";
3861     String kernelName("bilateral");
3862     size_t sizeDiv = 1;
3863     if ((ocl::Device::getDefault().isIntel()) &&
3864         (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
3865     {
3866             //Intel GPU
3867             if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
3868             {
3869                 kernelName = "bilateral_float4";
3870                 sizeDiv = 4;
3871             }
3872      }
3873      ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
3874             format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
3875             " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
3876             radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
3877             ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
3878             ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
3879             ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
3880             ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
3881     if (k.empty())
3882         return false;
3883
3884     Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
3885     Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
3886     UMat ucolor_weight, uspace_weight, uspace_ofs;
3887
3888     mspace_weight.copyTo(uspace_weight);
3889     mspace_ofs.copyTo(uspace_ofs);
3890
3891     k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
3892            ocl::KernelArg::PtrReadOnly(uspace_weight),
3893            ocl::KernelArg::PtrReadOnly(uspace_ofs));
3894
3895     size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
3896     return k.run(2, globalsize, NULL, false);
3897 }
3898
3899 #endif
3900 static void
3901 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
3902     double sigma_color, double sigma_space,
3903     int borderType )
3904 {
3905     int cn = src.channels();
3906     int i, j, maxk, radius;
3907     Size size = src.size();
3908
3909     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
3910
3911     if( sigma_color <= 0 )
3912         sigma_color = 1;
3913     if( sigma_space <= 0 )
3914         sigma_space = 1;
3915
3916     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
3917     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
3918
3919     if( d <= 0 )
3920         radius = cvRound(sigma_space*1.5);
3921     else
3922         radius = d/2;
3923     radius = MAX(radius, 1);
3924     d = radius*2 + 1;
3925
3926     Mat temp;
3927     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
3928
3929 #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
3930     CV_IPP_CHECK()
3931     {
3932         if( cn == 1 )
3933         {
3934             bool ok;
3935             IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
3936             parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
3937             if( ok )
3938             {
3939                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
3940                 return;
3941             }
3942             setIppErrorStatus();
3943         }
3944     }
3945 #endif
3946
3947     std::vector<float> _color_weight(cn*256);
3948     std::vector<float> _space_weight(d*d);
3949     std::vector<int> _space_ofs(d*d);
3950     float* color_weight = &_color_weight[0];
3951     float* space_weight = &_space_weight[0];
3952     int* space_ofs = &_space_ofs[0];
3953
3954     // initialize color-related bilateral filter coefficients
3955
3956     for( i = 0; i < 256*cn; i++ )
3957         color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
3958
3959     // initialize space-related bilateral filter coefficients
3960     for( i = -radius, maxk = 0; i <= radius; i++ )
3961     {
3962         j = -radius;
3963
3964         for( ; j <= radius; j++ )
3965         {
3966             double r = std::sqrt((double)i*i + (double)j*j);
3967             if( r > radius )
3968                 continue;
3969             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
3970             space_ofs[maxk++] = (int)(i*temp.step + j*cn);
3971         }
3972     }
3973
3974     BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
3975     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
3976 }
3977
3978
// Parallel row-range worker for the 32-bit float bilateral filter.
// For every destination pixel it accumulates neighbour values weighted by the
// product of a precomputed space weight (space_weight/space_ofs, one entry per
// kernel tap) and a color weight fetched from the exp LUT (expLUT) with linear
// interpolation between adjacent bins, then normalizes by the weight sum.
// Provides vectorized SSE3 and NEON paths (4 taps per iteration) plus a scalar
// tail loop that also handles the remaining taps when SIMD is unavailable.
class BilateralFilter_32f_Invoker :
    public ParallelLoopBody
{
public:

    // All pointers are borrowed from bilateralFilter_32f and must remain valid
    // while parallel_for_ runs this body.
    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
        const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
        cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
        temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
    {
    }

    // Filters destination rows [range.start, range.end).
    virtual void operator() (const Range& range) const
    {
        int i, j, k;
        Size size = dest->size();
        #if CV_SSE3 || CV_NEON
        // SIMD scratch: LUT indices, partial sums, and a sign mask used to
        // compute |x| by clearing the IEEE sign bit.
        int CV_DECL_ALIGNED(16) idxBuf[4];
        float CV_DECL_ALIGNED(16) bufSum32[4];
        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
        #endif
        #if CV_SSE3
        bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
        #elif CV_NEON
        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
        #endif

        for( i = range.start; i < range.end; i++ )
        {
            // sptr addresses the pixel of the padded image that corresponds to
            // the first pixel of destination row i.
            const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
            float* dptr = dest->ptr<float>(i);

            if( cn == 1 )
            {
                // --- single-channel path ---
                for( j = 0; j < size.width; j++ )
                {
                    float sum = 0, wsum = 0;
                    float val0 = sptr[j];   // center pixel value
                    k = 0;
                    #if CV_SSE3
                    if( haveSSE3 )
                    {
                        // psum accumulates the horizontally-reduced (wsum, sum)
                        // pair in its two low lanes.
                        __m128 psum = _mm_setzero_ps();
                        const __m128 _val0 = _mm_set1_ps(sptr[j]);
                        const __m128 _scale_index = _mm_set1_ps(scale_index);
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk - 4 ; k += 4 )
                        {
                            __m128 _sw    = _mm_loadu_ps(space_weight + k);
                            __m128 _val   = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
                                                       sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
                            // alpha = |val - val0| * scale_index (andnot clears the sign bit)
                            __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);

                            __m128i _idx = _mm_cvtps_epi32(_alpha);
                            _mm_store_si128((__m128i*)idxBuf, _idx);
                            // Keep only the fractional part for the interpolation.
                            _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));

                            // Gather LUT entries at idx and idx+1, then interpolate.
                            __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
                                                         expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
                            __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
                                                         expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);

                            // w = space weight * interpolated color weight
                            __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
                            _val = _mm_mul_ps(_w, _val);

                            // Two horizontal adds collapse (w, w*val) to scalars.
                            _sw = _mm_hadd_ps(_w, _val);
                            _sw = _mm_hadd_ps(_sw, _sw);
                            psum = _mm_add_ps(_sw, psum);
                        }
                        _mm_storel_pi((__m64*)bufSum32, psum);

                        sum = bufSum32[1];
                        wsum = bufSum32[0];
                    }
                    #elif CV_NEON
                    if( haveNEON )
                    {
                        float32x2_t psum = vdup_n_f32(0.0f);
                        // NOTE(review): 'volatile' looks like a compiler workaround — confirm before removing.
                        const volatile float32x4_t _val0 = vdupq_n_f32(sptr[j]);
                        const float32x4_t _scale_index = vdupq_n_f32(scale_index);
                        const uint32x4_t _signMask = vld1q_u32(bufSignMask);

                        for( ; k <= maxk - 4 ; k += 4 )
                        {
                            float32x4_t _sw  = vld1q_f32(space_weight + k);
                            float CV_DECL_ALIGNED(16) _data[] = {sptr[j + space_ofs[k]],   sptr[j + space_ofs[k+1]],
                                                                 sptr[j + space_ofs[k+2]], sptr[j + space_ofs[k+3]],};
                            float32x4_t _val = vld1q_f32(_data);
                            // alpha = |val - val0| * scale_index (vbicq clears the sign bit)
                            float32x4_t _alpha = vsubq_f32(_val, _val0);
                            _alpha = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(_alpha), _signMask));
                            _alpha = vmulq_f32(_alpha, _scale_index);
                            int32x4_t _idx = vcvtq_s32_f32(_alpha);
                            vst1q_s32(idxBuf, _idx);
                            // Keep only the fractional part for the interpolation.
                            _alpha = vsubq_f32(_alpha, vcvtq_f32_s32(_idx));

                            // Gather LUT entries at idx and idx+1 via bufSum32.
                            bufSum32[0] = expLUT[idxBuf[0]];
                            bufSum32[1] = expLUT[idxBuf[1]];
                            bufSum32[2] = expLUT[idxBuf[2]];
                            bufSum32[3] = expLUT[idxBuf[3]];
                            float32x4_t _explut = vld1q_f32(bufSum32);
                            bufSum32[0] = expLUT[idxBuf[0]+1];
                            bufSum32[1] = expLUT[idxBuf[1]+1];
                            bufSum32[2] = expLUT[idxBuf[2]+1];
                            bufSum32[3] = expLUT[idxBuf[3]+1];
                            float32x4_t _explut1 = vld1q_f32(bufSum32);

                            // w = space weight * interpolated color weight
                            float32x4_t _w = vmulq_f32(_sw, vaddq_f32(_explut, vmulq_f32(_alpha, vsubq_f32(_explut1, _explut))));
                            _val = vmulq_f32(_w, _val);

                            // Pairwise adds collapse (w, w*val) into a 2-lane vector.
                            float32x2_t _wval = vpadd_f32(vpadd_f32(vget_low_f32(_w),vget_high_f32(_w)), vpadd_f32(vget_low_f32(_val), vget_high_f32(_val)));
                            psum = vadd_f32(_wval, psum);
                        }
                        sum = vget_lane_f32(psum, 1);
                        wsum = vget_lane_f32(psum, 0);
                    }
                    #endif

                    // Scalar tail: remaining taps (all taps when SIMD is off).
                    for( ; k < maxk; k++ )
                    {
                        float val = sptr[j + space_ofs[k]];
                        float alpha = (float)(std::abs(val - val0)*scale_index);
                        int idx = cvFloor(alpha);
                        alpha -= idx;
                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
                        sum += val*w;
                        wsum += w;
                    }
                    dptr[j] = (float)(sum/wsum);
                }
            }
            else
            {
                // --- 3-channel path: j walks interleaved b,g,r triplets ---
                CV_Assert( cn == 3 );
                for( j = 0; j < size.width*3; j += 3 )
                {
                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
                    float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];   // center pixel
                    k = 0;
                    #if  CV_SSE3
                    if( haveSSE3 )
                    {
                        // sum accumulates (wsum, sum_b, sum_g, sum_r) in its four lanes.
                        __m128 sum = _mm_setzero_ps();
                        const __m128 _b0 = _mm_set1_ps(b0);
                        const __m128 _g0 = _mm_set1_ps(g0);
                        const __m128 _r0 = _mm_set1_ps(r0);
                        const __m128 _scale_index = _mm_set1_ps(scale_index);
                        const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);

                        for( ; k <= maxk-4; k += 4 )
                        {
                            __m128 _sw = _mm_loadu_ps(space_weight + k);

                            const float* const sptr_k0 = sptr + j + space_ofs[k];
                            const float* const sptr_k1 = sptr + j + space_ofs[k+1];
                            const float* const sptr_k2 = sptr + j + space_ofs[k+2];
                            const float* const sptr_k3 = sptr + j + space_ofs[k+3];

                            // Load 4 floats per tap and transpose so _b/_g/_r hold the
                            // b/g/r channels of the 4 taps; the 4th row _z is discarded.
                            __m128 _b = _mm_loadu_ps(sptr_k0);
                            __m128 _g = _mm_loadu_ps(sptr_k1);
                            __m128 _r = _mm_loadu_ps(sptr_k2);
                            __m128 _z = _mm_loadu_ps(sptr_k3);
                            _MM_TRANSPOSE4_PS(_b, _g, _r, _z);

                            // Per-channel absolute differences from the center pixel.
                            __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
                            __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
                            __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));

                            // L1 color distance scaled into LUT index space.
                            __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));

                            __m128i _idx  = _mm_cvtps_epi32(_alpha);
                            _mm_store_si128((__m128i*)idxBuf, _idx);
                            _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));

                            // Gather LUT entries at idx and idx+1, then interpolate.
                            __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
                            __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);

                            __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));

                            _b = _mm_mul_ps(_b, _w);
                            _g = _mm_mul_ps(_g, _w);
                            _r = _mm_mul_ps(_r, _w);

                            // Horizontal adds collapse the 4 taps to (w, b, g, r).
                             _w = _mm_hadd_ps(_w, _b);
                             _g = _mm_hadd_ps(_g, _r);

                             _w = _mm_hadd_ps(_w, _g);
                             sum = _mm_add_ps(sum, _w);
                        }
                        _mm_store_ps(bufSum32, sum);
                        wsum  = bufSum32[0];
                        sum_b = bufSum32[1];
                        sum_g = bufSum32[2];
                        sum_r = bufSum32[3];
                    }
                    #elif CV_NEON
                    if( haveNEON )
                    {
                        // sum accumulates (wsum, sum_b, sum_g, sum_r) in its four lanes.
                        float32x4_t sum = vdupq_n_f32(0.0f);
                        const float32x4_t _b0 = vdupq_n_f32(b0);
                        const float32x4_t _g0 = vdupq_n_f32(g0);
                        const float32x4_t _r0 = vdupq_n_f32(r0);
                        const float32x4_t _scale_index = vdupq_n_f32(scale_index);
                        const uint32x4_t _signMask = vld1q_u32(bufSignMask);

                        for( ; k <= maxk-4; k += 4 )
                        {
                            float32x4_t _sw = vld1q_f32(space_weight + k);

                            const float* const sptr_k0 = sptr + j + space_ofs[k];
                            const float* const sptr_k1 = sptr + j + space_ofs[k+1];
                            const float* const sptr_k2 = sptr + j + space_ofs[k+2];
                            const float* const sptr_k3 = sptr + j + space_ofs[k+3];

                            float32x4_t _v0 = vld1q_f32(sptr_k0);
                            float32x4_t _v1 = vld1q_f32(sptr_k1);
                            float32x4_t _v2 = vld1q_f32(sptr_k2);
                            float32x4_t _v3 = vld1q_f32(sptr_k3);

                            // Transpose so _b/_g/_r hold the b/g/r channels of the 4 taps.
                            float32x4x2_t v01 = vtrnq_f32(_v0, _v1);
                            float32x4x2_t v23 = vtrnq_f32(_v2, _v3);
                            float32x4_t _b = vcombine_f32(vget_low_f32(v01.val[0]), vget_low_f32(v23.val[0]));
                            float32x4_t _g = vcombine_f32(vget_low_f32(v01.val[1]), vget_low_f32(v23.val[1]));
                            float32x4_t _r = vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0]));

                            // Per-channel absolute differences, then scaled L1 distance.
                            float32x4_t _bt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_b, _b0)), _signMask));
                            float32x4_t _gt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_g, _g0)), _signMask));
                            float32x4_t _rt = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vsubq_f32(_r, _r0)), _signMask));
                            float32x4_t _alpha = vmulq_f32(_scale_index, vaddq_f32(_bt, vaddq_f32(_gt, _rt)));

                            int32x4_t _idx = vcvtq_s32_f32(_alpha);
                            vst1q_s32((int*)idxBuf, _idx);
                            // Gather LUT entries at idx and idx+1 via bufSum32.
                            bufSum32[0] = expLUT[idxBuf[0]];
                            bufSum32[1] = expLUT[idxBuf[1]];
                            bufSum32[2] = expLUT[idxBuf[2]];
                            bufSum32[3] = expLUT[idxBuf[3]];
                            float32x4_t _explut = vld1q_f32(bufSum32);
                            bufSum32[0] = expLUT[idxBuf[0]+1];
                            bufSum32[1] = expLUT[idxBuf[1]+1];
                            bufSum32[2] = expLUT[idxBuf[2]+1];
                            bufSum32[3] = expLUT[idxBuf[3]+1];
                            float32x4_t _explut1 = vld1q_f32(bufSum32);

                            float32x4_t _w = vmulq_f32(_sw, vaddq_f32(_explut, vmulq_f32(_alpha, vsubq_f32(_explut1, _explut))));

                            _b = vmulq_f32(_b, _w);
                            _g = vmulq_f32(_g, _w);
                            _r = vmulq_f32(_r, _w);

                            // Pairwise adds collapse the 4 taps to (w, b, g, r).
                            float32x2_t _wb = vpadd_f32(vpadd_f32(vget_low_f32(_w),vget_high_f32(_w)), vpadd_f32(vget_low_f32(_b), vget_high_f32(_b)));
                            float32x2_t _gr = vpadd_f32(vpadd_f32(vget_low_f32(_g),vget_high_f32(_g)), vpadd_f32(vget_low_f32(_r), vget_high_f32(_r)));

                            _w = vcombine_f32(_wb, _gr);
                            sum = vaddq_f32(sum, _w);
                        }
                        vst1q_f32(bufSum32, sum);
                        wsum  = bufSum32[0];
                        sum_b = bufSum32[1];
                        sum_g = bufSum32[2];
                        sum_r = bufSum32[3];
                    }
                    #endif

                    // Scalar tail: remaining taps (all taps when SIMD is off).
                    for(; k < maxk; k++ )
                    {
                        const float* sptr_k = sptr + j + space_ofs[k];
                        float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
                        float alpha = (float)((std::abs(b - b0) +
                            std::abs(g - g0) + std::abs(r - r0))*scale_index);
                        int idx = cvFloor(alpha);
                        alpha -= idx;
                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
                        wsum += w;
                    }
                    // Normalize by the total weight and store the filtered pixel.
                    wsum = 1.f/wsum;
                    b0 = sum_b*wsum;
                    g0 = sum_g*wsum;
                    r0 = sum_r*wsum;
                    dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
                }
            }
        }
    }

private:
    int cn, radius, maxk, *space_ofs;
    const Mat* temp;    // padded source image (borrowed)
    Mat *dest;          // destination image (borrowed)
    float scale_index, *space_weight, *expLUT;
};
4270
4271
4272 static void
4273 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
4274                      double sigma_color, double sigma_space,
4275                      int borderType )
4276 {
4277     int cn = src.channels();
4278     int i, j, maxk, radius;
4279     double minValSrc=-1, maxValSrc=1;
4280     const int kExpNumBinsPerChannel = 1 << 12;
4281     int kExpNumBins = 0;
4282     float lastExpVal = 1.f;
4283     float len, scale_index;
4284     Size size = src.size();
4285
4286     CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
4287
4288     if( sigma_color <= 0 )
4289         sigma_color = 1;
4290     if( sigma_space <= 0 )
4291         sigma_space = 1;
4292
4293     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
4294     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
4295
4296     if( d <= 0 )
4297         radius = cvRound(sigma_space*1.5);
4298     else
4299         radius = d/2;
4300     radius = MAX(radius, 1);
4301     d = radius*2 + 1;
4302     // compute the min/max range for the input image (even if multichannel)
4303
4304     minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
4305     if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
4306     {
4307         src.copyTo(dst);
4308         return;
4309     }
4310
4311     // temporary copy of the image with borders for easy processing
4312     Mat temp;
4313     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
4314     const double insteadNaNValue = -5. * sigma_color;
4315     patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
4316                                         // TODO: make insteadNaNValue avalible in the outside function interface to control the cases breaking the assumption
4317     // allocate lookup tables
4318     std::vector<float> _space_weight(d*d);
4319     std::vector<int> _space_ofs(d*d);
4320     float* space_weight = &_space_weight[0];
4321     int* space_ofs = &_space_ofs[0];
4322
4323     // assign a length which is slightly more than needed
4324     len = (float)(maxValSrc - minValSrc) * cn;
4325     kExpNumBins = kExpNumBinsPerChannel * cn;
4326     std::vector<float> _expLUT(kExpNumBins+2);
4327     float* expLUT = &_expLUT[0];
4328
4329     scale_index = kExpNumBins/len;
4330
4331     // initialize the exp LUT
4332     for( i = 0; i < kExpNumBins+2; i++ )
4333     {
4334         if( lastExpVal > 0.f )
4335         {
4336             double val =  i / scale_index;
4337             expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
4338             lastExpVal = expLUT[i];
4339         }
4340         else
4341             expLUT[i] = 0.f;
4342     }
4343
4344     // initialize space-related bilateral filter coefficients
4345     for( i = -radius, maxk = 0; i <= radius; i++ )
4346         for( j = -radius; j <= radius; j++ )
4347         {
4348             double r = std::sqrt((double)i*i + (double)j*j);
4349             if( r > radius )
4350                 continue;
4351             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
4352             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
4353         }
4354
4355     // parallel_for usage
4356
4357     BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
4358     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
4359 }
4360
4361 }
4362
4363 void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
4364                       double sigmaColor, double sigmaSpace,
4365                       int borderType )
4366 {
4367     CV_INSTRUMENT_REGION()
4368
4369     _dst.create( _src.size(), _src.type() );
4370
4371     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
4372                ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
4373
4374     Mat src = _src.getMat(), dst = _dst.getMat();
4375
4376     if( src.depth() == CV_8U )
4377         bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
4378     else if( src.depth() == CV_32F )
4379         bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
4380     else
4381         CV_Error( CV_StsUnsupportedFormat,
4382         "Bilateral filtering is only implemented for 8u and 32f images" );
4383 }
4384
4385 //////////////////////////////////////////////////////////////////////////////////////////
4386
4387 CV_IMPL void
4388 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
4389           int param1, int param2, double param3, double param4 )
4390 {
4391     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
4392
4393     CV_Assert( dst.size() == src.size() &&
4394         (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );
4395
4396     if( param2 <= 0 )
4397         param2 = param1;
4398
4399     if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
4400         cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1),
4401             smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
4402     else if( smooth_type == CV_GAUSSIAN )
4403         cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
4404     else if( smooth_type == CV_MEDIAN )
4405         cv::medianBlur( src, dst, param1 );
4406     else
4407         cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
4408
4409     if( dst.data != dst0.data )
4410         CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );
4411 }
4412
4413 /* End of file. */