modules/core/src/stat.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "precomp.hpp"
  44 #include <climits>
  45 #include <limits>
  46
  47 #include "opencl_kernels_core.hpp"
  48
  49 namespace cv
  50 {
  51
  52 template<typename T> static inline Scalar rawToScalar(const T& v)
  53 {
  54     Scalar s;
  55     typedef typename DataType<T>::channel_type T1;
  56     int i, n = DataType<T>::channels;
  57     for( i = 0; i < n; i++ )
  58         s.val[i] = ((T1*)&v)[i];
  59     return s;
  60 }
  61
  62 /****************************************************************************************\
  63 *                                        sum                                             *
  64 \****************************************************************************************/
  65
  66 template <typename T, typename ST>
  67 struct Sum_SIMD
  68 {
  69     int operator () (const T *, const uchar *, ST *, int, int) const
  70     {
  71         return 0;
  72     }
  73 };
  74
  75 #if CV_NEON
  76
  77 template <>
  78 struct Sum_SIMD<uchar, int>
  79 {
  80     int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
  81     {
  82         if (mask || (cn != 1 && cn != 2 && cn != 4))
  83             return 0;
  84
  85         int x = 0;
  86         uint32x4_t v_sum = vdupq_n_u32(0u);
  87
  88         for ( ; x <= len - 16; x += 16)
  89         {
  90             uint8x16_t v_src = vld1q_u8(src0 + x);
  91             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
  92
  93             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
  94             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));
  95
  96             v_half = vmovl_u8(vget_high_u8(v_src));
  97             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
  98             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));
  99         }
 100
 101         for ( ; x <= len - 8; x += 8)
 102         {
 103             uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));
 104
 105             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
 106             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
 107         }
 108
 109         unsigned int CV_DECL_ALIGNED(16) ar[4];
 110         vst1q_u32(ar, v_sum);
 111
 112         for (int i = 0; i < 4; i += cn)
 113             for (int j = 0; j < cn; ++j)
 114                 dst[j] += ar[j + i];
 115
 116         return x / cn;
 117     }
 118 };
 119
 120 template <>
 121 struct Sum_SIMD<schar, int>
 122 {
 123     int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
 124     {
 125         if (mask || (cn != 1 && cn != 2 && cn != 4))
 126             return 0;
 127
 128         int x = 0;
 129         int32x4_t v_sum = vdupq_n_s32(0);
 130
 131         for ( ; x <= len - 16; x += 16)
 132         {
 133             int8x16_t v_src = vld1q_s8(src0 + x);
 134             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
 135
 136             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
 137             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));
 138
 139             v_half = vmovl_s8(vget_high_s8(v_src));
 140             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
 141             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));
 142         }
 143
 144         for ( ; x <= len - 8; x += 8)
 145         {
 146             int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));
 147
 148             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
 149             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
 150         }
 151
 152         int CV_DECL_ALIGNED(16) ar[4];
 153         vst1q_s32(ar, v_sum);
 154
 155         for (int i = 0; i < 4; i += cn)
 156             for (int j = 0; j < cn; ++j)
 157                 dst[j] += ar[j + i];
 158
 159         return x / cn;
 160     }
 161 };
 162
 163 template <>
 164 struct Sum_SIMD<ushort, int>
 165 {
 166     int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
 167     {
 168         if (mask || (cn != 1 && cn != 2 && cn != 4))
 169             return 0;
 170
 171         int x = 0;
 172         uint32x4_t v_sum = vdupq_n_u32(0u);
 173
 174         for ( ; x <= len - 8; x += 8)
 175         {
 176             uint16x8_t v_src = vld1q_u16(src0 + x);
 177
 178             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
 179             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
 180         }
 181
 182         for ( ; x <= len - 4; x += 4)
 183             v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x)));
 184
 185         unsigned int CV_DECL_ALIGNED(16) ar[4];
 186         vst1q_u32(ar, v_sum);
 187
 188         for (int i = 0; i < 4; i += cn)
 189             for (int j = 0; j < cn; ++j)
 190                 dst[j] += ar[j + i];
 191
 192         return x / cn;
 193     }
 194 };
 195
 196 template <>
 197 struct Sum_SIMD<short, int>
 198 {
 199     int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
 200     {
 201         if (mask || (cn != 1 && cn != 2 && cn != 4))
 202             return 0;
 203
 204         int x = 0;
 205         int32x4_t v_sum = vdupq_n_s32(0u);
 206
 207         for ( ; x <= len - 8; x += 8)
 208         {
 209             int16x8_t v_src = vld1q_s16(src0 + x);
 210
 211             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
 212             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
 213         }
 214
 215         for ( ; x <= len - 4; x += 4)
 216             v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x)));
 217
 218         int CV_DECL_ALIGNED(16) ar[4];
 219         vst1q_s32(ar, v_sum);
 220
 221         for (int i = 0; i < 4; i += cn)
 222             for (int j = 0; j < cn; ++j)
 223                 dst[j] += ar[j + i];
 224
 225         return x / cn;
 226     }
 227 };
 228
 229 #endif
 230
 231 template<typename T, typename ST>
 232 static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
 233 {
 234     const T* src = src0;
 235     if( !mask )
 236     {
 237         Sum_SIMD<T, ST> vop;
 238         int i = vop(src0, mask, dst, len, cn), k = cn % 4;
 239         src += i * cn;
 240
 241         if( k == 1 )
 242         {
 243             ST s0 = dst[0];
 244
 245             #if CV_ENABLE_UNROLLED
 246             for(; i <= len - 4; i += 4, src += cn*4 )
 247                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
 248             #endif
 249             for( ; i < len; i++, src += cn )
 250                 s0 += src[0];
 251             dst[0] = s0;
 252         }
 253         else if( k == 2 )
 254         {
 255             ST s0 = dst[0], s1 = dst[1];
 256             for( ; i < len; i++, src += cn )
 257             {
 258                 s0 += src[0];
 259                 s1 += src[1];
 260             }
 261             dst[0] = s0;
 262             dst[1] = s1;
 263         }
 264         else if( k == 3 )
 265         {
 266             ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 267             for( ; i < len; i++, src += cn )
 268             {
 269                 s0 += src[0];
 270                 s1 += src[1];
 271                 s2 += src[2];
 272             }
 273             dst[0] = s0;
 274             dst[1] = s1;
 275             dst[2] = s2;
 276         }
 277
 278         for( ; k < cn; k += 4 )
 279         {
 280             src = src0 + i*cn + k;
 281             ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
 282             for( ; i < len; i++, src += cn )
 283             {
 284                 s0 += src[0]; s1 += src[1];
 285                 s2 += src[2]; s3 += src[3];
 286             }
 287             dst[k] = s0;
 288             dst[k+1] = s1;
 289             dst[k+2] = s2;
 290             dst[k+3] = s3;
 291         }
 292         return len;
 293     }
 294
 295     int i, nzm = 0;
 296     if( cn == 1 )
 297     {
 298         ST s = dst[0];
 299         for( i = 0; i < len; i++ )
 300             if( mask[i] )
 301             {
 302                 s += src[i];
 303                 nzm++;
 304             }
 305         dst[0] = s;
 306     }
 307     else if( cn == 3 )
 308     {
 309         ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 310         for( i = 0; i < len; i++, src += 3 )
 311             if( mask[i] )
 312             {
 313                 s0 += src[0];
 314                 s1 += src[1];
 315                 s2 += src[2];
 316                 nzm++;
 317             }
 318         dst[0] = s0;
 319         dst[1] = s1;
 320         dst[2] = s2;
 321     }
 322     else
 323     {
 324         for( i = 0; i < len; i++, src += cn )
 325             if( mask[i] )
 326             {
 327                 int k = 0;
 328                 #if CV_ENABLE_UNROLLED
 329                 for( ; k <= cn - 4; k += 4 )
 330                 {
 331                     ST s0, s1;
 332                     s0 = dst[k] + src[k];
 333                     s1 = dst[k+1] + src[k+1];
 334                     dst[k] = s0; dst[k+1] = s1;
 335                     s0 = dst[k+2] + src[k+2];
 336                     s1 = dst[k+3] + src[k+3];
 337                     dst[k+2] = s0; dst[k+3] = s1;
 338                 }
 339                 #endif
 340                 for( ; k < cn; k++ )
 341                     dst[k] += src[k];
 342                 nzm++;
 343             }
 344     }
 345     return nzm;
 346 }
 347
 348
 349 static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
 350 { return sum_(src, mask, dst, len, cn); }
 351
 352 static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
 353 { return sum_(src, mask, dst, len, cn); }
 354
 355 static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
 356 { return sum_(src, mask, dst, len, cn); }
 357
 358 static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
 359 { return sum_(src, mask, dst, len, cn); }
 360
 361 static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
 362 { return sum_(src, mask, dst, len, cn); }
 363
 364 static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
 365 { return sum_(src, mask, dst, len, cn); }
 366
 367 static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
 368 { return sum_(src, mask, dst, len, cn); }
 369
 370 typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);
 371
 372 static SumFunc getSumFunc(int depth)
 373 {
 374     static SumFunc sumTab[] =
 375     {
 376         (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
 377         (SumFunc)sum16u, (SumFunc)sum16s,
 378         (SumFunc)sum32s,
 379         (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
 380         0
 381     };
 382
 383     return sumTab[depth];
 384 }
 385
 386 template<typename T>
 387 static int countNonZero_(const T* src, int len )
 388 {
 389     int i=0, nz = 0;
 390     #if CV_ENABLE_UNROLLED
 391     for(; i <= len - 4; i += 4 )
 392         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
 393     #endif
 394     for( ; i < len; i++ )
 395         nz += src[i] != 0;
 396     return nz;
 397 }
 398
 399 static int countNonZero8u( const uchar* src, int len )
 400 {
 401     int i=0, nz = 0;
 402 #if CV_SSE2
 403     if(USE_SSE2)//5x-6x
 404     {
 405         __m128i pattern = _mm_setzero_si128 ();
 406         static uchar tab[256];
 407         static volatile bool initialized = false;
 408         if( !initialized )
 409         {
 410             // we compute inverse popcount table,
 411             // since we pass (img[x] == 0) mask as index in the table.
 412             for( int j = 0; j < 256; j++ )
 413             {
 414                 int val = 0;
 415                 for( int mask = 1; mask < 256; mask += mask )
 416                     val += (j & mask) == 0;
 417                 tab[j] = (uchar)val;
 418             }
 419             initialized = true;
 420         }
 421
 422         for (; i<=len-16; i+=16)
 423         {
 424             __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
 425             int val = _mm_movemask_epi8(_mm_cmpeq_epi8(r0, pattern));
 426             nz += tab[val & 255] + tab[val >> 8];
 427         }
 428     }
 429 #elif CV_NEON
 430     int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
 431     uint32x4_t v_nz = vdupq_n_u32(0u);
 432     uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
 433     const uchar * src0 = src;
 434
 435     while( i < len0 )
 436     {
 437         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 438
 439         while (j < blockSizei)
 440         {
 441             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 442             uint8x16_t v_pz = v_zero;
 443
 444             for( ; k <= blockSizej - 16; k += 16 )
 445                 v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));
 446
 447             uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
 448             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
 449             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);
 450
 451             src0 += blockSizej;
 452             j += blockSizej;
 453         }
 454
 455         i += blockSizei;
 456     }
 457
 458     CV_DECL_ALIGNED(16) unsigned int buf[4];
 459     vst1q_u32(buf, v_nz);
 460     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 461 #endif
 462     for( ; i < len; i++ )
 463         nz += src[i] != 0;
 464     return nz;
 465 }
 466
 467 static int countNonZero16u( const ushort* src, int len )
 468 {
 469     int i = 0, nz = 0;
 470 #if CV_NEON
 471     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 472     uint32x4_t v_nz = vdupq_n_u32(0u);
 473     uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
 474
 475     while( i < len0 )
 476     {
 477         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 478
 479         while (j < blockSizei)
 480         {
 481             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 482             uint16x8_t v_pz = v_zero;
 483
 484             for( ; k <= blockSizej - 8; k += 8 )
 485                 v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));
 486
 487             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 488
 489             src += blockSizej;
 490             j += blockSizej;
 491         }
 492
 493         i += blockSizei;
 494     }
 495
 496     CV_DECL_ALIGNED(16) unsigned int buf[4];
 497     vst1q_u32(buf, v_nz);
 498     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 499 #endif
 500     return nz + countNonZero_(src, len - i);
 501 }
 502
 503 static int countNonZero32s( const int* src, int len )
 504 {
 505     int i = 0, nz = 0;
 506 #if CV_NEON
 507     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 508     uint32x4_t v_nz = vdupq_n_u32(0u);
 509     int32x4_t v_zero = vdupq_n_s32(0.0f);
 510     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
 511
 512     while( i < len0 )
 513     {
 514         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 515
 516         while (j < blockSizei)
 517         {
 518             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 519             uint16x8_t v_pz = v_zerou;
 520
 521             for( ; k <= blockSizej - 8; k += 8 )
 522                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
 523                                                               vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));
 524
 525             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 526
 527             src += blockSizej;
 528             j += blockSizej;
 529         }
 530
 531         i += blockSizei;
 532     }
 533
 534     CV_DECL_ALIGNED(16) unsigned int buf[4];
 535     vst1q_u32(buf, v_nz);
 536     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 537 #endif
 538     return nz + countNonZero_(src, len - i);
 539 }
 540
 541 static int countNonZero32f( const float* src, int len )
 542 {
 543     int i = 0, nz = 0;
 544 #if CV_NEON
 545     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 546     uint32x4_t v_nz = vdupq_n_u32(0u);
 547     float32x4_t v_zero = vdupq_n_f32(0.0f);
 548     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
 549
 550     while( i < len0 )
 551     {
 552         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 553
 554         while (j < blockSizei)
 555         {
 556             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 557             uint16x8_t v_pz = v_zerou;
 558
 559             for( ; k <= blockSizej - 8; k += 8 )
 560                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
 561                                                               vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));
 562
 563             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 564
 565             src += blockSizej;
 566             j += blockSizej;
 567         }
 568
 569         i += blockSizei;
 570     }
 571
 572     CV_DECL_ALIGNED(16) unsigned int buf[4];
 573     vst1q_u32(buf, v_nz);
 574     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 575 #endif
 576     return nz + countNonZero_(src, len - i);
 577 }
 578
 579 static int countNonZero64f( const double* src, int len )
 580 { return countNonZero_(src, len); }
 581
 582 typedef int (*CountNonZeroFunc)(const uchar*, int);
 583
 584 static CountNonZeroFunc getCountNonZeroTab(int depth)
 585 {
 586     static CountNonZeroFunc countNonZeroTab[] =
 587     {
 588         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
 589         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
 590         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
 591         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
 592     };
 593
 594     return countNonZeroTab[depth];
 595 }
 596
 597 template<typename T, typename ST, typename SQT>
 598 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
 599 {
 600     const T* src = src0;
 601
 602     if( !mask )
 603     {
 604         int i;
 605         int k = cn % 4;
 606
 607         if( k == 1 )
 608         {
 609             ST s0 = sum[0];
 610             SQT sq0 = sqsum[0];
 611             for( i = 0; i < len; i++, src += cn )
 612             {
 613                 T v = src[0];
 614                 s0 += v; sq0 += (SQT)v*v;
 615             }
 616             sum[0] = s0;
 617             sqsum[0] = sq0;
 618         }
 619         else if( k == 2 )
 620         {
 621             ST s0 = sum[0], s1 = sum[1];
 622             SQT sq0 = sqsum[0], sq1 = sqsum[1];
 623             for( i = 0; i < len; i++, src += cn )
 624             {
 625                 T v0 = src[0], v1 = src[1];
 626                 s0 += v0; sq0 += (SQT)v0*v0;
 627                 s1 += v1; sq1 += (SQT)v1*v1;
 628             }
 629             sum[0] = s0; sum[1] = s1;
 630             sqsum[0] = sq0; sqsum[1] = sq1;
 631         }
 632         else if( k == 3 )
 633         {
 634             ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 635             SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 636             for( i = 0; i < len; i++, src += cn )
 637             {
 638                 T v0 = src[0], v1 = src[1], v2 = src[2];
 639                 s0 += v0; sq0 += (SQT)v0*v0;
 640                 s1 += v1; sq1 += (SQT)v1*v1;
 641                 s2 += v2; sq2 += (SQT)v2*v2;
 642             }
 643             sum[0] = s0; sum[1] = s1; sum[2] = s2;
 644             sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 645         }
 646
 647         for( ; k < cn; k += 4 )
 648         {
 649             src = src0 + k;
 650             ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
 651             SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
 652             for( i = 0; i < len; i++, src += cn )
 653             {
 654                 T v0, v1;
 655                 v0 = src[0], v1 = src[1];
 656                 s0 += v0; sq0 += (SQT)v0*v0;
 657                 s1 += v1; sq1 += (SQT)v1*v1;
 658                 v0 = src[2], v1 = src[3];
 659                 s2 += v0; sq2 += (SQT)v0*v0;
 660                 s3 += v1; sq3 += (SQT)v1*v1;
 661             }
 662             sum[k] = s0; sum[k+1] = s1;
 663             sum[k+2] = s2; sum[k+3] = s3;
 664             sqsum[k] = sq0; sqsum[k+1] = sq1;
 665             sqsum[k+2] = sq2; sqsum[k+3] = sq3;
 666         }
 667         return len;
 668     }
 669
 670     int i, nzm = 0;
 671
 672     if( cn == 1 )
 673     {
 674         ST s0 = sum[0];
 675         SQT sq0 = sqsum[0];
 676         for( i = 0; i < len; i++ )
 677             if( mask[i] )
 678             {
 679                 T v = src[i];
 680                 s0 += v; sq0 += (SQT)v*v;
 681                 nzm++;
 682             }
 683         sum[0] = s0;
 684         sqsum[0] = sq0;
 685     }
 686     else if( cn == 3 )
 687     {
 688         ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 689         SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 690         for( i = 0; i < len; i++, src += 3 )
 691             if( mask[i] )
 692             {
 693                 T v0 = src[0], v1 = src[1], v2 = src[2];
 694                 s0 += v0; sq0 += (SQT)v0*v0;
 695                 s1 += v1; sq1 += (SQT)v1*v1;
 696                 s2 += v2; sq2 += (SQT)v2*v2;
 697                 nzm++;
 698             }
 699         sum[0] = s0; sum[1] = s1; sum[2] = s2;
 700         sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 701     }
 702     else
 703     {
 704         for( i = 0; i < len; i++, src += cn )
 705             if( mask[i] )
 706             {
 707                 for( int k = 0; k < cn; k++ )
 708                 {
 709                     T v = src[k];
 710                     ST s = sum[k] + v;
 711                     SQT sq = sqsum[k] + (SQT)v*v;
 712                     sum[k] = s; sqsum[k] = sq;
 713                 }
 714                 nzm++;
 715             }
 716     }
 717     return nzm;
 718 }
 719
 720
 721 static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 722 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 723
 724 static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 725 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 726
 727 static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 728 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 729
 730 static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 731 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 732
 733 static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 734 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 735
 736 static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 737 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 738
 739 static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 740 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 741
 742 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
 743
 744 static SumSqrFunc getSumSqrTab(int depth)
 745 {
 746     static SumSqrFunc sumSqrTab[] =
 747     {
 748         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
 749         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
 750     };
 751
 752     return sumSqrTab[depth];
 753 }
 754
 755 #ifdef HAVE_OPENCL
 756
 757 template <typename T> Scalar ocl_part_sum(Mat m)
 758 {
 759     CV_Assert(m.rows == 1);
 760
 761     Scalar s = Scalar::all(0);
 762     int cn = m.channels();
 763     const T * const ptr = m.ptr<T>(0);
 764
 765     for (int x = 0, w = m.cols * cn; x < w; )
 766         for (int c = 0; c < cn; ++c, ++x)
 767             s[c] += ptr[x];
 768
 769     return s;
 770 }
 771
 772 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 773
 774 static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
 775                      InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
 776 {
 777     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 778
 779     const ocl::Device & dev = ocl::Device::getDefault();
 780     bool doubleSupport = dev.doubleFPConfig() > 0,
 781         haveMask = _mask.kind() != _InputArray::NONE,
 782         haveSrc2 = _src2.kind() != _InputArray::NONE;
 783     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
 784             kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
 785             mcn = std::max(cn, kercn);
 786     CV_Assert(!haveSrc2 || _src2.type() == type);
 787     int convert_cn = haveSrc2 ? mcn : cn;
 788
 789     if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
 790         return false;
 791
 792     int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
 793     size_t wgs = dev.maxWorkGroupSize();
 794
 795     int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
 796             dtype = CV_MAKE_TYPE(ddepth, cn);
 797     CV_Assert(!haveMask || _mask.type() == CV_8UC1);
 798
 799     int wgs2_aligned = 1;
 800     while (wgs2_aligned < (int)wgs)
 801         wgs2_aligned <<= 1;
 802     wgs2_aligned >>= 1;
 803
 804     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
 805     char cvt[2][40];
 806     String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
 807                          " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
 808                          ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
 809                          ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
 810                          ocl::typeToStr(ddepth), ddepth, cn,
 811                          ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
 812                          opMap[sum_op], (int)wgs, wgs2_aligned,
 813                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 814                          haveMask ? " -D HAVE_MASK" : "",
 815                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
 816                          haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
 817                          haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
 818                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
 819                          depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
 820
 821     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
 822     if (k.empty())
 823         return false;
 824
 825     UMat src = _src.getUMat(), src2 = _src2.getUMat(),
 826         db(1, dbsize, dtype), mask = _mask.getUMat();
 827
 828     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
 829             dbarg = ocl::KernelArg::PtrWriteOnly(db),
 830             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
 831             src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
 832
 833     if (haveMask)
 834     {
 835         if (haveSrc2)
 836             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
 837         else
 838             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
 839     }
 840     else
 841     {
 842         if (haveSrc2)
 843             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
 844         else
 845             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
 846     }
 847
 848     size_t globalsize = ngroups * wgs;
 849     if (k.run(1, &globalsize, &wgs, false))
 850     {
 851         typedef Scalar (*part_sum)(Mat m);
 852         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
 853                 func = funcs[ddepth - CV_32S];
 854
 855         Mat mres = db.getMat(ACCESS_READ);
 856         if (calc2)
 857             const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
 858
 859         res = func(mres.colRange(0, ngroups));
 860         return true;
 861     }
 862     return false;
 863 }
 864
 865 #endif
 866
 867 }
 868
 869 cv::Scalar cv::sum( InputArray _src )
 870 {
 871 #ifdef HAVE_OPENCL
 872     Scalar _res;
 873     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
 874                 ocl_sum(_src, _res, OCL_OP_SUM),
 875                 _res)
 876 #endif
 877
 878     Mat src = _src.getMat();
 879     int k, cn = src.channels(), depth = src.depth();
 880
 881 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 882     size_t total_size = src.total();
 883     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
 884     if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
 885     {
 886         IppiSize sz = { cols, rows };
 887         int type = src.type();
 888         typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
 889         typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
 890         ippiSumFuncHint ippFuncHint =
 891             type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
 892             type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
 893             type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
 894             0;
 895         ippiSumFuncNoHint ippFuncNoHint =
 896             type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
 897             type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
 898             type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
 899             type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
 900             type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
 901             type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
 902             type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
 903             type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
 904             type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
 905             0;
 906         CV_Assert(!ippFuncHint || !ippFuncNoHint);
 907         if( ippFuncHint || ippFuncNoHint )
 908         {
 909             Ipp64f res[4];
 910             IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
 911                             ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
 912             if( ret >= 0 )
 913             {
 914                 Scalar sc;
 915                 for( int i = 0; i < cn; i++ )
 916                     sc[i] = res[i];
 917                 return sc;
 918             }
 919             setIppErrorStatus();
 920         }
 921     }
 922 #endif
 923
 924     SumFunc func = getSumFunc(depth);
 925
 926     CV_Assert( cn <= 4 && func != 0 );
 927
 928     const Mat* arrays[] = {&src, 0};
 929     uchar* ptrs[1];
 930     NAryMatIterator it(arrays, ptrs);
 931     Scalar s;
 932     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
 933     int j, count = 0;
 934     AutoBuffer<int> _buf;
 935     int* buf = (int*)&s[0];
 936     size_t esz = 0;
 937     bool blockSum = depth < CV_32S;
 938
 939     if( blockSum )
 940     {
 941         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
 942         blockSize = std::min(blockSize, intSumBlockSize);
 943         _buf.allocate(cn);
 944         buf = _buf;
 945
 946         for( k = 0; k < cn; k++ )
 947             buf[k] = 0;
 948         esz = src.elemSize();
 949     }
 950
 951     for( size_t i = 0; i < it.nplanes; i++, ++it )
 952     {
 953         for( j = 0; j < total; j += blockSize )
 954         {
 955             int bsz = std::min(total - j, blockSize);
 956             func( ptrs[0], 0, (uchar*)buf, bsz, cn );
 957             count += bsz;
 958             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
 959             {
 960                 for( k = 0; k < cn; k++ )
 961                 {
 962                     s[k] += buf[k];
 963                     buf[k] = 0;
 964                 }
 965                 count = 0;
 966             }
 967             ptrs[0] += bsz*esz;
 968         }
 969     }
 970     return s;
 971 }
 972
 973 #ifdef HAVE_OPENCL
 974
 975 namespace cv {
 976
 977 static bool ocl_countNonZero( InputArray _src, int & res )
 978 {
 979     int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
 980     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 981
 982     if (depth == CV_64F && !doubleSupport)
 983         return false;
 984
 985     int dbsize = ocl::Device::getDefault().maxComputeUnits();
 986     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 987
 988     int wgs2_aligned = 1;
 989     while (wgs2_aligned < (int)wgs)
 990         wgs2_aligned <<= 1;
 991     wgs2_aligned >>= 1;
 992
 993     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
 994                   format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
 995                          " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
 996                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
 997                          ocl::typeToStr(depth), (int)wgs, kercn,
 998                          wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 999                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
1000     if (k.empty())
1001         return false;
1002
1003     UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
1004     k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1005            dbsize, ocl::KernelArg::PtrWriteOnly(db));
1006
1007     size_t globalsize = dbsize * wgs;
1008     if (k.run(1, &globalsize, &wgs, true))
1009         return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
1010     return false;
1011 }
1012
1013 }
1014
1015 #endif
1016
1017 int cv::countNonZero( InputArray _src )
1018 {
1019     int type = _src.type(), cn = CV_MAT_CN(type);
1020     CV_Assert( cn == 1 );
1021
1022 #ifdef HAVE_OPENCL
1023     int res = -1;
1024     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1025                 ocl_countNonZero(_src, res),
1026                 res)
1027 #endif
1028
1029     Mat src = _src.getMat();
1030
1031 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0
1032     if (src.dims <= 2 || src.isContinuous())
1033     {
1034         IppiSize roiSize = { src.cols, src.rows };
1035         Ipp32s count = 0, srcstep = (Ipp32s)src.step;
1036         IppStatus status = (IppStatus)-1;
1037
1038         if (src.isContinuous())
1039         {
1040             roiSize.width = (Ipp32s)src.total();
1041             roiSize.height = 1;
1042             srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
1043         }
1044
1045         int depth = CV_MAT_DEPTH(type);
1046         if (depth == CV_8U)
1047             status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
1048         else if (depth == CV_32F)
1049             status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
1050
1051         if (status >= 0)
1052             return (Ipp32s)src.total() - count;
1053         setIppErrorStatus();
1054     }
1055 #endif
1056
1057     CountNonZeroFunc func = getCountNonZeroTab(src.depth());
1058     CV_Assert( func != 0 );
1059
1060     const Mat* arrays[] = {&src, 0};
1061     uchar* ptrs[1];
1062     NAryMatIterator it(arrays, ptrs);
1063     int total = (int)it.size, nz = 0;
1064
1065     for( size_t i = 0; i < it.nplanes; i++, ++it )
1066         nz += func( ptrs[0], total );
1067
1068     return nz;
1069 }
1070
1071 cv::Scalar cv::mean( InputArray _src, InputArray _mask )
1072 {
1073     Mat src = _src.getMat(), mask = _mask.getMat();
1074     CV_Assert( mask.empty() || mask.type() == CV_8U );
1075
1076     int k, cn = src.channels(), depth = src.depth();
1077
1078 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1079     size_t total_size = src.total();
1080     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1081     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1082     {
1083         IppiSize sz = { cols, rows };
1084         int type = src.type();
1085         if( !mask.empty() )
1086         {
1087             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
1088             ippiMaskMeanFuncC1 ippFuncC1 =
1089             type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
1090             type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
1091             type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
1092             0;
1093             if( ippFuncC1 )
1094             {
1095                 Ipp64f res;
1096                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
1097                     return Scalar(res);
1098                 setIppErrorStatus();
1099             }
1100             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
1101             ippiMaskMeanFuncC3 ippFuncC3 =
1102             type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
1103             type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
1104             type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
1105             0;
1106             if( ippFuncC3 )
1107             {
1108                 Ipp64f res1, res2, res3;
1109                 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
1110                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
1111                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
1112                 {
1113                     return Scalar(res1, res2, res3);
1114                 }
1115                 setIppErrorStatus();
1116             }
1117         }
1118         else
1119         {
1120             typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
1121             typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
1122             ippiMeanFuncHint ippFuncHint =
1123                 type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
1124                 type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
1125                 type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
1126                 0;
1127             ippiMeanFuncNoHint ippFuncNoHint =
1128                 type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
1129                 type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
1130                 type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
1131                 type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
1132                 type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
1133                 type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
1134                 type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
1135                 type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
1136                 type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
1137                 0;
1138             // Make sure only zero or one version of the function pointer is valid
1139             CV_Assert(!ippFuncHint || !ippFuncNoHint);
1140             if( ippFuncHint || ippFuncNoHint )
1141             {
1142                 Ipp64f res[4];
1143                 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
1144                                 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
1145                 if( ret >= 0 )
1146                 {
1147                     Scalar sc;
1148                     for( int i = 0; i < cn; i++ )
1149                         sc[i] = res[i];
1150                     return sc;
1151                 }
1152                 setIppErrorStatus();
1153             }
1154         }
1155     }
1156 #endif
1157
1158     SumFunc func = getSumFunc(depth);
1159
1160     CV_Assert( cn <= 4 && func != 0 );
1161
1162     const Mat* arrays[] = {&src, &mask, 0};
1163     uchar* ptrs[2];
1164     NAryMatIterator it(arrays, ptrs);
1165     Scalar s;
1166     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1167     int j, count = 0;
1168     AutoBuffer<int> _buf;
1169     int* buf = (int*)&s[0];
1170     bool blockSum = depth <= CV_16S;
1171     size_t esz = 0, nz0 = 0;
1172
1173     if( blockSum )
1174     {
1175         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
1176         blockSize = std::min(blockSize, intSumBlockSize);
1177         _buf.allocate(cn);
1178         buf = _buf;
1179
1180         for( k = 0; k < cn; k++ )
1181             buf[k] = 0;
1182         esz = src.elemSize();
1183     }
1184
1185     for( size_t i = 0; i < it.nplanes; i++, ++it )
1186     {
1187         for( j = 0; j < total; j += blockSize )
1188         {
1189             int bsz = std::min(total - j, blockSize);
1190             int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
1191             count += nz;
1192             nz0 += nz;
1193             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1194             {
1195                 for( k = 0; k < cn; k++ )
1196                 {
1197                     s[k] += buf[k];
1198                     buf[k] = 0;
1199                 }
1200                 count = 0;
1201             }
1202             ptrs[0] += bsz*esz;
1203             if( ptrs[1] )
1204                 ptrs[1] += bsz;
1205         }
1206     }
1207     return s*(nz0 ? 1./nz0 : 0);
1208 }
1209
1210 #ifdef HAVE_OPENCL
1211
1212 namespace cv {
1213
1214 static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
1215 {
1216     bool haveMask = _mask.kind() != _InputArray::NONE;
1217     int nz = haveMask ? -1 : (int)_src.total();
1218     Scalar mean, stddev;
1219
1220     {
1221         int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1222         bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
1223                 isContinuous = _src.isContinuous(),
1224                 isMaskContinuous = _mask.isContinuous();
1225         const ocl::Device &defDev = ocl::Device::getDefault();
1226         int groups = defDev.maxComputeUnits();
1227         if (defDev.isIntel())
1228         {
1229             static const int subSliceEUCount = 10;
1230             groups = (groups / subSliceEUCount) * 2;
1231         }
1232         size_t wgs = defDev.maxWorkGroupSize();
1233
1234         int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
1235                 dtype = CV_MAKE_TYPE(ddepth, cn),
1236                 sqdtype = CV_MAKETYPE(sqddepth, cn);
1237         CV_Assert(!haveMask || _mask.type() == CV_8UC1);
1238
1239         int wgs2_aligned = 1;
1240         while (wgs2_aligned < (int)wgs)
1241             wgs2_aligned <<= 1;
1242         wgs2_aligned >>= 1;
1243
1244         if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
1245             return false;
1246
1247         char cvt[2][40];
1248         String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
1249                              " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
1250                              " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
1251                              ocl::typeToStr(type), ocl::typeToStr(depth),
1252                              ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
1253                              ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
1254                              ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
1255                              cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
1256                              isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
1257                              ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
1258                              (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
1259                              doubleSupport ? " -D DOUBLE_SUPPORT" : "");
1260
1261         ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
1262         if (k.empty())
1263             return false;
1264
1265         int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
1266                                CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
1267         UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
1268
1269         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
1270                 dbarg = ocl::KernelArg::PtrWriteOnly(db),
1271                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
1272
1273         if (haveMask)
1274             k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
1275         else
1276             k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);
1277
1278         size_t globalsize = groups * wgs;
1279         if (!k.run(1, &globalsize, &wgs, false))
1280             return false;
1281
1282         typedef Scalar (* part_sum)(Mat m);
1283         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
1284         Mat dbm = db.getMat(ACCESS_READ);
1285
1286         mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
1287         stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));
1288
1289         if (haveMask)
1290             nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
1291                                                  groups * (CV_ELEM_SIZE(dtype) +
1292                                                            CV_ELEM_SIZE(sqdtype))))[0]);
1293     }
1294
1295     double total = nz != 0 ? 1.0 / nz : 0;
1296     int k, j, cn = _src.channels();
1297     for (int i = 0; i < cn; ++i)
1298     {
1299         mean[i] *= total;
1300         stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
1301     }
1302
1303     for( j = 0; j < 2; j++ )
1304     {
1305         const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
1306         _OutputArray _dst = j == 0 ? _mean : _sdv;
1307         if( !_dst.needed() )
1308             continue;
1309
1310         if( !_dst.fixedSize() )
1311             _dst.create(cn, 1, CV_64F, -1, true);
1312         Mat dst = _dst.getMat();
1313         int dcn = (int)dst.total();
1314         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1315                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1316         double* dptr = dst.ptr<double>();
1317         for( k = 0; k < cn; k++ )
1318             dptr[k] = sptr[k];
1319         for( ; k < dcn; k++ )
1320             dptr[k] = 0;
1321     }
1322
1323     return true;
1324 }
1325
1326 }
1327
1328 #endif
1329
1330 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
1331 {
1332     CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1333                ocl_meanStdDev(_src, _mean, _sdv, _mask))
1334
1335     Mat src = _src.getMat(), mask = _mask.getMat();
1336     CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
1337
1338     int k, cn = src.channels(), depth = src.depth();
1339
1340 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1341     size_t total_size = src.total();
1342     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1343     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1344     {
1345         Ipp64f mean_temp[3];
1346         Ipp64f stddev_temp[3];
1347         Ipp64f *pmean = &mean_temp[0];
1348         Ipp64f *pstddev = &stddev_temp[0];
1349         Mat mean, stddev;
1350         int dcn_mean = -1;
1351         if( _mean.needed() )
1352         {
1353             if( !_mean.fixedSize() )
1354                 _mean.create(cn, 1, CV_64F, -1, true);
1355             mean = _mean.getMat();
1356             dcn_mean = (int)mean.total();
1357             pmean = mean.ptr<Ipp64f>();
1358         }
1359         int dcn_stddev = -1;
1360         if( _sdv.needed() )
1361         {
1362             if( !_sdv.fixedSize() )
1363                 _sdv.create(cn, 1, CV_64F, -1, true);
1364             stddev = _sdv.getMat();
1365             dcn_stddev = (int)stddev.total();
1366             pstddev = stddev.ptr<Ipp64f>();
1367         }
1368         for( int c = cn; c < dcn_mean; c++ )
1369             pmean[c] = 0;
1370         for( int c = cn; c < dcn_stddev; c++ )
1371             pstddev[c] = 0;
1372         IppiSize sz = { cols, rows };
1373         int type = src.type();
1374         if( !mask.empty() )
1375         {
1376             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1377             ippiMaskMeanStdDevFuncC1 ippFuncC1 =
1378             type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
1379             type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
1380             type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
1381             0;
1382             if( ippFuncC1 )
1383             {
1384                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
1385                     return;
1386                 setIppErrorStatus();
1387             }
1388             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1389             ippiMaskMeanStdDevFuncC3 ippFuncC3 =
1390             type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
1391             type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
1392             type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
1393             0;
1394             if( ippFuncC3 )
1395             {
1396                 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
1397                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
1398                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1399                     return;
1400                 setIppErrorStatus();
1401             }
1402         }
1403         else
1404         {
1405             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1406             ippiMeanStdDevFuncC1 ippFuncC1 =
1407             type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
1408             type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
1409 #if (IPP_VERSION_X100 >= 801)
1410             type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
1411 #endif
1412             0;
1413             if( ippFuncC1 )
1414             {
1415                 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
1416                     return;
1417                 setIppErrorStatus();
1418             }
1419             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1420             ippiMeanStdDevFuncC3 ippFuncC3 =
1421             type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
1422             type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
1423             type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
1424             0;
1425             if( ippFuncC3 )
1426             {
1427                 if( ippFuncC3(src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
1428                     ippFuncC3(src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
1429                     ippFuncC3(src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1430                     return;
1431                 setIppErrorStatus();
1432             }
1433         }
1434     }
1435 #endif
1436
1437
1438     SumSqrFunc func = getSumSqrTab(depth);
1439
1440     CV_Assert( func != 0 );
1441
1442     const Mat* arrays[] = {&src, &mask, 0};
1443     uchar* ptrs[2];
1444     NAryMatIterator it(arrays, ptrs);
1445     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1446     int j, count = 0, nz0 = 0;
1447     AutoBuffer<double> _buf(cn*4);
1448     double *s = (double*)_buf, *sq = s + cn;
1449     int *sbuf = (int*)s, *sqbuf = (int*)sq;
1450     bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
1451     size_t esz = 0;
1452
1453     for( k = 0; k < cn; k++ )
1454         s[k] = sq[k] = 0;
1455
1456     if( blockSum )
1457     {
1458         intSumBlockSize = 1 << 15;
1459         blockSize = std::min(blockSize, intSumBlockSize);
1460         sbuf = (int*)(sq + cn);
1461         if( blockSqSum )
1462             sqbuf = sbuf + cn;
1463         for( k = 0; k < cn; k++ )
1464             sbuf[k] = sqbuf[k] = 0;
1465         esz = src.elemSize();
1466     }
1467
1468     for( size_t i = 0; i < it.nplanes; i++, ++it )
1469     {
1470         for( j = 0; j < total; j += blockSize )
1471         {
1472             int bsz = std::min(total - j, blockSize);
1473             int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
1474             count += nz;
1475             nz0 += nz;
1476             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1477             {
1478                 for( k = 0; k < cn; k++ )
1479                 {
1480                     s[k] += sbuf[k];
1481                     sbuf[k] = 0;
1482                 }
1483                 if( blockSqSum )
1484                 {
1485                     for( k = 0; k < cn; k++ )
1486                     {
1487                         sq[k] += sqbuf[k];
1488                         sqbuf[k] = 0;
1489                     }
1490                 }
1491                 count = 0;
1492             }
1493             ptrs[0] += bsz*esz;
1494             if( ptrs[1] )
1495                 ptrs[1] += bsz;
1496         }
1497     }
1498
1499     double scale = nz0 ? 1./nz0 : 0.;
1500     for( k = 0; k < cn; k++ )
1501     {
1502         s[k] *= scale;
1503         sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
1504     }
1505
1506     for( j = 0; j < 2; j++ )
1507     {
1508         const double* sptr = j == 0 ? s : sq;
1509         _OutputArray _dst = j == 0 ? _mean : _sdv;
1510         if( !_dst.needed() )
1511             continue;
1512
1513         if( !_dst.fixedSize() )
1514             _dst.create(cn, 1, CV_64F, -1, true);
1515         Mat dst = _dst.getMat();
1516         int dcn = (int)dst.total();
1517         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1518                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1519         double* dptr = dst.ptr<double>();
1520         for( k = 0; k < cn; k++ )
1521             dptr[k] = sptr[k];
1522         for( ; k < dcn; k++ )
1523             dptr[k] = 0;
1524     }
1525 }
1526
1527 /****************************************************************************************\
1528 *                                       minMaxLoc                                        *
1529 \****************************************************************************************/
1530
1531 namespace cv
1532 {
1533
1534 template<typename T, typename WT> static void
1535 minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
1536             size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
1537 {
1538     WT minVal = *_minVal, maxVal = *_maxVal;
1539     size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
1540
1541     if( !mask )
1542     {
1543         for( int i = 0; i < len; i++ )
1544         {
1545             T val = src[i];
1546             if( val < minVal )
1547             {
1548                 minVal = val;
1549                 minIdx = startIdx + i;
1550             }
1551             if( val > maxVal )
1552             {
1553                 maxVal = val;
1554                 maxIdx = startIdx + i;
1555             }
1556         }
1557     }
1558     else
1559     {
1560         for( int i = 0; i < len; i++ )
1561         {
1562             T val = src[i];
1563             if( mask[i] && val < minVal )
1564             {
1565                 minVal = val;
1566                 minIdx = startIdx + i;
1567             }
1568             if( mask[i] && val > maxVal )
1569             {
1570                 maxVal = val;
1571                 maxIdx = startIdx + i;
1572             }
1573         }
1574     }
1575
1576     *_minIdx = minIdx;
1577     *_maxIdx = maxIdx;
1578     *_minVal = minVal;
1579     *_maxVal = maxVal;
1580 }
1581
1582 static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
1583                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1584 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1585
1586 static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
1587                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1588 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1589
1590 static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
1591                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1592 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1593
1594 static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
1595                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1596 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1597
1598 static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
1599                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1600 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1601
1602 static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
1603                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1604 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1605
1606 static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
1607                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1608 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1609
1610 typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);
1611
1612 static MinMaxIdxFunc getMinmaxTab(int depth)
1613 {
1614     static MinMaxIdxFunc minmaxTab[] =
1615     {
1616         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
1617         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
1618         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
1619         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
1620         0
1621     };
1622
1623     return minmaxTab[depth];
1624 }
1625
1626 static void ofs2idx(const Mat& a, size_t ofs, int* idx)
1627 {
1628     int i, d = a.dims;
1629     if( ofs > 0 )
1630     {
1631         ofs--;
1632         for( i = d-1; i >= 0; i-- )
1633         {
1634             int sz = a.size[i];
1635             idx[i] = (int)(ofs % sz);
1636             ofs /= sz;
1637         }
1638     }
1639     else
1640     {
1641         for( i = d-1; i >= 0; i-- )
1642             idx[i] = -1;
1643     }
1644 }
1645
1646 #ifdef HAVE_OPENCL
1647
1648 template <typename T>
1649 void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
1650                   int* minLoc, int* maxLoc,
1651                   int groupnum, int cols, double * maxVal2)
1652 {
1653     uint index_max = std::numeric_limits<uint>::max();
1654     T minval = std::numeric_limits<T>::max();
1655     T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
1656     uint minloc = index_max, maxloc = index_max;
1657
1658     int index = 0;
1659     const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
1660     const uint * minlocptr = NULL, * maxlocptr = NULL;
1661     if (minVal || minLoc)
1662     {
1663         minptr = db.ptr<T>();
1664         index += sizeof(T) * groupnum;
1665     }
1666     if (maxVal || maxLoc)
1667     {
1668         maxptr = (const T *)(db.ptr() + index);
1669         index += sizeof(T) * groupnum;
1670     }
1671     if (minLoc)
1672     {
1673         minlocptr = (const uint *)(db.ptr() + index);
1674         index += sizeof(uint) * groupnum;
1675     }
1676     if (maxLoc)
1677     {
1678         maxlocptr = (const uint *)(db.ptr() + index);
1679         index += sizeof(uint) * groupnum;
1680     }
1681     if (maxVal2)
1682         maxptr2 = (const T *)(db.ptr() + index);
1683
1684     for (int i = 0; i < groupnum; i++)
1685     {
1686         if (minptr && minptr[i] <= minval)
1687         {
1688             if (minptr[i] == minval)
1689             {
1690                 if (minlocptr)
1691                     minloc = std::min(minlocptr[i], minloc);
1692             }
1693             else
1694             {
1695                 if (minlocptr)
1696                     minloc = minlocptr[i];
1697                 minval = minptr[i];
1698             }
1699         }
1700         if (maxptr && maxptr[i] >= maxval)
1701         {
1702             if (maxptr[i] == maxval)
1703             {
1704                 if (maxlocptr)
1705                     maxloc = std::min(maxlocptr[i], maxloc);
1706             }
1707             else
1708             {
1709                 if (maxlocptr)
1710                     maxloc = maxlocptr[i];
1711                 maxval = maxptr[i];
1712             }
1713         }
1714         if (maxptr2 && maxptr2[i] > maxval2)
1715             maxval2 = maxptr2[i];
1716     }
1717     bool zero_mask = (minLoc && minloc == index_max) ||
1718             (maxLoc && maxloc == index_max);
1719
1720     if (minVal)
1721         *minVal = zero_mask ? 0 : (double)minval;
1722     if (maxVal)
1723         *maxVal = zero_mask ? 0 : (double)maxval;
1724     if (maxVal2)
1725         *maxVal2 = zero_mask ? 0 : (double)maxval2;
1726
1727     if (minLoc)
1728     {
1729         minLoc[0] = zero_mask ? -1 : minloc / cols;
1730         minLoc[1] = zero_mask ? -1 : minloc % cols;
1731     }
1732     if (maxLoc)
1733     {
1734         maxLoc[0] = zero_mask ? -1 : maxloc / cols;
1735         maxLoc[1] = zero_mask ? -1 : maxloc % cols;
1736     }
1737 }
1738
1739 typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
1740                                  int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
1741
1742 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
1743                            int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
1744 {
1745     const ocl::Device & dev = ocl::Device::getDefault();
1746     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
1747         haveSrc2 = _src2.kind() != _InputArray::NONE;
1748     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
1749             kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));
1750
1751     // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
1752     if ((haveMask || type == CV_32FC1) && dev.isAMD())
1753         return false;
1754
1755     CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
1756               (cn >= 1 && !minLoc && !maxLoc) );
1757
1758     if (ddepth < 0)
1759         ddepth = depth;
1760
1761     CV_Assert(!haveSrc2 || _src2.type() == type);
1762
1763     if (depth == CV_32S)
1764         return false;
1765
1766     if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
1767         return false;
1768
1769     int groupnum = dev.maxComputeUnits();
1770     size_t wgs = dev.maxWorkGroupSize();
1771
1772     int wgs2_aligned = 1;
1773     while (wgs2_aligned < (int)wgs)
1774         wgs2_aligned <<= 1;
1775     wgs2_aligned >>= 1;
1776
1777     bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
1778             needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;
1779
1780     // in case of mask we must know whether mask is filled with zeros or not
1781     // so let's calculate min or max location, if it's undefined, so mask is zeros
1782     if (!(needMaxLoc || needMinLoc) && haveMask)
1783     {
1784         if (needMinVal)
1785             needMinLoc = true;
1786         else
1787             needMaxLoc = true;
1788     }
1789
1790     char cvt[2][40];
1791     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
1792                          " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
1793                          " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s",
1794                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
1795                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
1796                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1797                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
1798                          _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
1799                          needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
1800                          needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
1801                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
1802                          ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
1803                          absValues ? " -D OP_ABS" : "",
1804                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
1805                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
1806                          depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert");
1807
1808     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
1809     if (k.empty())
1810         return false;
1811
1812     int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
1813             dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
1814                                  (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
1815                                  (maxVal2 ? esz : 0));
1816     UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
1817
1818     if (cn > 1 && !haveMask)
1819     {
1820         src = src.reshape(1);
1821         src2 = src2.reshape(1);
1822     }
1823
1824     if (haveSrc2)
1825     {
1826         if (!haveMask)
1827             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1828                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
1829         else
1830             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1831                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
1832                    ocl::KernelArg::ReadOnlyNoSize(src2));
1833     }
1834     else
1835     {
1836         if (!haveMask)
1837             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1838                    groupnum, ocl::KernelArg::PtrWriteOnly(db));
1839         else
1840             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1841                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
1842     }
1843
1844     size_t globalsize = groupnum * wgs;
1845     if (!k.run(1, &globalsize, &wgs, true))
1846         return false;
1847
1848     static const getMinMaxResFunc functab[7] =
1849     {
1850         getMinMaxRes<uchar>,
1851         getMinMaxRes<char>,
1852         getMinMaxRes<ushort>,
1853         getMinMaxRes<short>,
1854         getMinMaxRes<int>,
1855         getMinMaxRes<float>,
1856         getMinMaxRes<double>
1857     };
1858
1859     getMinMaxResFunc func = functab[ddepth];
1860
1861     int locTemp[2];
1862     func(db.getMat(ACCESS_READ), minVal, maxVal,
1863          needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
1864          needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
1865          groupnum, src.cols, maxVal2);
1866
1867     return true;
1868 }
1869
1870 #endif
1871
1872 }
1873
1874 void cv::minMaxIdx(InputArray _src, double* minVal,
1875                    double* maxVal, int* minIdx, int* maxIdx,
1876                    InputArray _mask)
1877 {
1878     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1879     CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
1880         (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );
1881
1882     CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
1883                ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
1884
1885     Mat src = _src.getMat(), mask = _mask.getMat();
1886
1887 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1888     size_t total_size = src.total();
1889     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1890     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1891     {
1892         IppiSize sz = { cols * cn, rows };
1893
1894         if( !mask.empty() )
1895         {
1896             typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int,
1897                                                                      IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1898
1899             CV_SUPPRESS_DEPRECATED_START
1900             ippiMaskMinMaxIndxFuncC1 ippFuncC1 =
1901                 type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR :
1902                 type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR :
1903                 type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR :
1904                 type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0;
1905             CV_SUPPRESS_DEPRECATED_END
1906
1907             if( ippFuncC1 )
1908             {
1909                 Ipp32f min, max;
1910                 IppiPoint minp, maxp;
1911                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1912                 {
1913                     if( minVal )
1914                         *minVal = (double)min;
1915                     if( maxVal )
1916                         *maxVal = (double)max;
1917                     if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.ptr()[0] )
1918                         minp.x = maxp.x = -1;
1919                     if( minIdx )
1920                     {
1921                         size_t minidx = minp.y * cols + minp.x + 1;
1922                         ofs2idx(src, minidx, minIdx);
1923                     }
1924                     if( maxIdx )
1925                     {
1926                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1927                         ofs2idx(src, maxidx, maxIdx);
1928                     }
1929                     return;
1930                 }
1931                 setIppErrorStatus();
1932             }
1933         }
1934         else
1935         {
1936             typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1937
1938             CV_SUPPRESS_DEPRECATED_START
1939             ippiMinMaxIndxFuncC1 ippFuncC1 =
1940                 depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R :
1941                 depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R :
1942                 depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R :
1943                 depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R : 0;
1944             CV_SUPPRESS_DEPRECATED_END
1945
1946             if( ippFuncC1 )
1947             {
1948                 Ipp32f min, max;
1949                 IppiPoint minp, maxp;
1950                 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1951                 {
1952                     if( minVal )
1953                         *minVal = (double)min;
1954                     if( maxVal )
1955                         *maxVal = (double)max;
1956                     if( minIdx )
1957                     {
1958                         size_t minidx = minp.y * cols + minp.x + 1;
1959                         ofs2idx(src, minidx, minIdx);
1960                     }
1961                     if( maxIdx )
1962                     {
1963                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1964                         ofs2idx(src, maxidx, maxIdx);
1965                     }
1966                     return;
1967                 }
1968                 setIppErrorStatus();
1969             }
1970         }
1971     }
1972 #endif
1973
1974     MinMaxIdxFunc func = getMinmaxTab(depth);
1975     CV_Assert( func != 0 );
1976
1977     const Mat* arrays[] = {&src, &mask, 0};
1978     uchar* ptrs[2];
1979     NAryMatIterator it(arrays, ptrs);
1980
1981     size_t minidx = 0, maxidx = 0;
1982     int iminval = INT_MAX, imaxval = INT_MIN;
1983     float fminval = FLT_MAX, fmaxval = -FLT_MAX;
1984     double dminval = DBL_MAX, dmaxval = -DBL_MAX;
1985     size_t startidx = 1;
1986     int *minval = &iminval, *maxval = &imaxval;
1987     int planeSize = (int)it.size*cn;
1988
1989     if( depth == CV_32F )
1990         minval = (int*)&fminval, maxval = (int*)&fmaxval;
1991     else if( depth == CV_64F )
1992         minval = (int*)&dminval, maxval = (int*)&dmaxval;
1993
1994     for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
1995         func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );
1996
1997     if( minidx == 0 )
1998         dminval = dmaxval = 0;
1999     else if( depth == CV_32F )
2000         dminval = fminval, dmaxval = fmaxval;
2001     else if( depth <= CV_32S )
2002         dminval = iminval, dmaxval = imaxval;
2003
2004     if( minVal )
2005         *minVal = dminval;
2006     if( maxVal )
2007         *maxVal = dmaxval;
2008
2009     if( minIdx )
2010         ofs2idx(src, minidx, minIdx);
2011     if( maxIdx )
2012         ofs2idx(src, maxidx, maxIdx);
2013 }
2014
2015 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
2016                     Point* minLoc, Point* maxLoc, InputArray mask )
2017 {
2018     CV_Assert(_img.dims() <= 2);
2019
2020     minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
2021     if( minLoc )
2022         std::swap(minLoc->x, minLoc->y);
2023     if( maxLoc )
2024         std::swap(maxLoc->x, maxLoc->y);
2025 }
2026
2027 /****************************************************************************************\
2028 *                                         norm                                           *
2029 \****************************************************************************************/
2030
2031 namespace cv
2032 {
2033
2034 float normL2Sqr_(const float* a, const float* b, int n)
2035 {
2036     int j = 0; float d = 0.f;
2037 #if CV_SSE
2038     if( USE_SSE2 )
2039     {
2040         float CV_DECL_ALIGNED(16) buf[4];
2041         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
2042
2043         for( ; j <= n - 8; j += 8 )
2044         {
2045             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
2046             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
2047             d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
2048             d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
2049         }
2050         _mm_store_ps(buf, _mm_add_ps(d0, d1));
2051         d = buf[0] + buf[1] + buf[2] + buf[3];
2052     }
2053     else
2054 #elif CV_NEON
2055     float32x4_t v_sum = vdupq_n_f32(0.0f);
2056     for ( ; j <= n - 4; j += 4)
2057     {
2058         float32x4_t v_diff = vmulq_f32(vld1q_f32(a + j), vld1q_f32(b + j));
2059         v_sum = vaddq_f32(v_sum, vmulq_f32(v_diff, v_diff));
2060     }
2061
2062     float CV_DECL_ALIGNED(16) buf[4];
2063     vst1q_f32(buf, v_sum);
2064     d = buf[0] + buf[1] + buf[2] + buf[3];
2065 #endif
2066     {
2067         for( ; j <= n - 4; j += 4 )
2068         {
2069             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
2070             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
2071         }
2072     }
2073
2074     for( ; j < n; j++ )
2075     {
2076         float t = a[j] - b[j];
2077         d += t*t;
2078     }
2079     return d;
2080 }
2081
2082
2083 float normL1_(const float* a, const float* b, int n)
2084 {
2085     int j = 0; float d = 0.f;
2086 #if CV_SSE
2087     if( USE_SSE2 )
2088     {
2089         float CV_DECL_ALIGNED(16) buf[4];
2090         static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
2091         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
2092         __m128 absmask = _mm_load_ps((const float*)absbuf);
2093
2094         for( ; j <= n - 8; j += 8 )
2095         {
2096             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
2097             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
2098             d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
2099             d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
2100         }
2101         _mm_store_ps(buf, _mm_add_ps(d0, d1));
2102         d = buf[0] + buf[1] + buf[2] + buf[3];
2103     }
2104     else
2105 #elif CV_NEON
2106     float32x4_t v_sum = vdupq_n_f32(0.0f);
2107     for ( ; j <= n - 4; j += 4)
2108         v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
2109
2110     float CV_DECL_ALIGNED(16) buf[4];
2111     vst1q_f32(buf, v_sum);
2112     d = buf[0] + buf[1] + buf[2] + buf[3];
2113 #endif
2114     {
2115         for( ; j <= n - 4; j += 4 )
2116         {
2117             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
2118                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
2119         }
2120     }
2121
2122     for( ; j < n; j++ )
2123         d += std::abs(a[j] - b[j]);
2124     return d;
2125 }
2126
2127 int normL1_(const uchar* a, const uchar* b, int n)
2128 {
2129     int j = 0, d = 0;
2130 #if CV_SSE
2131     if( USE_SSE2 )
2132     {
2133         __m128i d0 = _mm_setzero_si128();
2134
2135         for( ; j <= n - 16; j += 16 )
2136         {
2137             __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
2138             __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
2139
2140             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
2141         }
2142
2143         for( ; j <= n - 4; j += 4 )
2144         {
2145             __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
2146             __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
2147
2148             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
2149         }
2150         d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
2151     }
2152     else
2153 #elif CV_NEON
2154     uint32x4_t v_sum = vdupq_n_u32(0.0f);
2155     for ( ; j <= n - 16; j += 16)
2156     {
2157         uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
2158         uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
2159         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
2160         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
2161     }
2162
2163     uint CV_DECL_ALIGNED(16) buf[4];
2164     vst1q_u32(buf, v_sum);
2165     d = buf[0] + buf[1] + buf[2] + buf[3];
2166 #endif
2167     {
2168         for( ; j <= n - 4; j += 4 )
2169         {
2170             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
2171                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
2172         }
2173     }
2174     for( ; j < n; j++ )
2175         d += std::abs(a[j] - b[j]);
2176     return d;
2177 }
2178
2179 static const uchar popCountTable[] =
2180 {
2181     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2182     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2183     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2184     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2185     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2186     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2187     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2188     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
2189 };
2190
2191 static const uchar popCountTable2[] =
2192 {
2193     0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
2194     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
2195     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2196     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2197     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2198     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2199     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2200     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
2201 };
2202
2203 static const uchar popCountTable4[] =
2204 {
2205     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2206     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2207     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2208     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2209     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2210     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2211     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2212     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
2213 };
2214
2215 static int normHamming(const uchar* a, int n)
2216 {
2217     int i = 0, result = 0;
2218 #if CV_NEON
2219     {
2220         uint32x4_t bits = vmovq_n_u32(0);
2221         for (; i <= n - 16; i += 16) {
2222             uint8x16_t A_vec = vld1q_u8 (a + i);
2223             uint8x16_t bitsSet = vcntq_u8 (A_vec);
2224             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
2225             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
2226             bits = vaddq_u32(bits, bitSet4);
2227         }
2228         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
2229         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
2230         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
2231     }
2232 #endif
2233         for( ; i <= n - 4; i += 4 )
2234             result += popCountTable[a[i]] + popCountTable[a[i+1]] +
2235             popCountTable[a[i+2]] + popCountTable[a[i+3]];
2236     for( ; i < n; i++ )
2237         result += popCountTable[a[i]];
2238     return result;
2239 }
2240
2241 int normHamming(const uchar* a, const uchar* b, int n)
2242 {
2243     int i = 0, result = 0;
2244 #if CV_NEON
2245     {
2246         uint32x4_t bits = vmovq_n_u32(0);
2247         for (; i <= n - 16; i += 16) {
2248             uint8x16_t A_vec = vld1q_u8 (a + i);
2249             uint8x16_t B_vec = vld1q_u8 (b + i);
2250             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
2251             uint8x16_t bitsSet = vcntq_u8 (AxorB);
2252             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
2253             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
2254             bits = vaddq_u32(bits, bitSet4);
2255         }
2256         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
2257         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
2258         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
2259     }
2260 #endif
2261         for( ; i <= n - 4; i += 4 )
2262             result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
2263                     popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
2264     for( ; i < n; i++ )
2265         result += popCountTable[a[i] ^ b[i]];
2266     return result;
2267 }
2268
2269 static int normHamming(const uchar* a, int n, int cellSize)
2270 {
2271     if( cellSize == 1 )
2272         return normHamming(a, n);
2273     const uchar* tab = 0;
2274     if( cellSize == 2 )
2275         tab = popCountTable2;
2276     else if( cellSize == 4 )
2277         tab = popCountTable4;
2278     else
2279         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
2280     int i = 0, result = 0;
2281 #if CV_ENABLE_UNROLLED
2282     for( ; i <= n - 4; i += 4 )
2283         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
2284 #endif
2285     for( ; i < n; i++ )
2286         result += tab[a[i]];
2287     return result;
2288 }
2289
2290 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
2291 {
2292     if( cellSize == 1 )
2293         return normHamming(a, b, n);
2294     const uchar* tab = 0;
2295     if( cellSize == 2 )
2296         tab = popCountTable2;
2297     else if( cellSize == 4 )
2298         tab = popCountTable4;
2299     else
2300         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
2301     int i = 0, result = 0;
2302     #if CV_ENABLE_UNROLLED
2303     for( ; i <= n - 4; i += 4 )
2304         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
2305                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
2306     #endif
2307     for( ; i < n; i++ )
2308         result += tab[a[i] ^ b[i]];
2309     return result;
2310 }
2311
2312
2313 template<typename T, typename ST> int
2314 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2315 {
2316     ST result = *_result;
2317     if( !mask )
2318     {
2319         result = std::max(result, normInf<T, ST>(src, len*cn));
2320     }
2321     else
2322     {
2323         for( int i = 0; i < len; i++, src += cn )
2324             if( mask[i] )
2325             {
2326                 for( int k = 0; k < cn; k++ )
2327                     result = std::max(result, ST(std::abs(src[k])));
2328             }
2329     }
2330     *_result = result;
2331     return 0;
2332 }
2333
2334 template<typename T, typename ST> int
2335 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2336 {
2337     ST result = *_result;
2338     if( !mask )
2339     {
2340         result += normL1<T, ST>(src, len*cn);
2341     }
2342     else
2343     {
2344         for( int i = 0; i < len; i++, src += cn )
2345             if( mask[i] )
2346             {
2347                 for( int k = 0; k < cn; k++ )
2348                     result += std::abs(src[k]);
2349             }
2350     }
2351     *_result = result;
2352     return 0;
2353 }
2354
2355 template<typename T, typename ST> int
2356 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2357 {
2358     ST result = *_result;
2359     if( !mask )
2360     {
2361         result += normL2Sqr<T, ST>(src, len*cn);
2362     }
2363     else
2364     {
2365         for( int i = 0; i < len; i++, src += cn )
2366             if( mask[i] )
2367             {
2368                 for( int k = 0; k < cn; k++ )
2369                 {
2370                     T v = src[k];
2371                     result += (ST)v*v;
2372                 }
2373             }
2374     }
2375     *_result = result;
2376     return 0;
2377 }
2378
2379 template<typename T, typename ST> int
2380 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2381 {
2382     ST result = *_result;
2383     if( !mask )
2384     {
2385         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
2386     }
2387     else
2388     {
2389         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2390             if( mask[i] )
2391             {
2392                 for( int k = 0; k < cn; k++ )
2393                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
2394             }
2395     }
2396     *_result = result;
2397     return 0;
2398 }
2399
2400 template<typename T, typename ST> int
2401 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2402 {
2403     ST result = *_result;
2404     if( !mask )
2405     {
2406         result += normL1<T, ST>(src1, src2, len*cn);
2407     }
2408     else
2409     {
2410         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2411             if( mask[i] )
2412             {
2413                 for( int k = 0; k < cn; k++ )
2414                     result += std::abs(src1[k] - src2[k]);
2415             }
2416     }
2417     *_result = result;
2418     return 0;
2419 }
2420
2421 template<typename T, typename ST> int
2422 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2423 {
2424     ST result = *_result;
2425     if( !mask )
2426     {
2427         result += normL2Sqr<T, ST>(src1, src2, len*cn);
2428     }
2429     else
2430     {
2431         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2432             if( mask[i] )
2433             {
2434                 for( int k = 0; k < cn; k++ )
2435                 {
2436                     ST v = src1[k] - src2[k];
2437                     result += v*v;
2438                 }
2439             }
2440     }
2441     *_result = result;
2442     return 0;
2443 }
2444
2445
2446 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
2447     static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
2448 { return norm##L##_(src, mask, r, len, cn); } \
2449     static int normDiff##L##_##suffix(const type* src1, const type* src2, \
2450     const uchar* mask, ntype* r, int len, int cn) \
2451 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
2452
2453 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
2454     CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
2455     CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
2456     CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
2457
2458 CV_DEF_NORM_ALL(8u, uchar, int, int, int)
2459 CV_DEF_NORM_ALL(8s, schar, int, int, int)
2460 CV_DEF_NORM_ALL(16u, ushort, int, int, double)
2461 CV_DEF_NORM_ALL(16s, short, int, int, double)
2462 CV_DEF_NORM_ALL(32s, int, int, double, double)
2463 CV_DEF_NORM_ALL(32f, float, float, double, double)
2464 CV_DEF_NORM_ALL(64f, double, double, double, double)
2465
2466
2467 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
2468 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
2469
2470 static NormFunc getNormFunc(int normType, int depth)
2471 {
2472     static NormFunc normTab[3][8] =
2473     {
2474         {
2475             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
2476             (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
2477         },
2478         {
2479             (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
2480             (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
2481         },
2482         {
2483             (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
2484             (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
2485         }
2486     };
2487
2488     return normTab[normType][depth];
2489 }
2490
2491 static NormDiffFunc getNormDiffFunc(int normType, int depth)
2492 {
2493     static NormDiffFunc normDiffTab[3][8] =
2494     {
2495         {
2496             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
2497             (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
2498             (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
2499             (NormDiffFunc)normDiffInf_64f, 0
2500         },
2501         {
2502             (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
2503             (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
2504             (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
2505             (NormDiffFunc)normDiffL1_64f, 0
2506         },
2507         {
2508             (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
2509             (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
2510             (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
2511             (NormDiffFunc)normDiffL2_64f, 0
2512         }
2513     };
2514
2515     return normDiffTab[normType][depth];
2516 }
2517
2518 #ifdef HAVE_OPENCL
2519
2520 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
2521 {
2522     const ocl::Device & d = ocl::Device::getDefault();
2523     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2524     bool doubleSupport = d.doubleFPConfig() > 0,
2525             haveMask = _mask.kind() != _InputArray::NONE;
2526
2527     if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
2528          (!doubleSupport && depth == CV_64F))
2529         return false;
2530
2531     UMat src = _src.getUMat();
2532
2533     if (normType == NORM_INF)
2534     {
2535         if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
2536                            std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
2537             return false;
2538     }
2539     else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
2540     {
2541         Scalar sc;
2542         bool unstype = depth == CV_8U || depth == CV_16U;
2543
2544         if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
2545                     OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
2546             return false;
2547
2548         if (!haveMask)
2549             cn = 1;
2550
2551         double s = 0.0;
2552         for (int i = 0; i < cn; ++i)
2553             s += sc[i];
2554
2555         result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
2556     }
2557
2558     return true;
2559 }
2560
2561 #endif
2562
2563 }
2564
2565 double cv::norm( InputArray _src, int normType, InputArray _mask )
2566 {
2567     normType &= NORM_TYPE_MASK;
2568     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
2569                normType == NORM_L2 || normType == NORM_L2SQR ||
2570                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
2571
2572 #ifdef HAVE_OPENCL
2573     double _result = 0;
2574     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
2575                 ocl_norm(_src, normType, _mask, _result),
2576                 _result)
2577 #endif
2578
2579     Mat src = _src.getMat(), mask = _mask.getMat();
2580     int depth = src.depth(), cn = src.channels();
2581
2582 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2583     size_t total_size = src.total();
2584     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
2585
2586     if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
2587         && cols > 0 && (size_t)rows*cols == total_size
2588         && (normType == NORM_INF || normType == NORM_L1 ||
2589             normType == NORM_L2 || normType == NORM_L2SQR) )
2590     {
2591         IppiSize sz = { cols, rows };
2592         int type = src.type();
2593         if( !mask.empty() )
2594         {
2595             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2596             ippiMaskNormFuncC1 ippFuncC1 =
2597                 normType == NORM_INF ?
2598                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
2599                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR :
2600 //                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
2601                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
2602                 0) :
2603             normType == NORM_L1 ?
2604                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
2605                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR :
2606                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
2607                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
2608                 0) :
2609             normType == NORM_L2 || normType == NORM_L2SQR ?
2610                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
2611                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR :
2612                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
2613                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
2614                 0) : 0;
2615             if( ippFuncC1 )
2616             {
2617                 Ipp64f norm;
2618                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
2619                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2620
2621                 setIppErrorStatus();
2622             }
2623             /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
2624             ippiMaskNormFuncC3 ippFuncC3 =
2625                 normType == NORM_INF ?
2626                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
2627                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR :
2628                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
2629                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
2630                 0) :
2631             normType == NORM_L1 ?
2632                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
2633                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR :
2634                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
2635                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
2636                 0) :
2637             normType == NORM_L2 || normType == NORM_L2SQR ?
2638                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
2639                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR :
2640                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
2641                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
2642                 0) : 0;
2643             if( ippFuncC3 )
2644             {
2645                 Ipp64f norm1, norm2, norm3;
2646                 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
2647                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
2648                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
2649                 {
2650                     Ipp64f norm =
2651                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
2652                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
2653                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
2654                         0;
2655                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2656                 }
2657                 setIppErrorStatus();
2658             }*/
2659         }
2660         else
2661         {
2662             typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2663             typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
2664             ippiNormFuncHint ippFuncHint =
2665                 normType == NORM_L1 ?
2666                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
2667                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
2668                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
2669                 0) :
2670                 normType == NORM_L2 || normType == NORM_L2SQR ?
2671                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
2672                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
2673                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
2674                 0) : 0;
2675             ippiNormFuncNoHint ippFuncNoHint =
2676                 normType == NORM_INF ?
2677                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
2678                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
2679                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
2680                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
2681                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
2682                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
2683                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
2684 #if (IPP_VERSION_X100 >= 801)
2685                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2686                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2687 #endif
2688                 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
2689                 type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
2690                 type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
2691                 0) :
2692                 normType == NORM_L1 ?
2693                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
2694                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
2695                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
2696                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
2697                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
2698                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
2699                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
2700                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
2701                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
2702                 0) :
2703                 normType == NORM_L2 || normType == NORM_L2SQR ?
2704                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
2705                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
2706                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
2707                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
2708                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
2709                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
2710                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
2711                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
2712                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
2713                 0) : 0;
2714             // Make sure only zero or one version of the function pointer is valid
2715             CV_Assert(!ippFuncHint || !ippFuncNoHint);
2716             if( ippFuncHint || ippFuncNoHint )
2717             {
2718                 Ipp64f norm_array[4];
2719                 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
2720                                 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, norm_array);
2721                 if( ret >= 0 )
2722                 {
2723                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
2724                     for( int i = 1; i < cn; i++ )
2725                     {
2726                         norm =
2727                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
2728                             normType == NORM_L1 ? norm + norm_array[i] :
2729                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
2730                             0;
2731                     }
2732                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
2733                 }
2734                 setIppErrorStatus();
2735             }
2736         }
2737     }
2738 #endif
2739
2740     if( src.isContinuous() && mask.empty() )
2741     {
2742         size_t len = src.total()*cn;
2743         if( len == (size_t)(int)len )
2744         {
2745             if( depth == CV_32F )
2746             {
2747                 const float* data = src.ptr<float>();
2748
2749                 if( normType == NORM_L2 )
2750                 {
2751                     double result = 0;
2752                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2753                     return std::sqrt(result);
2754                 }
2755                 if( normType == NORM_L2SQR )
2756                 {
2757                     double result = 0;
2758                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2759                     return result;
2760                 }
2761                 if( normType == NORM_L1 )
2762                 {
2763                     double result = 0;
2764                     GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
2765                     return result;
2766                 }
2767                 if( normType == NORM_INF )
2768                 {
2769                     float result = 0;
2770                     GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
2771                     return result;
2772                 }
2773             }
2774             if( depth == CV_8U )
2775             {
2776                 const uchar* data = src.ptr<uchar>();
2777
2778                 if( normType == NORM_HAMMING )
2779                     return normHamming(data, (int)len);
2780
2781                 if( normType == NORM_HAMMING2 )
2782                     return normHamming(data, (int)len, 2);
2783             }
2784         }
2785     }
2786
2787     CV_Assert( mask.empty() || mask.type() == CV_8U );
2788
2789     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
2790     {
2791         if( !mask.empty() )
2792         {
2793             Mat temp;
2794             bitwise_and(src, mask, temp);
2795             return norm(temp, normType);
2796         }
2797         int cellSize = normType == NORM_HAMMING ? 1 : 2;
2798
2799         const Mat* arrays[] = {&src, 0};
2800         uchar* ptrs[1];
2801         NAryMatIterator it(arrays, ptrs);
2802         int total = (int)it.size;
2803         int result = 0;
2804
2805         for( size_t i = 0; i < it.nplanes; i++, ++it )
2806             result += normHamming(ptrs[0], total, cellSize);
2807
2808         return result;
2809     }
2810
2811     NormFunc func = getNormFunc(normType >> 1, depth);
2812     CV_Assert( func != 0 );
2813
2814     const Mat* arrays[] = {&src, &mask, 0};
2815     uchar* ptrs[2];
2816     union
2817     {
2818         double d;
2819         int i;
2820         float f;
2821     }
2822     result;
2823     result.d = 0;
2824     NAryMatIterator it(arrays, ptrs);
2825     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
2826     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
2827             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
2828     int isum = 0;
2829     int *ibuf = &result.i;
2830     size_t esz = 0;
2831
2832     if( blockSum )
2833     {
2834         intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
2835         blockSize = std::min(blockSize, intSumBlockSize);
2836         ibuf = &isum;
2837         esz = src.elemSize();
2838     }
2839
2840     for( size_t i = 0; i < it.nplanes; i++, ++it )
2841     {
2842         for( j = 0; j < total; j += blockSize )
2843         {
2844             int bsz = std::min(total - j, blockSize);
2845             func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
2846             count += bsz;
2847             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
2848             {
2849                 result.d += isum;
2850                 isum = 0;
2851                 count = 0;
2852             }
2853             ptrs[0] += bsz*esz;
2854             if( ptrs[1] )
2855                 ptrs[1] += bsz;
2856         }
2857     }
2858
2859     if( normType == NORM_INF )
2860     {
2861         if( depth == CV_64F )
2862             ;
2863         else if( depth == CV_32F )
2864             result.d = result.f;
2865         else
2866             result.d = result.i;
2867     }
2868     else if( normType == NORM_L2 )
2869         result.d = std::sqrt(result.d);
2870
2871     return result.d;
2872 }
2873
2874 #ifdef HAVE_OPENCL
2875
2876 namespace cv {
2877
2878 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
2879 {
2880     Scalar sc1, sc2;
2881     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2882     bool relative = (normType & NORM_RELATIVE) != 0;
2883     normType &= ~NORM_RELATIVE;
2884     bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
2885
2886     if (normsum)
2887     {
2888         if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
2889                      OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
2890             return false;
2891     }
2892     else
2893     {
2894         if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
2895                            false, _src2, relative ? &sc2[0] : NULL))
2896             return false;
2897         cn = 1;
2898     }
2899
2900     double s2 = 0;
2901     for (int i = 0; i < cn; ++i)
2902     {
2903         result += sc1[i];
2904         if (relative)
2905             s2 += sc2[i];
2906     }
2907
2908     if (normType == NORM_L2)
2909     {
2910         result = std::sqrt(result);
2911         if (relative)
2912             s2 = std::sqrt(s2);
2913     }
2914
2915     if (relative)
2916         result /= (s2 + DBL_EPSILON);
2917
2918     return true;
2919 }
2920
2921 }
2922
2923 #endif
2924
2925 double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
2926 {
2927     CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );
2928
2929 #ifdef HAVE_OPENCL
2930     double _result = 0;
2931     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
2932                 ocl_norm(_src1, _src2, normType, _mask, _result),
2933                 _result)
2934 #endif
2935
2936     if( normType & CV_RELATIVE )
2937     {
2938 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2939         Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
2940
2941         normType &= NORM_TYPE_MASK;
2942         CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
2943                 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
2944         size_t total_size = src1.total();
2945         int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
2946         if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
2947             && cols > 0 && (size_t)rows*cols == total_size
2948             && (normType == NORM_INF || normType == NORM_L1 ||
2949                 normType == NORM_L2 || normType == NORM_L2SQR) )
2950         {
2951             IppiSize sz = { cols, rows };
2952             int type = src1.type();
2953             if( !mask.empty() )
2954             {
2955                 typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
2956                 ippiMaskNormRelFuncC1 ippFuncC1 =
2957                     normType == NORM_INF ?
2958                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR :
2959 #ifndef __APPLE__
2960                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR :
2961 #endif
2962                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR :
2963                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR :
2964                     0) :
2965                     normType == NORM_L1 ?
2966                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR :
2967 #ifndef __APPLE__
2968                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR :
2969 #endif
2970                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR :
2971                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR :
2972                     0) :
2973                     normType == NORM_L2 || normType == NORM_L2SQR ?
2974                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR :
2975                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR :
2976                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR :
2977                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR :
2978                     0) : 0;
2979                 if( ippFuncC1 )
2980                 {
2981                     Ipp64f norm;
2982                     if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
2983                         return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2984                     setIppErrorStatus();
2985                 }
2986             }
2987             else
2988             {
2989                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2990                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2991                 ippiNormRelFuncNoHint ippFuncNoHint =
2992                     normType == NORM_INF ?
2993                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
2994                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
2995                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
2996                     type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
2997                     0) :
2998                     normType == NORM_L1 ?
2999                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
3000                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
3001                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
3002                     0) :
3003                     normType == NORM_L2 || normType == NORM_L2SQR ?
3004                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
3005                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
3006                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
3007                     0) : 0;
3008                 ippiNormRelFuncHint ippFuncHint =
3009                     normType == NORM_L1 ?
3010                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
3011                     0) :
3012                     normType == NORM_L2 || normType == NORM_L2SQR ?
3013                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
3014                     0) : 0;
3015                 if (ippFuncNoHint)
3016                 {
3017                     Ipp64f norm;
3018                     if( ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm) >= 0 )
3019                         return (double)norm;
3020                     setIppErrorStatus();
3021                 }
3022                 if (ippFuncHint)
3023                 {
3024                     Ipp64f norm;
3025                     if( ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 )
3026                         return (double)norm;
3027                     setIppErrorStatus();
3028                 }
3029             }
3030         }
3031 #endif
3032         return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
3033     }
3034
3035     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3036     int depth = src1.depth(), cn = src1.channels();
3037
3038     normType &= 7;
3039     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
3040                normType == NORM_L2 || normType == NORM_L2SQR ||
3041               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
3042
3043 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3044     size_t total_size = src1.total();
3045     int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
3046     if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
3047         && cols > 0 && (size_t)rows*cols == total_size
3048         && (normType == NORM_INF || normType == NORM_L1 ||
3049             normType == NORM_L2 || normType == NORM_L2SQR) )
3050     {
3051         IppiSize sz = { cols, rows };
3052         int type = src1.type();
3053         if( !mask.empty() )
3054         {
3055             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
3056             ippiMaskNormDiffFuncC1 ippFuncC1 =
3057                 normType == NORM_INF ?
3058                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
3059                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR :
3060                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
3061                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
3062                 0) :
3063                 normType == NORM_L1 ?
3064                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
3065 #ifndef __APPLE__
3066                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
3067 #endif
3068                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
3069                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
3070                 0) :
3071                 normType == NORM_L2 || normType == NORM_L2SQR ?
3072                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
3073                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR :
3074                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
3075                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
3076                 0) : 0;
3077             if( ippFuncC1 )
3078             {
3079                 Ipp64f norm;
3080                 if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
3081                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
3082                 setIppErrorStatus();
3083             }
3084 #ifndef __APPLE__
3085             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
3086             ippiMaskNormDiffFuncC3 ippFuncC3 =
3087                 normType == NORM_INF ?
3088                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
3089                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR :
3090                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
3091                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
3092                 0) :
3093                 normType == NORM_L1 ?
3094                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
3095                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR :
3096                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
3097                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
3098                 0) :
3099                 normType == NORM_L2 || normType == NORM_L2SQR ?
3100                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
3101                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR :
3102                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
3103                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
3104                 0) : 0;
3105             if( ippFuncC3 )
3106             {
3107                 Ipp64f norm1, norm2, norm3;
3108                 if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
3109                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
3110                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
3111                 {
3112                     Ipp64f norm =
3113                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
3114                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
3115                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
3116                         0;
3117                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
3118                 }
3119                 setIppErrorStatus();
3120             }
3121 #endif
3122         }
3123         else
3124         {
3125             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
3126             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
3127             ippiNormDiffFuncHint ippFuncHint =
3128                 normType == NORM_L1 ?
3129                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
3130                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
3131                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
3132                 0) :
3133                 normType == NORM_L2 || normType == NORM_L2SQR ?
3134                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
3135                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
3136                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
3137                 0) : 0;
3138             ippiNormDiffFuncNoHint ippFuncNoHint =
3139                 normType == NORM_INF ?
3140                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
3141                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
3142                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
3143                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
3144                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
3145                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
3146                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
3147 #if (IPP_VERSION_X100 >= 801)
3148                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
3149                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
3150 #endif
3151                 type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
3152                 type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
3153                 type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
3154                 0) :
3155                 normType == NORM_L1 ?
3156                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
3157                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
3158                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
3159                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
3160                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
3161                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
3162                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
3163                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
3164                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
3165                 0) :
3166                 normType == NORM_L2 || normType == NORM_L2SQR ?
3167                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
3168                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
3169                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
3170                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
3171                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
3172                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
3173                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
3174                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
3175                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
3176                 0) : 0;
3177             // Make sure only zero or one version of the function pointer is valid
3178             CV_Assert(!ippFuncHint || !ippFuncNoHint);
3179             if( ippFuncHint || ippFuncNoHint )
3180             {
3181                 Ipp64f norm_array[4];
3182                 IppStatus ret = ippFuncHint ? ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
3183                                 ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array);
3184                 if( ret >= 0 )
3185                 {
3186                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
3187                     for( int i = 1; i < src1.channels(); i++ )
3188                     {
3189                         norm =
3190                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
3191                             normType == NORM_L1 ? norm + norm_array[i] :
3192                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
3193                             0;
3194                     }
3195                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
3196                 }
3197                 setIppErrorStatus();
3198             }
3199         }
3200     }
3201 #endif
3202
3203     if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
3204     {
3205         size_t len = src1.total()*src1.channels();
3206         if( len == (size_t)(int)len )
3207         {
3208             if( src1.depth() == CV_32F )
3209             {
3210                 const float* data1 = src1.ptr<float>();
3211                 const float* data2 = src2.ptr<float>();
3212
3213                 if( normType == NORM_L2 )
3214                 {
3215                     double result = 0;
3216                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
3217                     return std::sqrt(result);
3218                 }
3219                 if( normType == NORM_L2SQR )
3220                 {
3221                     double result = 0;
3222                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
3223                     return result;
3224                 }
3225                 if( normType == NORM_L1 )
3226                 {
3227                     double result = 0;
3228                     GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
3229                     return result;
3230                 }
3231                 if( normType == NORM_INF )
3232                 {
3233                     float result = 0;
3234                     GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
3235                     return result;
3236                 }
3237             }
3238         }
3239     }
3240
3241     CV_Assert( mask.empty() || mask.type() == CV_8U );
3242
3243     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
3244     {
3245         if( !mask.empty() )
3246         {
3247             Mat temp;
3248             bitwise_xor(src1, src2, temp);
3249             bitwise_and(temp, mask, temp);
3250             return norm(temp, normType);
3251         }
3252         int cellSize = normType == NORM_HAMMING ? 1 : 2;
3253
3254         const Mat* arrays[] = {&src1, &src2, 0};
3255         uchar* ptrs[2];
3256         NAryMatIterator it(arrays, ptrs);
3257         int total = (int)it.size;
3258         int result = 0;
3259
3260         for( size_t i = 0; i < it.nplanes; i++, ++it )
3261             result += normHamming(ptrs[0], ptrs[1], total, cellSize);
3262
3263         return result;
3264     }
3265
3266     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
3267     CV_Assert( func != 0 );
3268
3269     const Mat* arrays[] = {&src1, &src2, &mask, 0};
3270     uchar* ptrs[3];
3271     union
3272     {
3273         double d;
3274         float f;
3275         int i;
3276         unsigned u;
3277     }
3278     result;
3279     result.d = 0;
3280     NAryMatIterator it(arrays, ptrs);
3281     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
3282     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
3283             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
3284     unsigned isum = 0;
3285     unsigned *ibuf = &result.u;
3286     size_t esz = 0;
3287
3288     if( blockSum )
3289     {
3290         intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
3291         blockSize = std::min(blockSize, intSumBlockSize);
3292         ibuf = &isum;
3293         esz = src1.elemSize();
3294     }
3295
3296     for( size_t i = 0; i < it.nplanes; i++, ++it )
3297     {
3298         for( j = 0; j < total; j += blockSize )
3299         {
3300             int bsz = std::min(total - j, blockSize);
3301             func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
3302             count += bsz;
3303             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
3304             {
3305                 result.d += isum;
3306                 isum = 0;
3307                 count = 0;
3308             }
3309             ptrs[0] += bsz*esz;
3310             ptrs[1] += bsz*esz;
3311             if( ptrs[2] )
3312                 ptrs[2] += bsz;
3313         }
3314     }
3315
3316     if( normType == NORM_INF )
3317     {
3318         if( depth == CV_64F )
3319             ;
3320         else if( depth == CV_32F )
3321             result.d = result.f;
3322         else
3323             result.d = result.u;
3324     }
3325     else if( normType == NORM_L2 )
3326         result.d = std::sqrt(result.d);
3327
3328     return result.d;
3329 }
3330
3331
3332 ///////////////////////////////////// batch distance ///////////////////////////////////////
3333
3334 namespace cv
3335 {
3336
3337 template<typename _Tp, typename _Rt>
3338 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
3339                   int nvecs, int len, _Rt* dist, const uchar* mask)
3340 {
3341     step2 /= sizeof(src2[0]);
3342     if( !mask )
3343     {
3344         for( int i = 0; i < nvecs; i++ )
3345             dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
3346     }
3347     else
3348     {
3349         _Rt val0 = std::numeric_limits<_Rt>::max();
3350         for( int i = 0; i < nvecs; i++ )
3351             dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
3352     }
3353 }
3354
3355 template<typename _Tp, typename _Rt>
3356 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
3357                      int nvecs, int len, _Rt* dist, const uchar* mask)
3358 {
3359     step2 /= sizeof(src2[0]);
3360     if( !mask )
3361     {
3362         for( int i = 0; i < nvecs; i++ )
3363             dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
3364     }
3365     else
3366     {
3367         _Rt val0 = std::numeric_limits<_Rt>::max();
3368         for( int i = 0; i < nvecs; i++ )
3369             dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
3370     }
3371 }
3372
3373 template<typename _Tp, typename _Rt>
3374 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
3375                   int nvecs, int len, _Rt* dist, const uchar* mask)
3376 {
3377     step2 /= sizeof(src2[0]);
3378     if( !mask )
3379     {
3380         for( int i = 0; i < nvecs; i++ )
3381             dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
3382     }
3383     else
3384     {
3385         _Rt val0 = std::numeric_limits<_Rt>::max();
3386         for( int i = 0; i < nvecs; i++ )
3387             dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
3388     }
3389 }
3390
3391 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
3392                              int nvecs, int len, int* dist, const uchar* mask)
3393 {
3394     step2 /= sizeof(src2[0]);
3395     if( !mask )
3396     {
3397         for( int i = 0; i < nvecs; i++ )
3398             dist[i] = normHamming(src1, src2 + step2*i, len);
3399     }
3400     else
3401     {
3402         int val0 = INT_MAX;
3403         for( int i = 0; i < nvecs; i++ )
3404             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len) : val0;
3405     }
3406 }
3407
3408 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
3409                               int nvecs, int len, int* dist, const uchar* mask)
3410 {
3411     step2 /= sizeof(src2[0]);
3412     if( !mask )
3413     {
3414         for( int i = 0; i < nvecs; i++ )
3415             dist[i] = normHamming(src1, src2 + step2*i, len, 2);
3416     }
3417     else
3418     {
3419         int val0 = INT_MAX;
3420         for( int i = 0; i < nvecs; i++ )
3421             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len, 2) : val0;
3422     }
3423 }
3424
3425 static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
3426                                int nvecs, int len, int* dist, const uchar* mask)
3427 {
3428     batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
3429 }
3430
3431 static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3432                                int nvecs, int len, float* dist, const uchar* mask)
3433 {
3434     batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3435 }
3436
3437 static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
3438                                   int nvecs, int len, int* dist, const uchar* mask)
3439 {
3440     batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
3441 }
3442
3443 static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3444                                   int nvecs, int len, float* dist, const uchar* mask)
3445 {
3446     batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3447 }
3448
3449 static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3450                                int nvecs, int len, float* dist, const uchar* mask)
3451 {
3452     batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3453 }
3454
3455 static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
3456                              int nvecs, int len, float* dist, const uchar* mask)
3457 {
3458     batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3459 }
3460
3461 static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
3462                                 int nvecs, int len, float* dist, const uchar* mask)
3463 {
3464     batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3465 }
3466
3467 static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
3468                              int nvecs, int len, float* dist, const uchar* mask)
3469 {
3470     batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3471 }
3472
3473 typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
3474                               int nvecs, int len, uchar* dist, const uchar* mask);
3475
3476
3477 struct BatchDistInvoker : public ParallelLoopBody
3478 {
3479     BatchDistInvoker( const Mat& _src1, const Mat& _src2,
3480                       Mat& _dist, Mat& _nidx, int _K,
3481                       const Mat& _mask, int _update,
3482                       BatchDistFunc _func)
3483     {
3484         src1 = &_src1;
3485         src2 = &_src2;
3486         dist = &_dist;
3487         nidx = &_nidx;
3488         K = _K;
3489         mask = &_mask;
3490         update = _update;
3491         func = _func;
3492     }
3493
3494     void operator()(const Range& range) const
3495     {
3496         AutoBuffer<int> buf(src2->rows);
3497         int* bufptr = buf;
3498
3499         for( int i = range.start; i < range.end; i++ )
3500         {
3501             func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
3502                  K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);
3503
3504             if( K > 0 )
3505             {
3506                 int* nidxptr = nidx->ptr<int>(i);
3507                 // since positive float's can be compared just like int's,
3508                 // we handle both CV_32S and CV_32F cases with a single branch
3509                 int* distptr = (int*)dist->ptr(i);
3510
3511                 int j, k;
3512
3513                 for( j = 0; j < src2->rows; j++ )
3514                 {
3515                     int d = bufptr[j];
3516                     if( d < distptr[K-1] )
3517                     {
3518                         for( k = K-2; k >= 0 && distptr[k] > d; k-- )
3519                         {
3520                             nidxptr[k+1] = nidxptr[k];
3521                             distptr[k+1] = distptr[k];
3522                         }
3523                         nidxptr[k+1] = j + update;
3524                         distptr[k+1] = d;
3525                     }
3526                 }
3527             }
3528         }
3529     }
3530
3531     const Mat *src1;
3532     const Mat *src2;
3533     Mat *dist;
3534     Mat *nidx;
3535     const Mat *mask;
3536     int K;
3537     int update;
3538     BatchDistFunc func;
3539 };
3540
3541 }
3542
3543 void cv::batchDistance( InputArray _src1, InputArray _src2,
3544                         OutputArray _dist, int dtype, OutputArray _nidx,
3545                         int normType, int K, InputArray _mask,
3546                         int update, bool crosscheck )
3547 {
3548     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3549     int type = src1.type();
3550     CV_Assert( type == src2.type() && src1.cols == src2.cols &&
3551                (type == CV_32F || type == CV_8U));
3552     CV_Assert( _nidx.needed() == (K > 0) );
3553
3554     if( dtype == -1 )
3555     {
3556         dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
3557     }
3558     CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);
3559
3560     K = std::min(K, src2.rows);
3561
3562     _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
3563     Mat dist = _dist.getMat(), nidx;
3564     if( _nidx.needed() )
3565     {
3566         _nidx.create(dist.size(), CV_32S);
3567         nidx = _nidx.getMat();
3568     }
3569
3570     if( update == 0 && K > 0 )
3571     {
3572         dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
3573         nidx = Scalar::all(-1);
3574     }
3575
3576     if( crosscheck )
3577     {
3578         CV_Assert( K == 1 && update == 0 && mask.empty() );
3579         Mat tdist, tidx;
3580         batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);
3581
3582         // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
3583         // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
3584         // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
3585         // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
3586         // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
3587         // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
3588         if( dtype == CV_32S )
3589         {
3590             for( int i = 0; i < tdist.rows; i++ )
3591             {
3592                 int idx = tidx.at<int>(i);
3593                 int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
3594                 if( d < d0 )
3595                 {
3596                     dist.at<int>(idx) = d;
3597                     nidx.at<int>(idx) = i + update;
3598                 }
3599             }
3600         }
3601         else
3602         {
3603             for( int i = 0; i < tdist.rows; i++ )
3604             {
3605                 int idx = tidx.at<int>(i);
3606                 float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
3607                 if( d < d0 )
3608                 {
3609                     dist.at<float>(idx) = d;
3610                     nidx.at<int>(idx) = i + update;
3611                 }
3612             }
3613         }
3614         return;
3615     }
3616
3617     BatchDistFunc func = 0;
3618     if( type == CV_8U )
3619     {
3620         if( normType == NORM_L1 && dtype == CV_32S )
3621             func = (BatchDistFunc)batchDistL1_8u32s;
3622         else if( normType == NORM_L1 && dtype == CV_32F )
3623             func = (BatchDistFunc)batchDistL1_8u32f;
3624         else if( normType == NORM_L2SQR && dtype == CV_32S )
3625             func = (BatchDistFunc)batchDistL2Sqr_8u32s;
3626         else if( normType == NORM_L2SQR && dtype == CV_32F )
3627             func = (BatchDistFunc)batchDistL2Sqr_8u32f;
3628         else if( normType == NORM_L2 && dtype == CV_32F )
3629             func = (BatchDistFunc)batchDistL2_8u32f;
3630         else if( normType == NORM_HAMMING && dtype == CV_32S )
3631             func = (BatchDistFunc)batchDistHamming;
3632         else if( normType == NORM_HAMMING2 && dtype == CV_32S )
3633             func = (BatchDistFunc)batchDistHamming2;
3634     }
3635     else if( type == CV_32F && dtype == CV_32F )
3636     {
3637         if( normType == NORM_L1 )
3638             func = (BatchDistFunc)batchDistL1_32f;
3639         else if( normType == NORM_L2SQR )
3640             func = (BatchDistFunc)batchDistL2Sqr_32f;
3641         else if( normType == NORM_L2 )
3642             func = (BatchDistFunc)batchDistL2_32f;
3643     }
3644
3645     if( func == 0 )
3646         CV_Error_(CV_StsUnsupportedFormat,
3647                   ("The combination of type=%d, dtype=%d and normType=%d is not supported",
3648                    type, dtype, normType));
3649
3650     parallel_for_(Range(0, src1.rows),
3651                   BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
3652 }
3653
3654
3655 void cv::findNonZero( InputArray _src, OutputArray _idx )
3656 {
3657     Mat src = _src.getMat();
3658     CV_Assert( src.type() == CV_8UC1 );
3659     int n = countNonZero(src);
3660     if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
3661         _idx.release();
3662     _idx.create(n, 1, CV_32SC2);
3663     Mat idx = _idx.getMat();
3664     CV_Assert(idx.isContinuous());
3665     Point* idx_ptr = idx.ptr<Point>();
3666
3667     for( int i = 0; i < src.rows; i++ )
3668     {
3669         const uchar* bin_ptr = src.ptr(i);
3670         for( int j = 0; j < src.cols; j++ )
3671             if( bin_ptr[j] )
3672                 *idx_ptr++ = Point(j, i);
3673     }
3674 }
3675
3676 double cv::PSNR(InputArray _src1, InputArray _src2)
3677 {
3678     CV_Assert( _src1.depth() == CV_8U );
3679     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
3680     return 20*log10(255./(diff+DBL_EPSILON));
3681 }
3682
3683
3684 CV_IMPL CvScalar cvSum( const CvArr* srcarr )
3685 {
3686     cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
3687     if( CV_IS_IMAGE(srcarr) )
3688     {
3689         int coi = cvGetImageCOI((IplImage*)srcarr);
3690         if( coi )
3691         {
3692             CV_Assert( 0 < coi && coi <= 4 );
3693             sum = cv::Scalar(sum[coi-1]);
3694         }
3695     }
3696     return sum;
3697 }
3698
3699 CV_IMPL int cvCountNonZero( const CvArr* imgarr )
3700 {
3701     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3702     if( img.channels() > 1 )
3703         cv::extractImageCOI(imgarr, img);
3704     return countNonZero(img);
3705 }
3706
3707
3708 CV_IMPL  CvScalar
3709 cvAvg( const void* imgarr, const void* maskarr )
3710 {
3711     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3712     cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
3713     if( CV_IS_IMAGE(imgarr) )
3714     {
3715         int coi = cvGetImageCOI((IplImage*)imgarr);
3716         if( coi )
3717         {
3718             CV_Assert( 0 < coi && coi <= 4 );
3719             mean = cv::Scalar(mean[coi-1]);
3720         }
3721     }
3722     return mean;
3723 }
3724
3725
3726 CV_IMPL  void
3727 cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
3728 {
3729     cv::Scalar mean, sdv;
3730
3731     cv::Mat mask;
3732     if( maskarr )
3733         mask = cv::cvarrToMat(maskarr);
3734
3735     cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );
3736
3737     if( CV_IS_IMAGE(imgarr) )
3738     {
3739         int coi = cvGetImageCOI((IplImage*)imgarr);
3740         if( coi )
3741         {
3742             CV_Assert( 0 < coi && coi <= 4 );
3743             mean = cv::Scalar(mean[coi-1]);
3744             sdv = cv::Scalar(sdv[coi-1]);
3745         }
3746     }
3747
3748     if( _mean )
3749         *(cv::Scalar*)_mean = mean;
3750     if( _sdv )
3751         *(cv::Scalar*)_sdv = sdv;
3752 }
3753
3754
3755 CV_IMPL void
3756 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
3757              CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
3758 {
3759     cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
3760     if( maskarr )
3761         mask = cv::cvarrToMat(maskarr);
3762     if( img.channels() > 1 )
3763         cv::extractImageCOI(imgarr, img);
3764
3765     cv::minMaxLoc( img, _minVal, _maxVal,
3766                    (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
3767 }
3768
3769
3770 CV_IMPL  double
3771 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
3772 {
3773     cv::Mat a, mask;
3774     if( !imgA )
3775     {
3776         imgA = imgB;
3777         imgB = 0;
3778     }
3779
3780     a = cv::cvarrToMat(imgA, false, true, 1);
3781     if( maskarr )
3782         mask = cv::cvarrToMat(maskarr);
3783
3784     if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
3785         cv::extractImageCOI(imgA, a);
3786
3787     if( !imgB )
3788         return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);
3789
3790     cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
3791     if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
3792         cv::extractImageCOI(imgB, b);
3793
3794     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
3795 }