modules/core/src/stat.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "precomp.hpp"
  44 #include <climits>
  45 #include <limits>
  46
  47 #include "opencl_kernels_core.hpp"
  48
  49 namespace cv
  50 {
  51
  52 template<typename T> static inline Scalar rawToScalar(const T& v)
  53 {
  54     Scalar s;
  55     typedef typename DataType<T>::channel_type T1;
  56     int i, n = DataType<T>::channels;
  57     for( i = 0; i < n; i++ )
  58         s.val[i] = ((T1*)&v)[i];
  59     return s;
  60 }
  61
  62 /****************************************************************************************\
  63 *                                        sum                                             *
  64 \****************************************************************************************/
  65
  66 template <typename T, typename ST>
  67 struct Sum_SIMD
  68 {
  69     int operator () (const T *, const uchar *, ST *, int, int) const
  70     {
  71         return 0;
  72     }
  73 };
  74
  75 #if CV_NEON
  76
  77 template <>
  78 struct Sum_SIMD<uchar, int>
  79 {
  80     int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
  81     {
  82         if (mask || (cn != 1 && cn != 2 && cn != 4))
  83             return 0;
  84
  85         int x = 0;
  86         uint32x4_t v_sum = vdupq_n_u32(0u);
  87
  88         for ( ; x <= len - 16; x += 16)
  89         {
  90             uint8x16_t v_src = vld1q_u8(src0 + x);
  91             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
  92
  93             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
  94             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));
  95
  96             v_half = vmovl_u8(vget_high_u8(v_src));
  97             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half)));
  98             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half)));
  99         }
 100
 101         for ( ; x <= len - 8; x += 8)
 102         {
 103             uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));
 104
 105             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
 106             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
 107         }
 108
 109         unsigned int CV_DECL_ALIGNED(16) ar[4];
 110         vst1q_u32(ar, v_sum);
 111
 112         for (int i = 0; i < 4; i += cn)
 113             for (int j = 0; j < cn; ++j)
 114                 dst[j] += ar[j + i];
 115
 116         return x / cn;
 117     }
 118 };
 119
 120 template <>
 121 struct Sum_SIMD<schar, int>
 122 {
 123     int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
 124     {
 125         if (mask || (cn != 1 && cn != 2 && cn != 4))
 126             return 0;
 127
 128         int x = 0;
 129         int32x4_t v_sum = vdupq_n_s32(0);
 130
 131         for ( ; x <= len - 16; x += 16)
 132         {
 133             int8x16_t v_src = vld1q_s8(src0 + x);
 134             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
 135
 136             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
 137             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));
 138
 139             v_half = vmovl_s8(vget_high_s8(v_src));
 140             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half)));
 141             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half)));
 142         }
 143
 144         for ( ; x <= len - 8; x += 8)
 145         {
 146             int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));
 147
 148             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
 149             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
 150         }
 151
 152         int CV_DECL_ALIGNED(16) ar[4];
 153         vst1q_s32(ar, v_sum);
 154
 155         for (int i = 0; i < 4; i += cn)
 156             for (int j = 0; j < cn; ++j)
 157                 dst[j] += ar[j + i];
 158
 159         return x / cn;
 160     }
 161 };
 162
 163 template <>
 164 struct Sum_SIMD<ushort, int>
 165 {
 166     int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
 167     {
 168         if (mask || (cn != 1 && cn != 2 && cn != 4))
 169             return 0;
 170
 171         int x = 0;
 172         uint32x4_t v_sum = vdupq_n_u32(0u);
 173
 174         for ( ; x <= len - 8; x += 8)
 175         {
 176             uint16x8_t v_src = vld1q_u16(src0 + x);
 177
 178             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src)));
 179             v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src)));
 180         }
 181
 182         for ( ; x <= len - 4; x += 4)
 183             v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x)));
 184
 185         unsigned int CV_DECL_ALIGNED(16) ar[4];
 186         vst1q_u32(ar, v_sum);
 187
 188         for (int i = 0; i < 4; i += cn)
 189             for (int j = 0; j < cn; ++j)
 190                 dst[j] += ar[j + i];
 191
 192         return x / cn;
 193     }
 194 };
 195
 196 template <>
 197 struct Sum_SIMD<short, int>
 198 {
 199     int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
 200     {
 201         if (mask || (cn != 1 && cn != 2 && cn != 4))
 202             return 0;
 203
 204         int x = 0;
 205         int32x4_t v_sum = vdupq_n_s32(0u);
 206
 207         for ( ; x <= len - 8; x += 8)
 208         {
 209             int16x8_t v_src = vld1q_s16(src0 + x);
 210
 211             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src)));
 212             v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src)));
 213         }
 214
 215         for ( ; x <= len - 4; x += 4)
 216             v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x)));
 217
 218         int CV_DECL_ALIGNED(16) ar[4];
 219         vst1q_s32(ar, v_sum);
 220
 221         for (int i = 0; i < 4; i += cn)
 222             for (int j = 0; j < cn; ++j)
 223                 dst[j] += ar[j + i];
 224
 225         return x / cn;
 226     }
 227 };
 228
 229 #endif
 230
 231 template<typename T, typename ST>
 232 static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
 233 {
 234     const T* src = src0;
 235     if( !mask )
 236     {
 237         Sum_SIMD<T, ST> vop;
 238         int i = vop(src0, mask, dst, len, cn), k = cn % 4;
 239         src += i * cn;
 240
 241         if( k == 1 )
 242         {
 243             ST s0 = dst[0];
 244
 245             #if CV_ENABLE_UNROLLED
 246             for(; i <= len - 4; i += 4, src += cn*4 )
 247                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
 248             #endif
 249             for( ; i < len; i++, src += cn )
 250                 s0 += src[0];
 251             dst[0] = s0;
 252         }
 253         else if( k == 2 )
 254         {
 255             ST s0 = dst[0], s1 = dst[1];
 256             for( ; i < len; i++, src += cn )
 257             {
 258                 s0 += src[0];
 259                 s1 += src[1];
 260             }
 261             dst[0] = s0;
 262             dst[1] = s1;
 263         }
 264         else if( k == 3 )
 265         {
 266             ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 267             for( ; i < len; i++, src += cn )
 268             {
 269                 s0 += src[0];
 270                 s1 += src[1];
 271                 s2 += src[2];
 272             }
 273             dst[0] = s0;
 274             dst[1] = s1;
 275             dst[2] = s2;
 276         }
 277
 278         for( ; k < cn; k += 4 )
 279         {
 280             src = src0 + i*cn + k;
 281             ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
 282             for( ; i < len; i++, src += cn )
 283             {
 284                 s0 += src[0]; s1 += src[1];
 285                 s2 += src[2]; s3 += src[3];
 286             }
 287             dst[k] = s0;
 288             dst[k+1] = s1;
 289             dst[k+2] = s2;
 290             dst[k+3] = s3;
 291         }
 292         return len;
 293     }
 294
 295     int i, nzm = 0;
 296     if( cn == 1 )
 297     {
 298         ST s = dst[0];
 299         for( i = 0; i < len; i++ )
 300             if( mask[i] )
 301             {
 302                 s += src[i];
 303                 nzm++;
 304             }
 305         dst[0] = s;
 306     }
 307     else if( cn == 3 )
 308     {
 309         ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 310         for( i = 0; i < len; i++, src += 3 )
 311             if( mask[i] )
 312             {
 313                 s0 += src[0];
 314                 s1 += src[1];
 315                 s2 += src[2];
 316                 nzm++;
 317             }
 318         dst[0] = s0;
 319         dst[1] = s1;
 320         dst[2] = s2;
 321     }
 322     else
 323     {
 324         for( i = 0; i < len; i++, src += cn )
 325             if( mask[i] )
 326             {
 327                 int k = 0;
 328                 #if CV_ENABLE_UNROLLED
 329                 for( ; k <= cn - 4; k += 4 )
 330                 {
 331                     ST s0, s1;
 332                     s0 = dst[k] + src[k];
 333                     s1 = dst[k+1] + src[k+1];
 334                     dst[k] = s0; dst[k+1] = s1;
 335                     s0 = dst[k+2] + src[k+2];
 336                     s1 = dst[k+3] + src[k+3];
 337                     dst[k+2] = s0; dst[k+3] = s1;
 338                 }
 339                 #endif
 340                 for( ; k < cn; k++ )
 341                     dst[k] += src[k];
 342                 nzm++;
 343             }
 344     }
 345     return nzm;
 346 }
 347
 348
 349 static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
 350 { return sum_(src, mask, dst, len, cn); }
 351
 352 static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
 353 { return sum_(src, mask, dst, len, cn); }
 354
 355 static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
 356 { return sum_(src, mask, dst, len, cn); }
 357
 358 static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
 359 { return sum_(src, mask, dst, len, cn); }
 360
 361 static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
 362 { return sum_(src, mask, dst, len, cn); }
 363
 364 static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
 365 { return sum_(src, mask, dst, len, cn); }
 366
 367 static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
 368 { return sum_(src, mask, dst, len, cn); }
 369
 370 typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);
 371
 372 static SumFunc getSumFunc(int depth)
 373 {
 374     static SumFunc sumTab[] =
 375     {
 376         (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
 377         (SumFunc)sum16u, (SumFunc)sum16s,
 378         (SumFunc)sum32s,
 379         (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
 380         0
 381     };
 382
 383     return sumTab[depth];
 384 }
 385
 386 template<typename T>
 387 static int countNonZero_(const T* src, int len )
 388 {
 389     int i=0, nz = 0;
 390     #if CV_ENABLE_UNROLLED
 391     for(; i <= len - 4; i += 4 )
 392         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
 393     #endif
 394     for( ; i < len; i++ )
 395         nz += src[i] != 0;
 396     return nz;
 397 }
 398
 399 static int countNonZero8u( const uchar* src, int len )
 400 {
 401     int i=0, nz = 0;
 402 #if CV_SSE2
 403     if(USE_SSE2)//5x-6x
 404     {
 405         __m128i pattern = _mm_setzero_si128 ();
 406         static uchar tab[256];
 407         static volatile bool initialized = false;
 408         if( !initialized )
 409         {
 410             // we compute inverse popcount table,
 411             // since we pass (img[x] == 0) mask as index in the table.
 412             for( int j = 0; j < 256; j++ )
 413             {
 414                 int val = 0;
 415                 for( int mask = 1; mask < 256; mask += mask )
 416                     val += (j & mask) == 0;
 417                 tab[j] = (uchar)val;
 418             }
 419             initialized = true;
 420         }
 421
 422         for (; i<=len-16; i+=16)
 423         {
 424             __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
 425             int val = _mm_movemask_epi8(_mm_cmpeq_epi8(r0, pattern));
 426             nz += tab[val & 255] + tab[val >> 8];
 427         }
 428     }
 429 #elif CV_NEON
 430     int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
 431     uint32x4_t v_nz = vdupq_n_u32(0u);
 432     uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
 433     const uchar * src0 = src;
 434
 435     while( i < len0 )
 436     {
 437         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 438
 439         while (j < blockSizei)
 440         {
 441             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 442             uint8x16_t v_pz = v_zero;
 443
 444             for( ; k <= blockSizej - 16; k += 16 )
 445                 v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));
 446
 447             uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
 448             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
 449             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);
 450
 451             src0 += blockSizej;
 452             j += blockSizej;
 453         }
 454
 455         i += blockSizei;
 456     }
 457
 458     CV_DECL_ALIGNED(16) unsigned int buf[4];
 459     vst1q_u32(buf, v_nz);
 460     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 461 #endif
 462     for( ; i < len; i++ )
 463         nz += src[i] != 0;
 464     return nz;
 465 }
 466
 467 static int countNonZero16u( const ushort* src, int len )
 468 {
 469     int i = 0, nz = 0;
 470 #if CV_NEON
 471     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 472     uint32x4_t v_nz = vdupq_n_u32(0u);
 473     uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
 474
 475     while( i < len0 )
 476     {
 477         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 478
 479         while (j < blockSizei)
 480         {
 481             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 482             uint16x8_t v_pz = v_zero;
 483
 484             for( ; k <= blockSizej - 8; k += 8 )
 485                 v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));
 486
 487             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 488
 489             src += blockSizej;
 490             j += blockSizej;
 491         }
 492
 493         i += blockSizei;
 494     }
 495
 496     CV_DECL_ALIGNED(16) unsigned int buf[4];
 497     vst1q_u32(buf, v_nz);
 498     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 499 #endif
 500     return nz + countNonZero_(src, len - i);
 501 }
 502
 503 static int countNonZero32s( const int* src, int len )
 504 {
 505     int i = 0, nz = 0;
 506 #if CV_NEON
 507     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 508     uint32x4_t v_nz = vdupq_n_u32(0u);
 509     int32x4_t v_zero = vdupq_n_s32(0.0f);
 510     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
 511
 512     while( i < len0 )
 513     {
 514         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 515
 516         while (j < blockSizei)
 517         {
 518             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 519             uint16x8_t v_pz = v_zerou;
 520
 521             for( ; k <= blockSizej - 8; k += 8 )
 522                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
 523                                                               vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));
 524
 525             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 526
 527             src += blockSizej;
 528             j += blockSizej;
 529         }
 530
 531         i += blockSizei;
 532     }
 533
 534     CV_DECL_ALIGNED(16) unsigned int buf[4];
 535     vst1q_u32(buf, v_nz);
 536     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 537 #endif
 538     return nz + countNonZero_(src, len - i);
 539 }
 540
 541 static int countNonZero32f( const float* src, int len )
 542 {
 543     int i = 0, nz = 0;
 544 #if CV_NEON
 545     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
 546     uint32x4_t v_nz = vdupq_n_u32(0u);
 547     float32x4_t v_zero = vdupq_n_f32(0.0f);
 548     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
 549
 550     while( i < len0 )
 551     {
 552         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
 553
 554         while (j < blockSizei)
 555         {
 556             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
 557             uint16x8_t v_pz = v_zerou;
 558
 559             for( ; k <= blockSizej - 8; k += 8 )
 560                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
 561                                                               vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));
 562
 563             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
 564
 565             src += blockSizej;
 566             j += blockSizej;
 567         }
 568
 569         i += blockSizei;
 570     }
 571
 572     CV_DECL_ALIGNED(16) unsigned int buf[4];
 573     vst1q_u32(buf, v_nz);
 574     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
 575 #endif
 576     return nz + countNonZero_(src, len - i);
 577 }
 578
 579 static int countNonZero64f( const double* src, int len )
 580 { return countNonZero_(src, len); }
 581
 582 typedef int (*CountNonZeroFunc)(const uchar*, int);
 583
 584 static CountNonZeroFunc getCountNonZeroTab(int depth)
 585 {
 586     static CountNonZeroFunc countNonZeroTab[] =
 587     {
 588         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
 589         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
 590         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
 591         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
 592     };
 593
 594     return countNonZeroTab[depth];
 595 }
 596
 597 template<typename T, typename ST, typename SQT>
 598 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
 599 {
 600     const T* src = src0;
 601
 602     if( !mask )
 603     {
 604         int i;
 605         int k = cn % 4;
 606
 607         if( k == 1 )
 608         {
 609             ST s0 = sum[0];
 610             SQT sq0 = sqsum[0];
 611             for( i = 0; i < len; i++, src += cn )
 612             {
 613                 T v = src[0];
 614                 s0 += v; sq0 += (SQT)v*v;
 615             }
 616             sum[0] = s0;
 617             sqsum[0] = sq0;
 618         }
 619         else if( k == 2 )
 620         {
 621             ST s0 = sum[0], s1 = sum[1];
 622             SQT sq0 = sqsum[0], sq1 = sqsum[1];
 623             for( i = 0; i < len; i++, src += cn )
 624             {
 625                 T v0 = src[0], v1 = src[1];
 626                 s0 += v0; sq0 += (SQT)v0*v0;
 627                 s1 += v1; sq1 += (SQT)v1*v1;
 628             }
 629             sum[0] = s0; sum[1] = s1;
 630             sqsum[0] = sq0; sqsum[1] = sq1;
 631         }
 632         else if( k == 3 )
 633         {
 634             ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 635             SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 636             for( i = 0; i < len; i++, src += cn )
 637             {
 638                 T v0 = src[0], v1 = src[1], v2 = src[2];
 639                 s0 += v0; sq0 += (SQT)v0*v0;
 640                 s1 += v1; sq1 += (SQT)v1*v1;
 641                 s2 += v2; sq2 += (SQT)v2*v2;
 642             }
 643             sum[0] = s0; sum[1] = s1; sum[2] = s2;
 644             sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 645         }
 646
 647         for( ; k < cn; k += 4 )
 648         {
 649             src = src0 + k;
 650             ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
 651             SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
 652             for( i = 0; i < len; i++, src += cn )
 653             {
 654                 T v0, v1;
 655                 v0 = src[0], v1 = src[1];
 656                 s0 += v0; sq0 += (SQT)v0*v0;
 657                 s1 += v1; sq1 += (SQT)v1*v1;
 658                 v0 = src[2], v1 = src[3];
 659                 s2 += v0; sq2 += (SQT)v0*v0;
 660                 s3 += v1; sq3 += (SQT)v1*v1;
 661             }
 662             sum[k] = s0; sum[k+1] = s1;
 663             sum[k+2] = s2; sum[k+3] = s3;
 664             sqsum[k] = sq0; sqsum[k+1] = sq1;
 665             sqsum[k+2] = sq2; sqsum[k+3] = sq3;
 666         }
 667         return len;
 668     }
 669
 670     int i, nzm = 0;
 671
 672     if( cn == 1 )
 673     {
 674         ST s0 = sum[0];
 675         SQT sq0 = sqsum[0];
 676         for( i = 0; i < len; i++ )
 677             if( mask[i] )
 678             {
 679                 T v = src[i];
 680                 s0 += v; sq0 += (SQT)v*v;
 681                 nzm++;
 682             }
 683         sum[0] = s0;
 684         sqsum[0] = sq0;
 685     }
 686     else if( cn == 3 )
 687     {
 688         ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 689         SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 690         for( i = 0; i < len; i++, src += 3 )
 691             if( mask[i] )
 692             {
 693                 T v0 = src[0], v1 = src[1], v2 = src[2];
 694                 s0 += v0; sq0 += (SQT)v0*v0;
 695                 s1 += v1; sq1 += (SQT)v1*v1;
 696                 s2 += v2; sq2 += (SQT)v2*v2;
 697                 nzm++;
 698             }
 699         sum[0] = s0; sum[1] = s1; sum[2] = s2;
 700         sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 701     }
 702     else
 703     {
 704         for( i = 0; i < len; i++, src += cn )
 705             if( mask[i] )
 706             {
 707                 for( int k = 0; k < cn; k++ )
 708                 {
 709                     T v = src[k];
 710                     ST s = sum[k] + v;
 711                     SQT sq = sqsum[k] + (SQT)v*v;
 712                     sum[k] = s; sqsum[k] = sq;
 713                 }
 714                 nzm++;
 715             }
 716     }
 717     return nzm;
 718 }
 719
 720
 721 static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 722 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 723
 724 static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 725 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 726
 727 static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 728 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 729
 730 static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 731 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 732
 733 static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 734 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 735
 736 static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 737 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 738
 739 static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 740 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 741
 742 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
 743
 744 static SumSqrFunc getSumSqrTab(int depth)
 745 {
 746     static SumSqrFunc sumSqrTab[] =
 747     {
 748         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
 749         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
 750     };
 751
 752     return sumSqrTab[depth];
 753 }
 754
 755 #ifdef HAVE_OPENCL
 756
 757 template <typename T> Scalar ocl_part_sum(Mat m)
 758 {
 759     CV_Assert(m.rows == 1);
 760
 761     Scalar s = Scalar::all(0);
 762     int cn = m.channels();
 763     const T * const ptr = m.ptr<T>(0);
 764
 765     for (int x = 0, w = m.cols * cn; x < w; )
 766         for (int c = 0; c < cn; ++c, ++x)
 767             s[c] += ptr[x];
 768
 769     return s;
 770 }
 771
 772 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 773
 774 static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
 775                      InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
 776 {
 777     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 778
 779     const ocl::Device & dev = ocl::Device::getDefault();
 780     bool doubleSupport = dev.doubleFPConfig() > 0,
 781         haveMask = _mask.kind() != _InputArray::NONE,
 782         haveSrc2 = _src2.kind() != _InputArray::NONE;
 783     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
 784             kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
 785             mcn = std::max(cn, kercn);
 786     CV_Assert(!haveSrc2 || _src2.type() == type);
 787     int convert_cn = haveSrc2 ? mcn : cn;
 788
 789     if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
 790         return false;
 791
 792     int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
 793     size_t wgs = dev.maxWorkGroupSize();
 794
 795     int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
 796             dtype = CV_MAKE_TYPE(ddepth, cn);
 797     CV_Assert(!haveMask || _mask.type() == CV_8UC1);
 798
 799     int wgs2_aligned = 1;
 800     while (wgs2_aligned < (int)wgs)
 801         wgs2_aligned <<= 1;
 802     wgs2_aligned >>= 1;
 803
 804     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
 805     char cvt[2][40];
 806     String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
 807                          " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
 808                          ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
 809                          ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
 810                          ocl::typeToStr(ddepth), ddepth, cn,
 811                          ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
 812                          opMap[sum_op], (int)wgs, wgs2_aligned,
 813                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 814                          haveMask ? " -D HAVE_MASK" : "",
 815                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
 816                          haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
 817                          haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
 818                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
 819                          depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
 820
 821     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
 822     if (k.empty())
 823         return false;
 824
 825     UMat src = _src.getUMat(), src2 = _src2.getUMat(),
 826         db(1, dbsize, dtype), mask = _mask.getUMat();
 827
 828     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
 829             dbarg = ocl::KernelArg::PtrWriteOnly(db),
 830             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
 831             src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
 832
 833     if (haveMask)
 834     {
 835         if (haveSrc2)
 836             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
 837         else
 838             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
 839     }
 840     else
 841     {
 842         if (haveSrc2)
 843             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
 844         else
 845             k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
 846     }
 847
 848     size_t globalsize = ngroups * wgs;
 849     if (k.run(1, &globalsize, &wgs, false))
 850     {
 851         typedef Scalar (*part_sum)(Mat m);
 852         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
 853                 func = funcs[ddepth - CV_32S];
 854
 855         Mat mres = db.getMat(ACCESS_READ);
 856         if (calc2)
 857             const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
 858
 859         res = func(mres.colRange(0, ngroups));
 860         return true;
 861     }
 862     return false;
 863 }
 864
 865 #endif
 866
 867 }
 868
 869 cv::Scalar cv::sum( InputArray _src )
 870 {
 871 #ifdef HAVE_OPENCL
 872     Scalar _res;
 873     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
 874                 ocl_sum(_src, _res, OCL_OP_SUM),
 875                 _res)
 876 #endif
 877
 878     Mat src = _src.getMat();
 879     int k, cn = src.channels(), depth = src.depth();
 880
 881 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 882     size_t total_size = src.total();
 883     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
 884     if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
 885     {
 886         IppiSize sz = { cols, rows };
 887         int type = src.type();
 888         typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
 889         typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
 890         ippiSumFuncHint ippFuncHint =
 891             type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
 892             type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
 893             type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
 894             0;
 895         ippiSumFuncNoHint ippFuncNoHint =
 896             type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
 897             type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
 898             type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
 899             type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
 900             type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
 901             type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
 902             type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
 903             type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
 904             type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
 905             0;
 906         CV_Assert(!ippFuncHint || !ippFuncNoHint);
 907         if( ippFuncHint || ippFuncNoHint )
 908         {
 909             Ipp64f res[4];
 910             IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
 911                             ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
 912             if( ret >= 0 )
 913             {
 914                 Scalar sc;
 915                 for( int i = 0; i < cn; i++ )
 916                     sc[i] = res[i];
 917                 return sc;
 918             }
 919             setIppErrorStatus();
 920         }
 921     }
 922 #endif
 923
 924     SumFunc func = getSumFunc(depth);
 925
 926     CV_Assert( cn <= 4 && func != 0 );
 927
 928     const Mat* arrays[] = {&src, 0};
 929     uchar* ptrs[1];
 930     NAryMatIterator it(arrays, ptrs);
 931     Scalar s;
 932     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
 933     int j, count = 0;
 934     AutoBuffer<int> _buf;
 935     int* buf = (int*)&s[0];
 936     size_t esz = 0;
 937     bool blockSum = depth < CV_32S;
 938
 939     if( blockSum )
 940     {
 941         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
 942         blockSize = std::min(blockSize, intSumBlockSize);
 943         _buf.allocate(cn);
 944         buf = _buf;
 945
 946         for( k = 0; k < cn; k++ )
 947             buf[k] = 0;
 948         esz = src.elemSize();
 949     }
 950
 951     for( size_t i = 0; i < it.nplanes; i++, ++it )
 952     {
 953         for( j = 0; j < total; j += blockSize )
 954         {
 955             int bsz = std::min(total - j, blockSize);
 956             func( ptrs[0], 0, (uchar*)buf, bsz, cn );
 957             count += bsz;
 958             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
 959             {
 960                 for( k = 0; k < cn; k++ )
 961                 {
 962                     s[k] += buf[k];
 963                     buf[k] = 0;
 964                 }
 965                 count = 0;
 966             }
 967             ptrs[0] += bsz*esz;
 968         }
 969     }
 970     return s;
 971 }
 972
 973 #ifdef HAVE_OPENCL
 974
 975 namespace cv {
 976
 977 static bool ocl_countNonZero( InputArray _src, int & res )
 978 {
 979     int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
 980     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 981
 982     if (depth == CV_64F && !doubleSupport)
 983         return false;
 984
 985     int dbsize = ocl::Device::getDefault().maxComputeUnits();
 986     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 987
 988     int wgs2_aligned = 1;
 989     while (wgs2_aligned < (int)wgs)
 990         wgs2_aligned <<= 1;
 991     wgs2_aligned >>= 1;
 992
 993     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
 994                   format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
 995                          " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
 996                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
 997                          ocl::typeToStr(depth), (int)wgs, kercn,
 998                          wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 999                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
1000     if (k.empty())
1001         return false;
1002
1003     UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
1004     k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1005            dbsize, ocl::KernelArg::PtrWriteOnly(db));
1006
1007     size_t globalsize = dbsize * wgs;
1008     if (k.run(1, &globalsize, &wgs, true))
1009         return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
1010     return false;
1011 }
1012
1013 }
1014
1015 #endif
1016
1017 int cv::countNonZero( InputArray _src )
1018 {
1019     int type = _src.type(), cn = CV_MAT_CN(type);
1020     CV_Assert( cn == 1 );
1021
1022 #ifdef HAVE_OPENCL
1023     int res = -1;
1024     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1025                 ocl_countNonZero(_src, res),
1026                 res)
1027 #endif
1028
1029     Mat src = _src.getMat();
1030
1031 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0
1032     if (src.dims <= 2 || src.isContinuous())
1033     {
1034         IppiSize roiSize = { src.cols, src.rows };
1035         Ipp32s count = 0, srcstep = (Ipp32s)src.step;
1036         IppStatus status = (IppStatus)-1;
1037
1038         if (src.isContinuous())
1039         {
1040             roiSize.width = (Ipp32s)src.total();
1041             roiSize.height = 1;
1042             srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
1043         }
1044
1045         int depth = CV_MAT_DEPTH(type);
1046         if (depth == CV_8U)
1047             status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
1048         else if (depth == CV_32F)
1049             status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
1050
1051         if (status >= 0)
1052             return (Ipp32s)src.total() - count;
1053         setIppErrorStatus();
1054     }
1055 #endif
1056
1057     CountNonZeroFunc func = getCountNonZeroTab(src.depth());
1058     CV_Assert( func != 0 );
1059
1060     const Mat* arrays[] = {&src, 0};
1061     uchar* ptrs[1];
1062     NAryMatIterator it(arrays, ptrs);
1063     int total = (int)it.size, nz = 0;
1064
1065     for( size_t i = 0; i < it.nplanes; i++, ++it )
1066         nz += func( ptrs[0], total );
1067
1068     return nz;
1069 }
1070
1071 cv::Scalar cv::mean( InputArray _src, InputArray _mask )
1072 {
1073     Mat src = _src.getMat(), mask = _mask.getMat();
1074     CV_Assert( mask.empty() || mask.type() == CV_8U );
1075
1076     int k, cn = src.channels(), depth = src.depth();
1077
1078 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1079     size_t total_size = src.total();
1080     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1081     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1082     {
1083         IppiSize sz = { cols, rows };
1084         int type = src.type();
1085         if( !mask.empty() )
1086         {
1087             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
1088             ippiMaskMeanFuncC1 ippFuncC1 =
1089             type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
1090             type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
1091             type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
1092             0;
1093             if( ippFuncC1 )
1094             {
1095                 Ipp64f res;
1096                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
1097                     return Scalar(res);
1098                 setIppErrorStatus();
1099             }
1100             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
1101             ippiMaskMeanFuncC3 ippFuncC3 =
1102             type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
1103             type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
1104             type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
1105             0;
1106             if( ippFuncC3 )
1107             {
1108                 Ipp64f res1, res2, res3;
1109                 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
1110                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
1111                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
1112                 {
1113                     return Scalar(res1, res2, res3);
1114                 }
1115                 setIppErrorStatus();
1116             }
1117         }
1118         else
1119         {
1120             typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
1121             typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
1122             ippiMeanFuncHint ippFuncHint =
1123                 type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
1124                 type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
1125                 type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
1126                 0;
1127             ippiMeanFuncNoHint ippFuncNoHint =
1128                 type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
1129                 type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
1130                 type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
1131                 type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
1132                 type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
1133                 type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
1134                 type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
1135                 type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
1136                 type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
1137                 0;
1138             // Make sure only zero or one version of the function pointer is valid
1139             CV_Assert(!ippFuncHint || !ippFuncNoHint);
1140             if( ippFuncHint || ippFuncNoHint )
1141             {
1142                 Ipp64f res[4];
1143                 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
1144                                 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
1145                 if( ret >= 0 )
1146                 {
1147                     Scalar sc;
1148                     for( int i = 0; i < cn; i++ )
1149                         sc[i] = res[i];
1150                     return sc;
1151                 }
1152                 setIppErrorStatus();
1153             }
1154         }
1155     }
1156 #endif
1157
1158     SumFunc func = getSumFunc(depth);
1159
1160     CV_Assert( cn <= 4 && func != 0 );
1161
1162     const Mat* arrays[] = {&src, &mask, 0};
1163     uchar* ptrs[2];
1164     NAryMatIterator it(arrays, ptrs);
1165     Scalar s;
1166     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1167     int j, count = 0;
1168     AutoBuffer<int> _buf;
1169     int* buf = (int*)&s[0];
1170     bool blockSum = depth <= CV_16S;
1171     size_t esz = 0, nz0 = 0;
1172
1173     if( blockSum )
1174     {
1175         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
1176         blockSize = std::min(blockSize, intSumBlockSize);
1177         _buf.allocate(cn);
1178         buf = _buf;
1179
1180         for( k = 0; k < cn; k++ )
1181             buf[k] = 0;
1182         esz = src.elemSize();
1183     }
1184
1185     for( size_t i = 0; i < it.nplanes; i++, ++it )
1186     {
1187         for( j = 0; j < total; j += blockSize )
1188         {
1189             int bsz = std::min(total - j, blockSize);
1190             int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
1191             count += nz;
1192             nz0 += nz;
1193             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1194             {
1195                 for( k = 0; k < cn; k++ )
1196                 {
1197                     s[k] += buf[k];
1198                     buf[k] = 0;
1199                 }
1200                 count = 0;
1201             }
1202             ptrs[0] += bsz*esz;
1203             if( ptrs[1] )
1204                 ptrs[1] += bsz;
1205         }
1206     }
1207     return s*(nz0 ? 1./nz0 : 0);
1208 }
1209
1210 #ifdef HAVE_OPENCL
1211
1212 namespace cv {
1213
1214 static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
1215 {
1216     bool haveMask = _mask.kind() != _InputArray::NONE;
1217     int nz = haveMask ? -1 : (int)_src.total();
1218     Scalar mean, stddev;
1219
1220     {
1221         int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1222         bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
1223                 isContinuous = _src.isContinuous(),
1224                 isMaskContinuous = _mask.isContinuous();
1225         const ocl::Device &defDev = ocl::Device::getDefault();
1226         int groups = defDev.maxComputeUnits();
1227         if (defDev.isIntel())
1228         {
1229             static const int subSliceEUCount = 10;
1230             groups = (groups / subSliceEUCount) * 2;
1231         }
1232         size_t wgs = defDev.maxWorkGroupSize();
1233
1234         int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
1235                 dtype = CV_MAKE_TYPE(ddepth, cn),
1236                 sqdtype = CV_MAKETYPE(sqddepth, cn);
1237         CV_Assert(!haveMask || _mask.type() == CV_8UC1);
1238
1239         int wgs2_aligned = 1;
1240         while (wgs2_aligned < (int)wgs)
1241             wgs2_aligned <<= 1;
1242         wgs2_aligned >>= 1;
1243
1244         if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
1245             return false;
1246
1247         char cvt[2][40];
1248         String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
1249                              " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
1250                              " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
1251                              ocl::typeToStr(type), ocl::typeToStr(depth),
1252                              ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
1253                              ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
1254                              ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
1255                              cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
1256                              isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
1257                              ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
1258                              (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
1259                              doubleSupport ? " -D DOUBLE_SUPPORT" : "");
1260
1261         ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
1262         if (k.empty())
1263             return false;
1264
1265         int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
1266                                CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
1267         UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
1268
1269         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
1270                 dbarg = ocl::KernelArg::PtrWriteOnly(db),
1271                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
1272
1273         if (haveMask)
1274             k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
1275         else
1276             k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);
1277
1278         size_t globalsize = groups * wgs;
1279         if (!k.run(1, &globalsize, &wgs, false))
1280             return false;
1281
1282         typedef Scalar (* part_sum)(Mat m);
1283         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
1284         Mat dbm = db.getMat(ACCESS_READ);
1285
1286         mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
1287         stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));
1288
1289         if (haveMask)
1290             nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
1291                                                  groups * (CV_ELEM_SIZE(dtype) +
1292                                                            CV_ELEM_SIZE(sqdtype))))[0]);
1293     }
1294
1295     double total = nz != 0 ? 1.0 / nz : 0;
1296     int k, j, cn = _src.channels();
1297     for (int i = 0; i < cn; ++i)
1298     {
1299         mean[i] *= total;
1300         stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
1301     }
1302
1303     for( j = 0; j < 2; j++ )
1304     {
1305         const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
1306         _OutputArray _dst = j == 0 ? _mean : _sdv;
1307         if( !_dst.needed() )
1308             continue;
1309
1310         if( !_dst.fixedSize() )
1311             _dst.create(cn, 1, CV_64F, -1, true);
1312         Mat dst = _dst.getMat();
1313         int dcn = (int)dst.total();
1314         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1315                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1316         double* dptr = dst.ptr<double>();
1317         for( k = 0; k < cn; k++ )
1318             dptr[k] = sptr[k];
1319         for( ; k < dcn; k++ )
1320             dptr[k] = 0;
1321     }
1322
1323     return true;
1324 }
1325
1326 }
1327
1328 #endif
1329
1330 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
1331 {
1332     CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1333                ocl_meanStdDev(_src, _mean, _sdv, _mask))
1334
1335     Mat src = _src.getMat(), mask = _mask.getMat();
1336     CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
1337
1338     int k, cn = src.channels(), depth = src.depth();
1339
1340 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1341     size_t total_size = src.total();
1342     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1343     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1344     {
1345         Ipp64f mean_temp[3];
1346         Ipp64f stddev_temp[3];
1347         Ipp64f *pmean = &mean_temp[0];
1348         Ipp64f *pstddev = &stddev_temp[0];
1349         Mat mean, stddev;
1350         int dcn_mean = -1;
1351         if( _mean.needed() )
1352         {
1353             if( !_mean.fixedSize() )
1354                 _mean.create(cn, 1, CV_64F, -1, true);
1355             mean = _mean.getMat();
1356             dcn_mean = (int)mean.total();
1357             pmean = mean.ptr<Ipp64f>();
1358         }
1359         int dcn_stddev = -1;
1360         if( _sdv.needed() )
1361         {
1362             if( !_sdv.fixedSize() )
1363                 _sdv.create(cn, 1, CV_64F, -1, true);
1364             stddev = _sdv.getMat();
1365             dcn_stddev = (int)stddev.total();
1366             pstddev = stddev.ptr<Ipp64f>();
1367         }
1368         for( int c = cn; c < dcn_mean; c++ )
1369             pmean[c] = 0;
1370         for( int c = cn; c < dcn_stddev; c++ )
1371             pstddev[c] = 0;
1372         IppiSize sz = { cols, rows };
1373         int type = src.type();
1374         if( !mask.empty() )
1375         {
1376             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1377             ippiMaskMeanStdDevFuncC1 ippFuncC1 =
1378             type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
1379             type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
1380             type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
1381             0;
1382             if( ippFuncC1 )
1383             {
1384                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
1385                     return;
1386                 setIppErrorStatus();
1387             }
1388             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1389             ippiMaskMeanStdDevFuncC3 ippFuncC3 =
1390             type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
1391             type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
1392             type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
1393             0;
1394             if( ippFuncC3 )
1395             {
1396                 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
1397                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
1398                     ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1399                     return;
1400                 setIppErrorStatus();
1401             }
1402         }
1403         else
1404         {
1405             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1406             ippiMeanStdDevFuncC1 ippFuncC1 =
1407             type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
1408             type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
1409 #if (IPP_VERSION_X100 >= 801)
1410             type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
1411 #endif
1412             0;
1413             if( ippFuncC1 )
1414             {
1415                 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
1416                     return;
1417                 setIppErrorStatus();
1418             }
1419             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1420             ippiMeanStdDevFuncC3 ippFuncC3 =
1421             type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
1422             type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
1423             type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
1424             0;
1425             if( ippFuncC3 )
1426             {
1427                 if( ippFuncC3(src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
1428                     ippFuncC3(src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
1429                     ippFuncC3(src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1430                     return;
1431                 setIppErrorStatus();
1432             }
1433         }
1434     }
1435 #endif
1436
1437
1438     SumSqrFunc func = getSumSqrTab(depth);
1439
1440     CV_Assert( func != 0 );
1441
1442     const Mat* arrays[] = {&src, &mask, 0};
1443     uchar* ptrs[2];
1444     NAryMatIterator it(arrays, ptrs);
1445     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1446     int j, count = 0, nz0 = 0;
1447     AutoBuffer<double> _buf(cn*4);
1448     double *s = (double*)_buf, *sq = s + cn;
1449     int *sbuf = (int*)s, *sqbuf = (int*)sq;
1450     bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
1451     size_t esz = 0;
1452
1453     for( k = 0; k < cn; k++ )
1454         s[k] = sq[k] = 0;
1455
1456     if( blockSum )
1457     {
1458         intSumBlockSize = 1 << 15;
1459         blockSize = std::min(blockSize, intSumBlockSize);
1460         sbuf = (int*)(sq + cn);
1461         if( blockSqSum )
1462             sqbuf = sbuf + cn;
1463         for( k = 0; k < cn; k++ )
1464             sbuf[k] = sqbuf[k] = 0;
1465         esz = src.elemSize();
1466     }
1467
1468     for( size_t i = 0; i < it.nplanes; i++, ++it )
1469     {
1470         for( j = 0; j < total; j += blockSize )
1471         {
1472             int bsz = std::min(total - j, blockSize);
1473             int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
1474             count += nz;
1475             nz0 += nz;
1476             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1477             {
1478                 for( k = 0; k < cn; k++ )
1479                 {
1480                     s[k] += sbuf[k];
1481                     sbuf[k] = 0;
1482                 }
1483                 if( blockSqSum )
1484                 {
1485                     for( k = 0; k < cn; k++ )
1486                     {
1487                         sq[k] += sqbuf[k];
1488                         sqbuf[k] = 0;
1489                     }
1490                 }
1491                 count = 0;
1492             }
1493             ptrs[0] += bsz*esz;
1494             if( ptrs[1] )
1495                 ptrs[1] += bsz;
1496         }
1497     }
1498
1499     double scale = nz0 ? 1./nz0 : 0.;
1500     for( k = 0; k < cn; k++ )
1501     {
1502         s[k] *= scale;
1503         sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
1504     }
1505
1506     for( j = 0; j < 2; j++ )
1507     {
1508         const double* sptr = j == 0 ? s : sq;
1509         _OutputArray _dst = j == 0 ? _mean : _sdv;
1510         if( !_dst.needed() )
1511             continue;
1512
1513         if( !_dst.fixedSize() )
1514             _dst.create(cn, 1, CV_64F, -1, true);
1515         Mat dst = _dst.getMat();
1516         int dcn = (int)dst.total();
1517         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1518                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1519         double* dptr = dst.ptr<double>();
1520         for( k = 0; k < cn; k++ )
1521             dptr[k] = sptr[k];
1522         for( ; k < dcn; k++ )
1523             dptr[k] = 0;
1524     }
1525 }
1526
1527 /****************************************************************************************\
1528 *                                       minMaxLoc                                        *
1529 \****************************************************************************************/
1530
1531 namespace cv
1532 {
1533
1534 template<typename T, typename WT> static void
1535 minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
1536             size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
1537 {
1538     WT minVal = *_minVal, maxVal = *_maxVal;
1539     size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
1540
1541     if( !mask )
1542     {
1543         for( int i = 0; i < len; i++ )
1544         {
1545             T val = src[i];
1546             if( val < minVal )
1547             {
1548                 minVal = val;
1549                 minIdx = startIdx + i;
1550             }
1551             if( val > maxVal )
1552             {
1553                 maxVal = val;
1554                 maxIdx = startIdx + i;
1555             }
1556         }
1557     }
1558     else
1559     {
1560         for( int i = 0; i < len; i++ )
1561         {
1562             T val = src[i];
1563             if( mask[i] && val < minVal )
1564             {
1565                 minVal = val;
1566                 minIdx = startIdx + i;
1567             }
1568             if( mask[i] && val > maxVal )
1569             {
1570                 maxVal = val;
1571                 maxIdx = startIdx + i;
1572             }
1573         }
1574     }
1575
1576     *_minIdx = minIdx;
1577     *_maxIdx = maxIdx;
1578     *_minVal = minVal;
1579     *_maxVal = maxVal;
1580 }
1581
1582 static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
1583                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1584 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1585
1586 static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
1587                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1588 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1589
1590 static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
1591                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1592 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1593
1594 static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
1595                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1596 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1597
1598 static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
1599                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1600 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1601
1602 static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
1603                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1604 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1605
1606 static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
1607                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1608 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1609
1610 typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);
1611
1612 static MinMaxIdxFunc getMinmaxTab(int depth)
1613 {
1614     static MinMaxIdxFunc minmaxTab[] =
1615     {
1616         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
1617         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
1618         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
1619         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
1620         0
1621     };
1622
1623     return minmaxTab[depth];
1624 }
1625
1626 static void ofs2idx(const Mat& a, size_t ofs, int* idx)
1627 {
1628     int i, d = a.dims;
1629     if( ofs > 0 )
1630     {
1631         ofs--;
1632         for( i = d-1; i >= 0; i-- )
1633         {
1634             int sz = a.size[i];
1635             idx[i] = (int)(ofs % sz);
1636             ofs /= sz;
1637         }
1638     }
1639     else
1640     {
1641         for( i = d-1; i >= 0; i-- )
1642             idx[i] = -1;
1643     }
1644 }
1645
1646 #ifdef HAVE_OPENCL
1647
1648 template <typename T>
1649 void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
1650                   int* minLoc, int* maxLoc,
1651                   int groupnum, int cols, double * maxVal2)
1652 {
1653     uint index_max = std::numeric_limits<uint>::max();
1654     T minval = std::numeric_limits<T>::max();
1655     T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
1656     uint minloc = index_max, maxloc = index_max;
1657
1658     int index = 0;
1659     const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
1660     const uint * minlocptr = NULL, * maxlocptr = NULL;
1661     if (minVal || minLoc)
1662     {
1663         minptr = db.ptr<T>();
1664         index += sizeof(T) * groupnum;
1665     }
1666     if (maxVal || maxLoc)
1667     {
1668         maxptr = (const T *)(db.ptr() + index);
1669         index += sizeof(T) * groupnum;
1670     }
1671     if (minLoc)
1672     {
1673         minlocptr = (const uint *)(db.ptr() + index);
1674         index += sizeof(uint) * groupnum;
1675     }
1676     if (maxLoc)
1677     {
1678         maxlocptr = (const uint *)(db.ptr() + index);
1679         index += sizeof(uint) * groupnum;
1680     }
1681     if (maxVal2)
1682         maxptr2 = (const T *)(db.ptr() + index);
1683
1684     for (int i = 0; i < groupnum; i++)
1685     {
1686         if (minptr && minptr[i] <= minval)
1687         {
1688             if (minptr[i] == minval)
1689             {
1690                 if (minlocptr)
1691                     minloc = std::min(minlocptr[i], minloc);
1692             }
1693             else
1694             {
1695                 if (minlocptr)
1696                     minloc = minlocptr[i];
1697                 minval = minptr[i];
1698             }
1699         }
1700         if (maxptr && maxptr[i] >= maxval)
1701         {
1702             if (maxptr[i] == maxval)
1703             {
1704                 if (maxlocptr)
1705                     maxloc = std::min(maxlocptr[i], maxloc);
1706             }
1707             else
1708             {
1709                 if (maxlocptr)
1710                     maxloc = maxlocptr[i];
1711                 maxval = maxptr[i];
1712             }
1713         }
1714         if (maxptr2 && maxptr2[i] > maxval2)
1715             maxval2 = maxptr2[i];
1716     }
1717     bool zero_mask = (minLoc && minloc == index_max) ||
1718             (maxLoc && maxloc == index_max);
1719
1720     if (minVal)
1721         *minVal = zero_mask ? 0 : (double)minval;
1722     if (maxVal)
1723         *maxVal = zero_mask ? 0 : (double)maxval;
1724     if (maxVal2)
1725         *maxVal2 = zero_mask ? 0 : (double)maxval2;
1726
1727     if (minLoc)
1728     {
1729         minLoc[0] = zero_mask ? -1 : minloc / cols;
1730         minLoc[1] = zero_mask ? -1 : minloc % cols;
1731     }
1732     if (maxLoc)
1733     {
1734         maxLoc[0] = zero_mask ? -1 : maxloc / cols;
1735         maxLoc[1] = zero_mask ? -1 : maxloc % cols;
1736     }
1737 }
1738
1739 typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
1740                                  int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
1741
1742 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
1743                            int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
1744 {
1745     const ocl::Device & dev = ocl::Device::getDefault();
1746     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
1747         haveSrc2 = _src2.kind() != _InputArray::NONE;
1748     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
1749             kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));
1750
1751     // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
1752     if ((haveMask || type == CV_32FC1) && dev.isAMD())
1753         return false;
1754
1755     CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
1756               (cn >= 1 && !minLoc && !maxLoc) );
1757
1758     if (ddepth < 0)
1759         ddepth = depth;
1760
1761     CV_Assert(!haveSrc2 || _src2.type() == type);
1762
1763     if (depth == CV_32S)
1764         return false;
1765
1766     if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
1767         return false;
1768
1769     int groupnum = dev.maxComputeUnits();
1770     size_t wgs = dev.maxWorkGroupSize();
1771
1772     int wgs2_aligned = 1;
1773     while (wgs2_aligned < (int)wgs)
1774         wgs2_aligned <<= 1;
1775     wgs2_aligned >>= 1;
1776
1777     bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
1778             needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;
1779
1780     // in case of mask we must know whether mask is filled with zeros or not
1781     // so let's calculate min or max location, if it's undefined, so mask is zeros
1782     if (!(needMaxLoc || needMinLoc) && haveMask)
1783     {
1784         if (needMinVal)
1785             needMinLoc = true;
1786         else
1787             needMaxLoc = true;
1788     }
1789
1790     char cvt[2][40];
1791     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
1792                          " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
1793                          " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s",
1794                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
1795                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
1796                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1797                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
1798                          _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
1799                          needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
1800                          needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
1801                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
1802                          ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
1803                          absValues ? " -D OP_ABS" : "",
1804                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
1805                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
1806                          depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert");
1807
1808     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
1809     if (k.empty())
1810         return false;
1811
1812     int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
1813             dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
1814                                  (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
1815                                  (maxVal2 ? esz : 0));
1816     UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
1817
1818     if (cn > 1 && !haveMask)
1819     {
1820         src = src.reshape(1);
1821         src2 = src2.reshape(1);
1822     }
1823
1824     if (haveSrc2)
1825     {
1826         if (!haveMask)
1827             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1828                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
1829         else
1830             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1831                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
1832                    ocl::KernelArg::ReadOnlyNoSize(src2));
1833     }
1834     else
1835     {
1836         if (!haveMask)
1837             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1838                    groupnum, ocl::KernelArg::PtrWriteOnly(db));
1839         else
1840             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1841                    groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
1842     }
1843
1844     size_t globalsize = groupnum * wgs;
1845     if (!k.run(1, &globalsize, &wgs, true))
1846         return false;
1847
1848     static const getMinMaxResFunc functab[7] =
1849     {
1850         getMinMaxRes<uchar>,
1851         getMinMaxRes<char>,
1852         getMinMaxRes<ushort>,
1853         getMinMaxRes<short>,
1854         getMinMaxRes<int>,
1855         getMinMaxRes<float>,
1856         getMinMaxRes<double>
1857     };
1858
1859     getMinMaxResFunc func = functab[ddepth];
1860
1861     int locTemp[2];
1862     func(db.getMat(ACCESS_READ), minVal, maxVal,
1863          needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
1864          needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
1865          groupnum, src.cols, maxVal2);
1866
1867     return true;
1868 }
1869
1870 #endif
1871
1872 }
1873
1874 void cv::minMaxIdx(InputArray _src, double* minVal,
1875                    double* maxVal, int* minIdx, int* maxIdx,
1876                    InputArray _mask)
1877 {
1878     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1879     CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
1880         (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );
1881
1882     CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
1883                ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
1884
1885     Mat src = _src.getMat(), mask = _mask.getMat();
1886
1887 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1888     size_t total_size = src.total();
1889     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
1890     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1891     {
1892         IppiSize sz = { cols * cn, rows };
1893
1894         if( !mask.empty() )
1895         {
1896             typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int,
1897                                                                      IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1898
1899             CV_SUPPRESS_DEPRECATED_START
1900             ippiMaskMinMaxIndxFuncC1 ippFuncC1 =
1901                 type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR :
1902                 type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR :
1903                 type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR :
1904                 type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0;
1905             CV_SUPPRESS_DEPRECATED_END
1906
1907             if( ippFuncC1 )
1908             {
1909                 Ipp32f min, max;
1910                 IppiPoint minp, maxp;
1911                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1912                 {
1913                     if( minVal )
1914                         *minVal = (double)min;
1915                     if( maxVal )
1916                         *maxVal = (double)max;
1917                     if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.ptr()[0] )
1918                         minp.x = maxp.x = -1;
1919                     if( minIdx )
1920                     {
1921                         size_t minidx = minp.y * cols + minp.x + 1;
1922                         ofs2idx(src, minidx, minIdx);
1923                     }
1924                     if( maxIdx )
1925                     {
1926                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1927                         ofs2idx(src, maxidx, maxIdx);
1928                     }
1929                     return;
1930                 }
1931                 setIppErrorStatus();
1932             }
1933         }
1934         else
1935         {
1936             typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1937
1938             CV_SUPPRESS_DEPRECATED_START
1939             ippiMinMaxIndxFuncC1 ippFuncC1 =
1940                 depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R :
1941                 depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R :
1942                 depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R :
1943                 depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R : 0;
1944             CV_SUPPRESS_DEPRECATED_END
1945
1946             if( ippFuncC1 )
1947             {
1948                 Ipp32f min, max;
1949                 IppiPoint minp, maxp;
1950                 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1951                 {
1952                     if( minVal )
1953                         *minVal = (double)min;
1954                     if( maxVal )
1955                         *maxVal = (double)max;
1956                     if( minIdx )
1957                     {
1958                         size_t minidx = minp.y * cols + minp.x + 1;
1959                         ofs2idx(src, minidx, minIdx);
1960                     }
1961                     if( maxIdx )
1962                     {
1963                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1964                         ofs2idx(src, maxidx, maxIdx);
1965                     }
1966                     return;
1967                 }
1968                 setIppErrorStatus();
1969             }
1970         }
1971     }
1972 #endif
1973
1974     MinMaxIdxFunc func = getMinmaxTab(depth);
1975     CV_Assert( func != 0 );
1976
1977     const Mat* arrays[] = {&src, &mask, 0};
1978     uchar* ptrs[2];
1979     NAryMatIterator it(arrays, ptrs);
1980
1981     size_t minidx = 0, maxidx = 0;
1982     int iminval = INT_MAX, imaxval = INT_MIN;
1983     float fminval = FLT_MAX, fmaxval = -FLT_MAX;
1984     double dminval = DBL_MAX, dmaxval = -DBL_MAX;
1985     size_t startidx = 1;
1986     int *minval = &iminval, *maxval = &imaxval;
1987     int planeSize = (int)it.size*cn;
1988
1989     if( depth == CV_32F )
1990         minval = (int*)&fminval, maxval = (int*)&fmaxval;
1991     else if( depth == CV_64F )
1992         minval = (int*)&dminval, maxval = (int*)&dmaxval;
1993
1994     for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
1995         func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );
1996
1997     if( minidx == 0 )
1998         dminval = dmaxval = 0;
1999     else if( depth == CV_32F )
2000         dminval = fminval, dmaxval = fmaxval;
2001     else if( depth <= CV_32S )
2002         dminval = iminval, dmaxval = imaxval;
2003
2004     if( minVal )
2005         *minVal = dminval;
2006     if( maxVal )
2007         *maxVal = dmaxval;
2008
2009     if( minIdx )
2010         ofs2idx(src, minidx, minIdx);
2011     if( maxIdx )
2012         ofs2idx(src, maxidx, maxIdx);
2013 }
2014
2015 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
2016                     Point* minLoc, Point* maxLoc, InputArray mask )
2017 {
2018     CV_Assert(_img.dims() <= 2);
2019
2020     minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
2021     if( minLoc )
2022         std::swap(minLoc->x, minLoc->y);
2023     if( maxLoc )
2024         std::swap(maxLoc->x, maxLoc->y);
2025 }
2026
2027 /****************************************************************************************\
2028 *                                         norm                                           *
2029 \****************************************************************************************/
2030
2031 namespace cv
2032 {
2033
2034 float normL2Sqr_(const float* a, const float* b, int n)
2035 {
2036     int j = 0; float d = 0.f;
2037 #if CV_SSE
2038     if( USE_SSE2 )
2039     {
2040         float CV_DECL_ALIGNED(16) buf[4];
2041         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
2042
2043         for( ; j <= n - 8; j += 8 )
2044         {
2045             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
2046             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
2047             d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
2048             d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
2049         }
2050         _mm_store_ps(buf, _mm_add_ps(d0, d1));
2051         d = buf[0] + buf[1] + buf[2] + buf[3];
2052     }
2053     else
2054 #endif
2055     {
2056         for( ; j <= n - 4; j += 4 )
2057         {
2058             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
2059             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
2060         }
2061     }
2062
2063     for( ; j < n; j++ )
2064     {
2065         float t = a[j] - b[j];
2066         d += t*t;
2067     }
2068     return d;
2069 }
2070
2071
2072 float normL1_(const float* a, const float* b, int n)
2073 {
2074     int j = 0; float d = 0.f;
2075 #if CV_SSE
2076     if( USE_SSE2 )
2077     {
2078         float CV_DECL_ALIGNED(16) buf[4];
2079         static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
2080         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
2081         __m128 absmask = _mm_load_ps((const float*)absbuf);
2082
2083         for( ; j <= n - 8; j += 8 )
2084         {
2085             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
2086             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
2087             d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
2088             d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
2089         }
2090         _mm_store_ps(buf, _mm_add_ps(d0, d1));
2091         d = buf[0] + buf[1] + buf[2] + buf[3];
2092     }
2093     else
2094 #elif CV_NEON
2095     float32x4_t v_sum = vdupq_n_f32(0.0f);
2096     for ( ; j <= n - 4; j += 4)
2097         v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
2098
2099     float CV_DECL_ALIGNED(16) buf[4];
2100     vst1q_f32(buf, v_sum);
2101     d = buf[0] + buf[1] + buf[2] + buf[3];
2102 #endif
2103     {
2104         for( ; j <= n - 4; j += 4 )
2105         {
2106             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
2107                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
2108         }
2109     }
2110
2111     for( ; j < n; j++ )
2112         d += std::abs(a[j] - b[j]);
2113     return d;
2114 }
2115
2116 int normL1_(const uchar* a, const uchar* b, int n)
2117 {
2118     int j = 0, d = 0;
2119 #if CV_SSE
2120     if( USE_SSE2 )
2121     {
2122         __m128i d0 = _mm_setzero_si128();
2123
2124         for( ; j <= n - 16; j += 16 )
2125         {
2126             __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
2127             __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
2128
2129             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
2130         }
2131
2132         for( ; j <= n - 4; j += 4 )
2133         {
2134             __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
2135             __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
2136
2137             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
2138         }
2139         d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
2140     }
2141     else
2142 #elif CV_NEON
2143     uint32x4_t v_sum = vdupq_n_u32(0.0f);
2144     for ( ; j <= n - 16; j += 16)
2145     {
2146         uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
2147         uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
2148         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
2149         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
2150     }
2151
2152     uint CV_DECL_ALIGNED(16) buf[4];
2153     vst1q_u32(buf, v_sum);
2154     d = buf[0] + buf[1] + buf[2] + buf[3];
2155 #endif
2156     {
2157         for( ; j <= n - 4; j += 4 )
2158         {
2159             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
2160                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
2161         }
2162     }
2163     for( ; j < n; j++ )
2164         d += std::abs(a[j] - b[j]);
2165     return d;
2166 }
2167
2168 static const uchar popCountTable[] =
2169 {
2170     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2171     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2172     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2173     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2174     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2175     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2176     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2177     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
2178 };
2179
2180 static const uchar popCountTable2[] =
2181 {
2182     0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
2183     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
2184     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2185     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2186     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2187     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2188     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2189     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
2190 };
2191
2192 static const uchar popCountTable4[] =
2193 {
2194     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2195     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2196     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2197     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2198     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2199     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2200     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2201     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
2202 };
2203
2204 static int normHamming(const uchar* a, int n)
2205 {
2206     int i = 0, result = 0;
2207 #if CV_NEON
2208     {
2209         uint32x4_t bits = vmovq_n_u32(0);
2210         for (; i <= n - 16; i += 16) {
2211             uint8x16_t A_vec = vld1q_u8 (a + i);
2212             uint8x16_t bitsSet = vcntq_u8 (A_vec);
2213             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
2214             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
2215             bits = vaddq_u32(bits, bitSet4);
2216         }
2217         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
2218         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
2219         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
2220     }
2221 #endif
2222         for( ; i <= n - 4; i += 4 )
2223             result += popCountTable[a[i]] + popCountTable[a[i+1]] +
2224             popCountTable[a[i+2]] + popCountTable[a[i+3]];
2225     for( ; i < n; i++ )
2226         result += popCountTable[a[i]];
2227     return result;
2228 }
2229
2230 int normHamming(const uchar* a, const uchar* b, int n)
2231 {
2232     int i = 0, result = 0;
2233 #if CV_NEON
2234     {
2235         uint32x4_t bits = vmovq_n_u32(0);
2236         for (; i <= n - 16; i += 16) {
2237             uint8x16_t A_vec = vld1q_u8 (a + i);
2238             uint8x16_t B_vec = vld1q_u8 (b + i);
2239             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
2240             uint8x16_t bitsSet = vcntq_u8 (AxorB);
2241             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
2242             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
2243             bits = vaddq_u32(bits, bitSet4);
2244         }
2245         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
2246         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
2247         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
2248     }
2249 #endif
2250         for( ; i <= n - 4; i += 4 )
2251             result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
2252                     popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
2253     for( ; i < n; i++ )
2254         result += popCountTable[a[i] ^ b[i]];
2255     return result;
2256 }
2257
2258 static int normHamming(const uchar* a, int n, int cellSize)
2259 {
2260     if( cellSize == 1 )
2261         return normHamming(a, n);
2262     const uchar* tab = 0;
2263     if( cellSize == 2 )
2264         tab = popCountTable2;
2265     else if( cellSize == 4 )
2266         tab = popCountTable4;
2267     else
2268         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
2269     int i = 0, result = 0;
2270 #if CV_ENABLE_UNROLLED
2271     for( ; i <= n - 4; i += 4 )
2272         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
2273 #endif
2274     for( ; i < n; i++ )
2275         result += tab[a[i]];
2276     return result;
2277 }
2278
2279 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
2280 {
2281     if( cellSize == 1 )
2282         return normHamming(a, b, n);
2283     const uchar* tab = 0;
2284     if( cellSize == 2 )
2285         tab = popCountTable2;
2286     else if( cellSize == 4 )
2287         tab = popCountTable4;
2288     else
2289         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
2290     int i = 0, result = 0;
2291     #if CV_ENABLE_UNROLLED
2292     for( ; i <= n - 4; i += 4 )
2293         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
2294                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
2295     #endif
2296     for( ; i < n; i++ )
2297         result += tab[a[i] ^ b[i]];
2298     return result;
2299 }
2300
2301
2302 template<typename T, typename ST> int
2303 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2304 {
2305     ST result = *_result;
2306     if( !mask )
2307     {
2308         result = std::max(result, normInf<T, ST>(src, len*cn));
2309     }
2310     else
2311     {
2312         for( int i = 0; i < len; i++, src += cn )
2313             if( mask[i] )
2314             {
2315                 for( int k = 0; k < cn; k++ )
2316                     result = std::max(result, ST(std::abs(src[k])));
2317             }
2318     }
2319     *_result = result;
2320     return 0;
2321 }
2322
2323 template<typename T, typename ST> int
2324 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2325 {
2326     ST result = *_result;
2327     if( !mask )
2328     {
2329         result += normL1<T, ST>(src, len*cn);
2330     }
2331     else
2332     {
2333         for( int i = 0; i < len; i++, src += cn )
2334             if( mask[i] )
2335             {
2336                 for( int k = 0; k < cn; k++ )
2337                     result += std::abs(src[k]);
2338             }
2339     }
2340     *_result = result;
2341     return 0;
2342 }
2343
2344 template<typename T, typename ST> int
2345 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2346 {
2347     ST result = *_result;
2348     if( !mask )
2349     {
2350         result += normL2Sqr<T, ST>(src, len*cn);
2351     }
2352     else
2353     {
2354         for( int i = 0; i < len; i++, src += cn )
2355             if( mask[i] )
2356             {
2357                 for( int k = 0; k < cn; k++ )
2358                 {
2359                     T v = src[k];
2360                     result += (ST)v*v;
2361                 }
2362             }
2363     }
2364     *_result = result;
2365     return 0;
2366 }
2367
2368 template<typename T, typename ST> int
2369 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2370 {
2371     ST result = *_result;
2372     if( !mask )
2373     {
2374         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
2375     }
2376     else
2377     {
2378         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2379             if( mask[i] )
2380             {
2381                 for( int k = 0; k < cn; k++ )
2382                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
2383             }
2384     }
2385     *_result = result;
2386     return 0;
2387 }
2388
2389 template<typename T, typename ST> int
2390 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2391 {
2392     ST result = *_result;
2393     if( !mask )
2394     {
2395         result += normL1<T, ST>(src1, src2, len*cn);
2396     }
2397     else
2398     {
2399         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2400             if( mask[i] )
2401             {
2402                 for( int k = 0; k < cn; k++ )
2403                     result += std::abs(src1[k] - src2[k]);
2404             }
2405     }
2406     *_result = result;
2407     return 0;
2408 }
2409
2410 template<typename T, typename ST> int
2411 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2412 {
2413     ST result = *_result;
2414     if( !mask )
2415     {
2416         result += normL2Sqr<T, ST>(src1, src2, len*cn);
2417     }
2418     else
2419     {
2420         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
2421             if( mask[i] )
2422             {
2423                 for( int k = 0; k < cn; k++ )
2424                 {
2425                     ST v = src1[k] - src2[k];
2426                     result += v*v;
2427                 }
2428             }
2429     }
2430     *_result = result;
2431     return 0;
2432 }
2433
2434
2435 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
2436     static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
2437 { return norm##L##_(src, mask, r, len, cn); } \
2438     static int normDiff##L##_##suffix(const type* src1, const type* src2, \
2439     const uchar* mask, ntype* r, int len, int cn) \
2440 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
2441
2442 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
2443     CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
2444     CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
2445     CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
2446
2447 CV_DEF_NORM_ALL(8u, uchar, int, int, int)
2448 CV_DEF_NORM_ALL(8s, schar, int, int, int)
2449 CV_DEF_NORM_ALL(16u, ushort, int, int, double)
2450 CV_DEF_NORM_ALL(16s, short, int, int, double)
2451 CV_DEF_NORM_ALL(32s, int, int, double, double)
2452 CV_DEF_NORM_ALL(32f, float, float, double, double)
2453 CV_DEF_NORM_ALL(64f, double, double, double, double)
2454
2455
2456 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
2457 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
2458
2459 static NormFunc getNormFunc(int normType, int depth)
2460 {
2461     static NormFunc normTab[3][8] =
2462     {
2463         {
2464             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
2465             (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
2466         },
2467         {
2468             (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
2469             (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
2470         },
2471         {
2472             (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
2473             (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
2474         }
2475     };
2476
2477     return normTab[normType][depth];
2478 }
2479
2480 static NormDiffFunc getNormDiffFunc(int normType, int depth)
2481 {
2482     static NormDiffFunc normDiffTab[3][8] =
2483     {
2484         {
2485             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
2486             (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
2487             (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
2488             (NormDiffFunc)normDiffInf_64f, 0
2489         },
2490         {
2491             (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
2492             (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
2493             (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
2494             (NormDiffFunc)normDiffL1_64f, 0
2495         },
2496         {
2497             (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
2498             (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
2499             (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
2500             (NormDiffFunc)normDiffL2_64f, 0
2501         }
2502     };
2503
2504     return normDiffTab[normType][depth];
2505 }
2506
2507 #ifdef HAVE_OPENCL
2508
2509 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
2510 {
2511     const ocl::Device & d = ocl::Device::getDefault();
2512     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2513     bool doubleSupport = d.doubleFPConfig() > 0,
2514             haveMask = _mask.kind() != _InputArray::NONE;
2515
2516     if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
2517          (!doubleSupport && depth == CV_64F))
2518         return false;
2519
2520     UMat src = _src.getUMat();
2521
2522     if (normType == NORM_INF)
2523     {
2524         if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
2525                            std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
2526             return false;
2527     }
2528     else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
2529     {
2530         Scalar sc;
2531         bool unstype = depth == CV_8U || depth == CV_16U;
2532
2533         if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
2534                     OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
2535             return false;
2536
2537         if (!haveMask)
2538             cn = 1;
2539
2540         double s = 0.0;
2541         for (int i = 0; i < cn; ++i)
2542             s += sc[i];
2543
2544         result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
2545     }
2546
2547     return true;
2548 }
2549
2550 #endif
2551
2552 }
2553
2554 double cv::norm( InputArray _src, int normType, InputArray _mask )
2555 {
2556     normType &= NORM_TYPE_MASK;
2557     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
2558                normType == NORM_L2 || normType == NORM_L2SQR ||
2559                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
2560
2561 #ifdef HAVE_OPENCL
2562     double _result = 0;
2563     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
2564                 ocl_norm(_src, normType, _mask, _result),
2565                 _result)
2566 #endif
2567
2568     Mat src = _src.getMat(), mask = _mask.getMat();
2569     int depth = src.depth(), cn = src.channels();
2570
2571 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2572     size_t total_size = src.total();
2573     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
2574
2575     if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
2576         && cols > 0 && (size_t)rows*cols == total_size
2577         && (normType == NORM_INF || normType == NORM_L1 ||
2578             normType == NORM_L2 || normType == NORM_L2SQR) )
2579     {
2580         IppiSize sz = { cols, rows };
2581         int type = src.type();
2582         if( !mask.empty() )
2583         {
2584             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2585             ippiMaskNormFuncC1 ippFuncC1 =
2586                 normType == NORM_INF ?
2587                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
2588                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR :
2589 //                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
2590                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
2591                 0) :
2592             normType == NORM_L1 ?
2593                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
2594                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR :
2595                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
2596                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
2597                 0) :
2598             normType == NORM_L2 || normType == NORM_L2SQR ?
2599                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
2600                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR :
2601                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
2602                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
2603                 0) : 0;
2604             if( ippFuncC1 )
2605             {
2606                 Ipp64f norm;
2607                 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
2608                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2609
2610                 setIppErrorStatus();
2611             }
2612             /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
2613             ippiMaskNormFuncC3 ippFuncC3 =
2614                 normType == NORM_INF ?
2615                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
2616                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR :
2617                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
2618                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
2619                 0) :
2620             normType == NORM_L1 ?
2621                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
2622                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR :
2623                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
2624                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
2625                 0) :
2626             normType == NORM_L2 || normType == NORM_L2SQR ?
2627                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
2628                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR :
2629                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
2630                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
2631                 0) : 0;
2632             if( ippFuncC3 )
2633             {
2634                 Ipp64f norm1, norm2, norm3;
2635                 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
2636                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
2637                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
2638                 {
2639                     Ipp64f norm =
2640                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
2641                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
2642                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
2643                         0;
2644                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2645                 }
2646                 setIppErrorStatus();
2647             }*/
2648         }
2649         else
2650         {
2651             typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2652             typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
2653             ippiNormFuncHint ippFuncHint =
2654                 normType == NORM_L1 ?
2655                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
2656                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
2657                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
2658                 0) :
2659                 normType == NORM_L2 || normType == NORM_L2SQR ?
2660                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
2661                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
2662                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
2663                 0) : 0;
2664             ippiNormFuncNoHint ippFuncNoHint =
2665                 normType == NORM_INF ?
2666                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
2667                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
2668                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
2669                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
2670                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
2671                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
2672                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
2673 #if (IPP_VERSION_X100 >= 801)
2674                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2675                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2676 #endif
2677                 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
2678                 type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
2679                 type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
2680                 0) :
2681                 normType == NORM_L1 ?
2682                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
2683                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
2684                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
2685                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
2686                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
2687                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
2688                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
2689                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
2690                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
2691                 0) :
2692                 normType == NORM_L2 || normType == NORM_L2SQR ?
2693                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
2694                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
2695                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
2696                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
2697                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
2698                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
2699                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
2700                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
2701                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
2702                 0) : 0;
2703             // Make sure only zero or one version of the function pointer is valid
2704             CV_Assert(!ippFuncHint || !ippFuncNoHint);
2705             if( ippFuncHint || ippFuncNoHint )
2706             {
2707                 Ipp64f norm_array[4];
2708                 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
2709                                 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, norm_array);
2710                 if( ret >= 0 )
2711                 {
2712                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
2713                     for( int i = 1; i < cn; i++ )
2714                     {
2715                         norm =
2716                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
2717                             normType == NORM_L1 ? norm + norm_array[i] :
2718                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
2719                             0;
2720                     }
2721                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
2722                 }
2723                 setIppErrorStatus();
2724             }
2725         }
2726     }
2727 #endif
2728
2729     if( src.isContinuous() && mask.empty() )
2730     {
2731         size_t len = src.total()*cn;
2732         if( len == (size_t)(int)len )
2733         {
2734             if( depth == CV_32F )
2735             {
2736                 const float* data = src.ptr<float>();
2737
2738                 if( normType == NORM_L2 )
2739                 {
2740                     double result = 0;
2741                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2742                     return std::sqrt(result);
2743                 }
2744                 if( normType == NORM_L2SQR )
2745                 {
2746                     double result = 0;
2747                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2748                     return result;
2749                 }
2750                 if( normType == NORM_L1 )
2751                 {
2752                     double result = 0;
2753                     GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
2754                     return result;
2755                 }
2756                 if( normType == NORM_INF )
2757                 {
2758                     float result = 0;
2759                     GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
2760                     return result;
2761                 }
2762             }
2763             if( depth == CV_8U )
2764             {
2765                 const uchar* data = src.ptr<uchar>();
2766
2767                 if( normType == NORM_HAMMING )
2768                     return normHamming(data, (int)len);
2769
2770                 if( normType == NORM_HAMMING2 )
2771                     return normHamming(data, (int)len, 2);
2772             }
2773         }
2774     }
2775
2776     CV_Assert( mask.empty() || mask.type() == CV_8U );
2777
2778     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
2779     {
2780         if( !mask.empty() )
2781         {
2782             Mat temp;
2783             bitwise_and(src, mask, temp);
2784             return norm(temp, normType);
2785         }
2786         int cellSize = normType == NORM_HAMMING ? 1 : 2;
2787
2788         const Mat* arrays[] = {&src, 0};
2789         uchar* ptrs[1];
2790         NAryMatIterator it(arrays, ptrs);
2791         int total = (int)it.size;
2792         int result = 0;
2793
2794         for( size_t i = 0; i < it.nplanes; i++, ++it )
2795             result += normHamming(ptrs[0], total, cellSize);
2796
2797         return result;
2798     }
2799
2800     NormFunc func = getNormFunc(normType >> 1, depth);
2801     CV_Assert( func != 0 );
2802
2803     const Mat* arrays[] = {&src, &mask, 0};
2804     uchar* ptrs[2];
2805     union
2806     {
2807         double d;
2808         int i;
2809         float f;
2810     }
2811     result;
2812     result.d = 0;
2813     NAryMatIterator it(arrays, ptrs);
2814     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
2815     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
2816             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
2817     int isum = 0;
2818     int *ibuf = &result.i;
2819     size_t esz = 0;
2820
2821     if( blockSum )
2822     {
2823         intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
2824         blockSize = std::min(blockSize, intSumBlockSize);
2825         ibuf = &isum;
2826         esz = src.elemSize();
2827     }
2828
2829     for( size_t i = 0; i < it.nplanes; i++, ++it )
2830     {
2831         for( j = 0; j < total; j += blockSize )
2832         {
2833             int bsz = std::min(total - j, blockSize);
2834             func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
2835             count += bsz;
2836             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
2837             {
2838                 result.d += isum;
2839                 isum = 0;
2840                 count = 0;
2841             }
2842             ptrs[0] += bsz*esz;
2843             if( ptrs[1] )
2844                 ptrs[1] += bsz;
2845         }
2846     }
2847
2848     if( normType == NORM_INF )
2849     {
2850         if( depth == CV_64F )
2851             ;
2852         else if( depth == CV_32F )
2853             result.d = result.f;
2854         else
2855             result.d = result.i;
2856     }
2857     else if( normType == NORM_L2 )
2858         result.d = std::sqrt(result.d);
2859
2860     return result.d;
2861 }
2862
2863 #ifdef HAVE_OPENCL
2864
2865 namespace cv {
2866
2867 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
2868 {
2869     Scalar sc1, sc2;
2870     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2871     bool relative = (normType & NORM_RELATIVE) != 0;
2872     normType &= ~NORM_RELATIVE;
2873     bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
2874
2875     if (normsum)
2876     {
2877         if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
2878                      OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
2879             return false;
2880     }
2881     else
2882     {
2883         if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
2884                            false, _src2, relative ? &sc2[0] : NULL))
2885             return false;
2886         cn = 1;
2887     }
2888
2889     double s2 = 0;
2890     for (int i = 0; i < cn; ++i)
2891     {
2892         result += sc1[i];
2893         if (relative)
2894             s2 += sc2[i];
2895     }
2896
2897     if (normType == NORM_L2)
2898     {
2899         result = std::sqrt(result);
2900         if (relative)
2901             s2 = std::sqrt(s2);
2902     }
2903
2904     if (relative)
2905         result /= (s2 + DBL_EPSILON);
2906
2907     return true;
2908 }
2909
2910 }
2911
2912 #endif
2913
2914 double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
2915 {
2916     CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );
2917
2918 #ifdef HAVE_OPENCL
2919     double _result = 0;
2920     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
2921                 ocl_norm(_src1, _src2, normType, _mask, _result),
2922                 _result)
2923 #endif
2924
2925     if( normType & CV_RELATIVE )
2926     {
2927 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2928         Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
2929
2930         normType &= NORM_TYPE_MASK;
2931         CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
2932                 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
2933         size_t total_size = src1.total();
2934         int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
2935         if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
2936             && cols > 0 && (size_t)rows*cols == total_size
2937             && (normType == NORM_INF || normType == NORM_L1 ||
2938                 normType == NORM_L2 || normType == NORM_L2SQR) )
2939         {
2940             IppiSize sz = { cols, rows };
2941             int type = src1.type();
2942             if( !mask.empty() )
2943             {
2944                 typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
2945                 ippiMaskNormRelFuncC1 ippFuncC1 =
2946                     normType == NORM_INF ?
2947                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR :
2948 #ifndef __APPLE__
2949                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR :
2950 #endif
2951                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR :
2952                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR :
2953                     0) :
2954                     normType == NORM_L1 ?
2955                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR :
2956 #ifndef __APPLE__
2957                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR :
2958 #endif
2959                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR :
2960                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR :
2961                     0) :
2962                     normType == NORM_L2 || normType == NORM_L2SQR ?
2963                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR :
2964                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR :
2965                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR :
2966                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR :
2967                     0) : 0;
2968                 if( ippFuncC1 )
2969                 {
2970                     Ipp64f norm;
2971                     if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
2972                         return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2973                     setIppErrorStatus();
2974                 }
2975             }
2976             else
2977             {
2978                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2979                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2980                 ippiNormRelFuncNoHint ippFuncNoHint =
2981                     normType == NORM_INF ?
2982                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
2983                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
2984                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
2985                     type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
2986                     0) :
2987                     normType == NORM_L1 ?
2988                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
2989                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
2990                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
2991                     0) :
2992                     normType == NORM_L2 || normType == NORM_L2SQR ?
2993                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
2994                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
2995                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
2996                     0) : 0;
2997                 ippiNormRelFuncHint ippFuncHint =
2998                     normType == NORM_L1 ?
2999                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
3000                     0) :
3001                     normType == NORM_L2 || normType == NORM_L2SQR ?
3002                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
3003                     0) : 0;
3004                 if (ippFuncNoHint)
3005                 {
3006                     Ipp64f norm;
3007                     if( ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm) >= 0 )
3008                         return (double)norm;
3009                     setIppErrorStatus();
3010                 }
3011                 if (ippFuncHint)
3012                 {
3013                     Ipp64f norm;
3014                     if( ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 )
3015                         return (double)norm;
3016                     setIppErrorStatus();
3017                 }
3018             }
3019         }
3020 #endif
3021         return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
3022     }
3023
3024     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3025     int depth = src1.depth(), cn = src1.channels();
3026
3027     normType &= 7;
3028     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
3029                normType == NORM_L2 || normType == NORM_L2SQR ||
3030               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
3031
3032 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
3033     size_t total_size = src1.total();
3034     int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
3035     if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
3036         && cols > 0 && (size_t)rows*cols == total_size
3037         && (normType == NORM_INF || normType == NORM_L1 ||
3038             normType == NORM_L2 || normType == NORM_L2SQR) )
3039     {
3040         IppiSize sz = { cols, rows };
3041         int type = src1.type();
3042         if( !mask.empty() )
3043         {
3044             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
3045             ippiMaskNormDiffFuncC1 ippFuncC1 =
3046                 normType == NORM_INF ?
3047                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
3048                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR :
3049                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
3050                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
3051                 0) :
3052                 normType == NORM_L1 ?
3053                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
3054 #ifndef __APPLE__
3055                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
3056 #endif
3057                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
3058                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
3059                 0) :
3060                 normType == NORM_L2 || normType == NORM_L2SQR ?
3061                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
3062                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR :
3063                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
3064                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
3065                 0) : 0;
3066             if( ippFuncC1 )
3067             {
3068                 Ipp64f norm;
3069                 if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
3070                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
3071                 setIppErrorStatus();
3072             }
3073 #ifndef __APPLE__
3074             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
3075             ippiMaskNormDiffFuncC3 ippFuncC3 =
3076                 normType == NORM_INF ?
3077                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
3078                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR :
3079                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
3080                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
3081                 0) :
3082                 normType == NORM_L1 ?
3083                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
3084                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR :
3085                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
3086                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
3087                 0) :
3088                 normType == NORM_L2 || normType == NORM_L2SQR ?
3089                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
3090                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR :
3091                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
3092                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
3093                 0) : 0;
3094             if( ippFuncC3 )
3095             {
3096                 Ipp64f norm1, norm2, norm3;
3097                 if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
3098                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
3099                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
3100                 {
3101                     Ipp64f norm =
3102                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
3103                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
3104                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
3105                         0;
3106                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
3107                 }
3108                 setIppErrorStatus();
3109             }
3110 #endif
3111         }
3112         else
3113         {
3114             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
3115             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
3116             ippiNormDiffFuncHint ippFuncHint =
3117                 normType == NORM_L1 ?
3118                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
3119                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
3120                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
3121                 0) :
3122                 normType == NORM_L2 || normType == NORM_L2SQR ?
3123                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
3124                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
3125                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
3126                 0) : 0;
3127             ippiNormDiffFuncNoHint ippFuncNoHint =
3128                 normType == NORM_INF ?
3129                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
3130                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
3131                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
3132                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
3133                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
3134                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
3135                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
3136 #if (IPP_VERSION_X100 >= 801)
3137                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
3138                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
3139 #endif
3140                 type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
3141                 type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
3142                 type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
3143                 0) :
3144                 normType == NORM_L1 ?
3145                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
3146                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
3147                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
3148                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
3149                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
3150                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
3151                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
3152                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
3153                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
3154                 0) :
3155                 normType == NORM_L2 || normType == NORM_L2SQR ?
3156                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
3157                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
3158                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
3159                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
3160                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
3161                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
3162                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
3163                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
3164                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
3165                 0) : 0;
3166             // Make sure only zero or one version of the function pointer is valid
3167             CV_Assert(!ippFuncHint || !ippFuncNoHint);
3168             if( ippFuncHint || ippFuncNoHint )
3169             {
3170                 Ipp64f norm_array[4];
3171                 IppStatus ret = ippFuncHint ? ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
3172                                 ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array);
3173                 if( ret >= 0 )
3174                 {
3175                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
3176                     for( int i = 1; i < src1.channels(); i++ )
3177                     {
3178                         norm =
3179                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
3180                             normType == NORM_L1 ? norm + norm_array[i] :
3181                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
3182                             0;
3183                     }
3184                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
3185                 }
3186                 setIppErrorStatus();
3187             }
3188         }
3189     }
3190 #endif
3191
3192     if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
3193     {
3194         size_t len = src1.total()*src1.channels();
3195         if( len == (size_t)(int)len )
3196         {
3197             if( src1.depth() == CV_32F )
3198             {
3199                 const float* data1 = src1.ptr<float>();
3200                 const float* data2 = src2.ptr<float>();
3201
3202                 if( normType == NORM_L2 )
3203                 {
3204                     double result = 0;
3205                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
3206                     return std::sqrt(result);
3207                 }
3208                 if( normType == NORM_L2SQR )
3209                 {
3210                     double result = 0;
3211                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
3212                     return result;
3213                 }
3214                 if( normType == NORM_L1 )
3215                 {
3216                     double result = 0;
3217                     GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
3218                     return result;
3219                 }
3220                 if( normType == NORM_INF )
3221                 {
3222                     float result = 0;
3223                     GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
3224                     return result;
3225                 }
3226             }
3227         }
3228     }
3229
3230     CV_Assert( mask.empty() || mask.type() == CV_8U );
3231
3232     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
3233     {
3234         if( !mask.empty() )
3235         {
3236             Mat temp;
3237             bitwise_xor(src1, src2, temp);
3238             bitwise_and(temp, mask, temp);
3239             return norm(temp, normType);
3240         }
3241         int cellSize = normType == NORM_HAMMING ? 1 : 2;
3242
3243         const Mat* arrays[] = {&src1, &src2, 0};
3244         uchar* ptrs[2];
3245         NAryMatIterator it(arrays, ptrs);
3246         int total = (int)it.size;
3247         int result = 0;
3248
3249         for( size_t i = 0; i < it.nplanes; i++, ++it )
3250             result += normHamming(ptrs[0], ptrs[1], total, cellSize);
3251
3252         return result;
3253     }
3254
3255     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
3256     CV_Assert( func != 0 );
3257
3258     const Mat* arrays[] = {&src1, &src2, &mask, 0};
3259     uchar* ptrs[3];
3260     union
3261     {
3262         double d;
3263         float f;
3264         int i;
3265         unsigned u;
3266     }
3267     result;
3268     result.d = 0;
3269     NAryMatIterator it(arrays, ptrs);
3270     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
3271     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
3272             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
3273     unsigned isum = 0;
3274     unsigned *ibuf = &result.u;
3275     size_t esz = 0;
3276
3277     if( blockSum )
3278     {
3279         intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
3280         blockSize = std::min(blockSize, intSumBlockSize);
3281         ibuf = &isum;
3282         esz = src1.elemSize();
3283     }
3284
3285     for( size_t i = 0; i < it.nplanes; i++, ++it )
3286     {
3287         for( j = 0; j < total; j += blockSize )
3288         {
3289             int bsz = std::min(total - j, blockSize);
3290             func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
3291             count += bsz;
3292             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
3293             {
3294                 result.d += isum;
3295                 isum = 0;
3296                 count = 0;
3297             }
3298             ptrs[0] += bsz*esz;
3299             ptrs[1] += bsz*esz;
3300             if( ptrs[2] )
3301                 ptrs[2] += bsz;
3302         }
3303     }
3304
3305     if( normType == NORM_INF )
3306     {
3307         if( depth == CV_64F )
3308             ;
3309         else if( depth == CV_32F )
3310             result.d = result.f;
3311         else
3312             result.d = result.u;
3313     }
3314     else if( normType == NORM_L2 )
3315         result.d = std::sqrt(result.d);
3316
3317     return result.d;
3318 }
3319
3320
3321 ///////////////////////////////////// batch distance ///////////////////////////////////////
3322
3323 namespace cv
3324 {
3325
3326 template<typename _Tp, typename _Rt>
3327 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
3328                   int nvecs, int len, _Rt* dist, const uchar* mask)
3329 {
3330     step2 /= sizeof(src2[0]);
3331     if( !mask )
3332     {
3333         for( int i = 0; i < nvecs; i++ )
3334             dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
3335     }
3336     else
3337     {
3338         _Rt val0 = std::numeric_limits<_Rt>::max();
3339         for( int i = 0; i < nvecs; i++ )
3340             dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
3341     }
3342 }
3343
3344 template<typename _Tp, typename _Rt>
3345 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
3346                      int nvecs, int len, _Rt* dist, const uchar* mask)
3347 {
3348     step2 /= sizeof(src2[0]);
3349     if( !mask )
3350     {
3351         for( int i = 0; i < nvecs; i++ )
3352             dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
3353     }
3354     else
3355     {
3356         _Rt val0 = std::numeric_limits<_Rt>::max();
3357         for( int i = 0; i < nvecs; i++ )
3358             dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
3359     }
3360 }
3361
3362 template<typename _Tp, typename _Rt>
3363 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
3364                   int nvecs, int len, _Rt* dist, const uchar* mask)
3365 {
3366     step2 /= sizeof(src2[0]);
3367     if( !mask )
3368     {
3369         for( int i = 0; i < nvecs; i++ )
3370             dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
3371     }
3372     else
3373     {
3374         _Rt val0 = std::numeric_limits<_Rt>::max();
3375         for( int i = 0; i < nvecs; i++ )
3376             dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
3377     }
3378 }
3379
3380 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
3381                              int nvecs, int len, int* dist, const uchar* mask)
3382 {
3383     step2 /= sizeof(src2[0]);
3384     if( !mask )
3385     {
3386         for( int i = 0; i < nvecs; i++ )
3387             dist[i] = normHamming(src1, src2 + step2*i, len);
3388     }
3389     else
3390     {
3391         int val0 = INT_MAX;
3392         for( int i = 0; i < nvecs; i++ )
3393             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len) : val0;
3394     }
3395 }
3396
3397 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
3398                               int nvecs, int len, int* dist, const uchar* mask)
3399 {
3400     step2 /= sizeof(src2[0]);
3401     if( !mask )
3402     {
3403         for( int i = 0; i < nvecs; i++ )
3404             dist[i] = normHamming(src1, src2 + step2*i, len, 2);
3405     }
3406     else
3407     {
3408         int val0 = INT_MAX;
3409         for( int i = 0; i < nvecs; i++ )
3410             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len, 2) : val0;
3411     }
3412 }
3413
3414 static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
3415                                int nvecs, int len, int* dist, const uchar* mask)
3416 {
3417     batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
3418 }
3419
3420 static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3421                                int nvecs, int len, float* dist, const uchar* mask)
3422 {
3423     batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3424 }
3425
3426 static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
3427                                   int nvecs, int len, int* dist, const uchar* mask)
3428 {
3429     batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
3430 }
3431
3432 static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3433                                   int nvecs, int len, float* dist, const uchar* mask)
3434 {
3435     batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3436 }
3437
3438 static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
3439                                int nvecs, int len, float* dist, const uchar* mask)
3440 {
3441     batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
3442 }
3443
3444 static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
3445                              int nvecs, int len, float* dist, const uchar* mask)
3446 {
3447     batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3448 }
3449
3450 static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
3451                                 int nvecs, int len, float* dist, const uchar* mask)
3452 {
3453     batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3454 }
3455
3456 static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
3457                              int nvecs, int len, float* dist, const uchar* mask)
3458 {
3459     batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
3460 }
3461
3462 typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
3463                               int nvecs, int len, uchar* dist, const uchar* mask);
3464
3465
3466 struct BatchDistInvoker : public ParallelLoopBody
3467 {
3468     BatchDistInvoker( const Mat& _src1, const Mat& _src2,
3469                       Mat& _dist, Mat& _nidx, int _K,
3470                       const Mat& _mask, int _update,
3471                       BatchDistFunc _func)
3472     {
3473         src1 = &_src1;
3474         src2 = &_src2;
3475         dist = &_dist;
3476         nidx = &_nidx;
3477         K = _K;
3478         mask = &_mask;
3479         update = _update;
3480         func = _func;
3481     }
3482
3483     void operator()(const Range& range) const
3484     {
3485         AutoBuffer<int> buf(src2->rows);
3486         int* bufptr = buf;
3487
3488         for( int i = range.start; i < range.end; i++ )
3489         {
3490             func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
3491                  K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);
3492
3493             if( K > 0 )
3494             {
3495                 int* nidxptr = nidx->ptr<int>(i);
3496                 // since positive float's can be compared just like int's,
3497                 // we handle both CV_32S and CV_32F cases with a single branch
3498                 int* distptr = (int*)dist->ptr(i);
3499
3500                 int j, k;
3501
3502                 for( j = 0; j < src2->rows; j++ )
3503                 {
3504                     int d = bufptr[j];
3505                     if( d < distptr[K-1] )
3506                     {
3507                         for( k = K-2; k >= 0 && distptr[k] > d; k-- )
3508                         {
3509                             nidxptr[k+1] = nidxptr[k];
3510                             distptr[k+1] = distptr[k];
3511                         }
3512                         nidxptr[k+1] = j + update;
3513                         distptr[k+1] = d;
3514                     }
3515                 }
3516             }
3517         }
3518     }
3519
3520     const Mat *src1;
3521     const Mat *src2;
3522     Mat *dist;
3523     Mat *nidx;
3524     const Mat *mask;
3525     int K;
3526     int update;
3527     BatchDistFunc func;
3528 };
3529
3530 }
3531
3532 void cv::batchDistance( InputArray _src1, InputArray _src2,
3533                         OutputArray _dist, int dtype, OutputArray _nidx,
3534                         int normType, int K, InputArray _mask,
3535                         int update, bool crosscheck )
3536 {
3537     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3538     int type = src1.type();
3539     CV_Assert( type == src2.type() && src1.cols == src2.cols &&
3540                (type == CV_32F || type == CV_8U));
3541     CV_Assert( _nidx.needed() == (K > 0) );
3542
3543     if( dtype == -1 )
3544     {
3545         dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
3546     }
3547     CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);
3548
3549     K = std::min(K, src2.rows);
3550
3551     _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
3552     Mat dist = _dist.getMat(), nidx;
3553     if( _nidx.needed() )
3554     {
3555         _nidx.create(dist.size(), CV_32S);
3556         nidx = _nidx.getMat();
3557     }
3558
3559     if( update == 0 && K > 0 )
3560     {
3561         dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
3562         nidx = Scalar::all(-1);
3563     }
3564
3565     if( crosscheck )
3566     {
3567         CV_Assert( K == 1 && update == 0 && mask.empty() );
3568         Mat tdist, tidx;
3569         batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);
3570
3571         // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
3572         // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
3573         // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
3574         // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
3575         // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
3576         // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
3577         if( dtype == CV_32S )
3578         {
3579             for( int i = 0; i < tdist.rows; i++ )
3580             {
3581                 int idx = tidx.at<int>(i);
3582                 int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
3583                 if( d < d0 )
3584                 {
3585                     dist.at<int>(idx) = d;
3586                     nidx.at<int>(idx) = i + update;
3587                 }
3588             }
3589         }
3590         else
3591         {
3592             for( int i = 0; i < tdist.rows; i++ )
3593             {
3594                 int idx = tidx.at<int>(i);
3595                 float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
3596                 if( d < d0 )
3597                 {
3598                     dist.at<float>(idx) = d;
3599                     nidx.at<int>(idx) = i + update;
3600                 }
3601             }
3602         }
3603         return;
3604     }
3605
3606     BatchDistFunc func = 0;
3607     if( type == CV_8U )
3608     {
3609         if( normType == NORM_L1 && dtype == CV_32S )
3610             func = (BatchDistFunc)batchDistL1_8u32s;
3611         else if( normType == NORM_L1 && dtype == CV_32F )
3612             func = (BatchDistFunc)batchDistL1_8u32f;
3613         else if( normType == NORM_L2SQR && dtype == CV_32S )
3614             func = (BatchDistFunc)batchDistL2Sqr_8u32s;
3615         else if( normType == NORM_L2SQR && dtype == CV_32F )
3616             func = (BatchDistFunc)batchDistL2Sqr_8u32f;
3617         else if( normType == NORM_L2 && dtype == CV_32F )
3618             func = (BatchDistFunc)batchDistL2_8u32f;
3619         else if( normType == NORM_HAMMING && dtype == CV_32S )
3620             func = (BatchDistFunc)batchDistHamming;
3621         else if( normType == NORM_HAMMING2 && dtype == CV_32S )
3622             func = (BatchDistFunc)batchDistHamming2;
3623     }
3624     else if( type == CV_32F && dtype == CV_32F )
3625     {
3626         if( normType == NORM_L1 )
3627             func = (BatchDistFunc)batchDistL1_32f;
3628         else if( normType == NORM_L2SQR )
3629             func = (BatchDistFunc)batchDistL2Sqr_32f;
3630         else if( normType == NORM_L2 )
3631             func = (BatchDistFunc)batchDistL2_32f;
3632     }
3633
3634     if( func == 0 )
3635         CV_Error_(CV_StsUnsupportedFormat,
3636                   ("The combination of type=%d, dtype=%d and normType=%d is not supported",
3637                    type, dtype, normType));
3638
3639     parallel_for_(Range(0, src1.rows),
3640                   BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
3641 }
3642
3643
3644 void cv::findNonZero( InputArray _src, OutputArray _idx )
3645 {
3646     Mat src = _src.getMat();
3647     CV_Assert( src.type() == CV_8UC1 );
3648     int n = countNonZero(src);
3649     if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
3650         _idx.release();
3651     _idx.create(n, 1, CV_32SC2);
3652     Mat idx = _idx.getMat();
3653     CV_Assert(idx.isContinuous());
3654     Point* idx_ptr = idx.ptr<Point>();
3655
3656     for( int i = 0; i < src.rows; i++ )
3657     {
3658         const uchar* bin_ptr = src.ptr(i);
3659         for( int j = 0; j < src.cols; j++ )
3660             if( bin_ptr[j] )
3661                 *idx_ptr++ = Point(j, i);
3662     }
3663 }
3664
3665 double cv::PSNR(InputArray _src1, InputArray _src2)
3666 {
3667     CV_Assert( _src1.depth() == CV_8U );
3668     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
3669     return 20*log10(255./(diff+DBL_EPSILON));
3670 }
3671
3672
3673 CV_IMPL CvScalar cvSum( const CvArr* srcarr )
3674 {
3675     cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
3676     if( CV_IS_IMAGE(srcarr) )
3677     {
3678         int coi = cvGetImageCOI((IplImage*)srcarr);
3679         if( coi )
3680         {
3681             CV_Assert( 0 < coi && coi <= 4 );
3682             sum = cv::Scalar(sum[coi-1]);
3683         }
3684     }
3685     return sum;
3686 }
3687
3688 CV_IMPL int cvCountNonZero( const CvArr* imgarr )
3689 {
3690     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3691     if( img.channels() > 1 )
3692         cv::extractImageCOI(imgarr, img);
3693     return countNonZero(img);
3694 }
3695
3696
3697 CV_IMPL  CvScalar
3698 cvAvg( const void* imgarr, const void* maskarr )
3699 {
3700     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3701     cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
3702     if( CV_IS_IMAGE(imgarr) )
3703     {
3704         int coi = cvGetImageCOI((IplImage*)imgarr);
3705         if( coi )
3706         {
3707             CV_Assert( 0 < coi && coi <= 4 );
3708             mean = cv::Scalar(mean[coi-1]);
3709         }
3710     }
3711     return mean;
3712 }
3713
3714
3715 CV_IMPL  void
3716 cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
3717 {
3718     cv::Scalar mean, sdv;
3719
3720     cv::Mat mask;
3721     if( maskarr )
3722         mask = cv::cvarrToMat(maskarr);
3723
3724     cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );
3725
3726     if( CV_IS_IMAGE(imgarr) )
3727     {
3728         int coi = cvGetImageCOI((IplImage*)imgarr);
3729         if( coi )
3730         {
3731             CV_Assert( 0 < coi && coi <= 4 );
3732             mean = cv::Scalar(mean[coi-1]);
3733             sdv = cv::Scalar(sdv[coi-1]);
3734         }
3735     }
3736
3737     if( _mean )
3738         *(cv::Scalar*)_mean = mean;
3739     if( _sdv )
3740         *(cv::Scalar*)_sdv = sdv;
3741 }
3742
3743
3744 CV_IMPL void
3745 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
3746              CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
3747 {
3748     cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
3749     if( maskarr )
3750         mask = cv::cvarrToMat(maskarr);
3751     if( img.channels() > 1 )
3752         cv::extractImageCOI(imgarr, img);
3753
3754     cv::minMaxLoc( img, _minVal, _maxVal,
3755                    (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
3756 }
3757
3758
3759 CV_IMPL  double
3760 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
3761 {
3762     cv::Mat a, mask;
3763     if( !imgA )
3764     {
3765         imgA = imgB;
3766         imgB = 0;
3767     }
3768
3769     a = cv::cvarrToMat(imgA, false, true, 1);
3770     if( maskarr )
3771         mask = cv::cvarrToMat(maskarr);
3772
3773     if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
3774         cv::extractImageCOI(imgA, a);
3775
3776     if( !imgB )
3777         return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);
3778
3779     cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
3780     if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
3781         cv::extractImageCOI(imgB, b);
3782
3783     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
3784 }