modules/core/src/stat.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "precomp.hpp"
  44 #include <climits>
  45 #include <limits>
  46
  47 #include "opencl_kernels.hpp"
  48
  49 namespace cv
  50 {
  51
  52 template<typename T> static inline Scalar rawToScalar(const T& v)
  53 {
  54     Scalar s;
  55     typedef typename DataType<T>::channel_type T1;
  56     int i, n = DataType<T>::channels;
  57     for( i = 0; i < n; i++ )
  58         s.val[i] = ((T1*)&v)[i];
  59     return s;
  60 }
  61
  62 /****************************************************************************************\
  63 *                                        sum                                             *
  64 \****************************************************************************************/
  65
  66 template<typename T, typename ST>
  67 static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
  68 {
  69     const T* src = src0;
  70     if( !mask )
  71     {
  72         int i=0;
  73         int k = cn % 4;
  74         if( k == 1 )
  75         {
  76             ST s0 = dst[0];
  77
  78             #if CV_ENABLE_UNROLLED
  79             for(; i <= len - 4; i += 4, src += cn*4 )
  80                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
  81             #endif
  82             for( ; i < len; i++, src += cn )
  83                 s0 += src[0];
  84             dst[0] = s0;
  85         }
  86         else if( k == 2 )
  87         {
  88             ST s0 = dst[0], s1 = dst[1];
  89             for( i = 0; i < len; i++, src += cn )
  90             {
  91                 s0 += src[0];
  92                 s1 += src[1];
  93             }
  94             dst[0] = s0;
  95             dst[1] = s1;
  96         }
  97         else if( k == 3 )
  98         {
  99             ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 100             for( i = 0; i < len; i++, src += cn )
 101             {
 102                 s0 += src[0];
 103                 s1 += src[1];
 104                 s2 += src[2];
 105             }
 106             dst[0] = s0;
 107             dst[1] = s1;
 108             dst[2] = s2;
 109         }
 110
 111         for( ; k < cn; k += 4 )
 112         {
 113             src = src0 + k;
 114             ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
 115             for( i = 0; i < len; i++, src += cn )
 116             {
 117                 s0 += src[0]; s1 += src[1];
 118                 s2 += src[2]; s3 += src[3];
 119             }
 120             dst[k] = s0;
 121             dst[k+1] = s1;
 122             dst[k+2] = s2;
 123             dst[k+3] = s3;
 124         }
 125         return len;
 126     }
 127
 128     int i, nzm = 0;
 129     if( cn == 1 )
 130     {
 131         ST s = dst[0];
 132         for( i = 0; i < len; i++ )
 133             if( mask[i] )
 134             {
 135                 s += src[i];
 136                 nzm++;
 137             }
 138         dst[0] = s;
 139     }
 140     else if( cn == 3 )
 141     {
 142         ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
 143         for( i = 0; i < len; i++, src += 3 )
 144             if( mask[i] )
 145             {
 146                 s0 += src[0];
 147                 s1 += src[1];
 148                 s2 += src[2];
 149                 nzm++;
 150             }
 151         dst[0] = s0;
 152         dst[1] = s1;
 153         dst[2] = s2;
 154     }
 155     else
 156     {
 157         for( i = 0; i < len; i++, src += cn )
 158             if( mask[i] )
 159             {
 160                 int k = 0;
 161                 #if CV_ENABLE_UNROLLED
 162                 for( ; k <= cn - 4; k += 4 )
 163                 {
 164                     ST s0, s1;
 165                     s0 = dst[k] + src[k];
 166                     s1 = dst[k+1] + src[k+1];
 167                     dst[k] = s0; dst[k+1] = s1;
 168                     s0 = dst[k+2] + src[k+2];
 169                     s1 = dst[k+3] + src[k+3];
 170                     dst[k+2] = s0; dst[k+3] = s1;
 171                 }
 172                 #endif
 173                 for( ; k < cn; k++ )
 174                     dst[k] += src[k];
 175                 nzm++;
 176             }
 177     }
 178     return nzm;
 179 }
 180
 181
 182 static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
 183 { return sum_(src, mask, dst, len, cn); }
 184
 185 static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
 186 { return sum_(src, mask, dst, len, cn); }
 187
 188 static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
 189 { return sum_(src, mask, dst, len, cn); }
 190
 191 static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
 192 { return sum_(src, mask, dst, len, cn); }
 193
 194 static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
 195 { return sum_(src, mask, dst, len, cn); }
 196
 197 static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
 198 { return sum_(src, mask, dst, len, cn); }
 199
 200 static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
 201 { return sum_(src, mask, dst, len, cn); }
 202
 203 typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);
 204
 205 static SumFunc getSumFunc(int depth)
 206 {
 207     static SumFunc sumTab[] =
 208     {
 209         (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
 210         (SumFunc)sum16u, (SumFunc)sum16s,
 211         (SumFunc)sum32s,
 212         (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
 213         0
 214     };
 215
 216     return sumTab[depth];
 217 }
 218
 219 template<typename T>
 220 static int countNonZero_(const T* src, int len )
 221 {
 222     int i=0, nz = 0;
 223     #if CV_ENABLE_UNROLLED
 224     for(; i <= len - 4; i += 4 )
 225         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
 226     #endif
 227     for( ; i < len; i++ )
 228         nz += src[i] != 0;
 229     return nz;
 230 }
 231
 232 static int countNonZero8u( const uchar* src, int len )
 233 {
 234     int i=0, nz = 0;
 235 #if CV_SSE2
 236     if(USE_SSE2)//5x-6x
 237     {
 238         __m128i pattern = _mm_setzero_si128 ();
 239         static uchar tab[256];
 240         static volatile bool initialized = false;
 241         if( !initialized )
 242         {
 243             // we compute inverse popcount table,
 244             // since we pass (img[x] == 0) mask as index in the table.
 245             for( int j = 0; j < 256; j++ )
 246             {
 247                 int val = 0;
 248                 for( int mask = 1; mask < 256; mask += mask )
 249                     val += (j & mask) == 0;
 250                 tab[j] = (uchar)val;
 251             }
 252             initialized = true;
 253         }
 254
 255         for (; i<=len-16; i+=16)
 256         {
 257             __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
 258             int val = _mm_movemask_epi8(_mm_cmpeq_epi8(r0, pattern));
 259             nz += tab[val & 255] + tab[val >> 8];
 260         }
 261     }
 262 #endif
 263     for( ; i < len; i++ )
 264         nz += src[i] != 0;
 265     return nz;
 266 }
 267
 268 static int countNonZero16u( const ushort* src, int len )
 269 { return countNonZero_(src, len); }
 270
 271 static int countNonZero32s( const int* src, int len )
 272 { return countNonZero_(src, len); }
 273
 274 static int countNonZero32f( const float* src, int len )
 275 { return countNonZero_(src, len); }
 276
 277 static int countNonZero64f( const double* src, int len )
 278 { return countNonZero_(src, len); }
 279
 280 typedef int (*CountNonZeroFunc)(const uchar*, int);
 281
 282 static CountNonZeroFunc getCountNonZeroTab(int depth)
 283 {
 284     static CountNonZeroFunc countNonZeroTab[] =
 285     {
 286         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
 287         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
 288         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
 289         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
 290     };
 291
 292     return countNonZeroTab[depth];
 293 }
 294
 295 template<typename T, typename ST, typename SQT>
 296 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
 297 {
 298     const T* src = src0;
 299
 300     if( !mask )
 301     {
 302         int i;
 303         int k = cn % 4;
 304
 305         if( k == 1 )
 306         {
 307             ST s0 = sum[0];
 308             SQT sq0 = sqsum[0];
 309             for( i = 0; i < len; i++, src += cn )
 310             {
 311                 T v = src[0];
 312                 s0 += v; sq0 += (SQT)v*v;
 313             }
 314             sum[0] = s0;
 315             sqsum[0] = sq0;
 316         }
 317         else if( k == 2 )
 318         {
 319             ST s0 = sum[0], s1 = sum[1];
 320             SQT sq0 = sqsum[0], sq1 = sqsum[1];
 321             for( i = 0; i < len; i++, src += cn )
 322             {
 323                 T v0 = src[0], v1 = src[1];
 324                 s0 += v0; sq0 += (SQT)v0*v0;
 325                 s1 += v1; sq1 += (SQT)v1*v1;
 326             }
 327             sum[0] = s0; sum[1] = s1;
 328             sqsum[0] = sq0; sqsum[1] = sq1;
 329         }
 330         else if( k == 3 )
 331         {
 332             ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 333             SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 334             for( i = 0; i < len; i++, src += cn )
 335             {
 336                 T v0 = src[0], v1 = src[1], v2 = src[2];
 337                 s0 += v0; sq0 += (SQT)v0*v0;
 338                 s1 += v1; sq1 += (SQT)v1*v1;
 339                 s2 += v2; sq2 += (SQT)v2*v2;
 340             }
 341             sum[0] = s0; sum[1] = s1; sum[2] = s2;
 342             sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 343         }
 344
 345         for( ; k < cn; k += 4 )
 346         {
 347             src = src0 + k;
 348             ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
 349             SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
 350             for( i = 0; i < len; i++, src += cn )
 351             {
 352                 T v0, v1;
 353                 v0 = src[0], v1 = src[1];
 354                 s0 += v0; sq0 += (SQT)v0*v0;
 355                 s1 += v1; sq1 += (SQT)v1*v1;
 356                 v0 = src[2], v1 = src[3];
 357                 s2 += v0; sq2 += (SQT)v0*v0;
 358                 s3 += v1; sq3 += (SQT)v1*v1;
 359             }
 360             sum[k] = s0; sum[k+1] = s1;
 361             sum[k+2] = s2; sum[k+3] = s3;
 362             sqsum[k] = sq0; sqsum[k+1] = sq1;
 363             sqsum[k+2] = sq2; sqsum[k+3] = sq3;
 364         }
 365         return len;
 366     }
 367
 368     int i, nzm = 0;
 369
 370     if( cn == 1 )
 371     {
 372         ST s0 = sum[0];
 373         SQT sq0 = sqsum[0];
 374         for( i = 0; i < len; i++ )
 375             if( mask[i] )
 376             {
 377                 T v = src[i];
 378                 s0 += v; sq0 += (SQT)v*v;
 379                 nzm++;
 380             }
 381         sum[0] = s0;
 382         sqsum[0] = sq0;
 383     }
 384     else if( cn == 3 )
 385     {
 386         ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
 387         SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
 388         for( i = 0; i < len; i++, src += 3 )
 389             if( mask[i] )
 390             {
 391                 T v0 = src[0], v1 = src[1], v2 = src[2];
 392                 s0 += v0; sq0 += (SQT)v0*v0;
 393                 s1 += v1; sq1 += (SQT)v1*v1;
 394                 s2 += v2; sq2 += (SQT)v2*v2;
 395                 nzm++;
 396             }
 397         sum[0] = s0; sum[1] = s1; sum[2] = s2;
 398         sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
 399     }
 400     else
 401     {
 402         for( i = 0; i < len; i++, src += cn )
 403             if( mask[i] )
 404             {
 405                 for( int k = 0; k < cn; k++ )
 406                 {
 407                     T v = src[k];
 408                     ST s = sum[k] + v;
 409                     SQT sq = sqsum[k] + (SQT)v*v;
 410                     sum[k] = s; sqsum[k] = sq;
 411                 }
 412                 nzm++;
 413             }
 414     }
 415     return nzm;
 416 }
 417
 418
 419 static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 420 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 421
 422 static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
 423 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 424
 425 static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 426 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 427
 428 static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
 429 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 430
 431 static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 432 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 433
 434 static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 435 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 436
 437 static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
 438 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
 439
 440 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
 441
 442 static SumSqrFunc getSumSqrTab(int depth)
 443 {
 444     static SumSqrFunc sumSqrTab[] =
 445     {
 446         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
 447         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
 448     };
 449
 450     return sumSqrTab[depth];
 451 }
 452
 453 #ifdef HAVE_OPENCL
 454
 455 template <typename T> Scalar ocl_part_sum(Mat m)
 456 {
 457     CV_Assert(m.rows == 1);
 458
 459     Scalar s = Scalar::all(0);
 460     int cn = m.channels();
 461     const T * const ptr = m.ptr<T>(0);
 462
 463     for (int x = 0, w = m.cols * cn; x < w; )
 464         for (int c = 0; c < cn; ++c, ++x)
 465             s[c] += ptr[x];
 466
 467     return s;
 468 }
 469
 470 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 471
 472 static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() )
 473 {
 474     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 475
 476     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
 477     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 478
 479     if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
 480         return false;
 481
 482     int dbsize = ocl::Device::getDefault().maxComputeUnits();
 483     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 484
 485     int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
 486             dtype = CV_MAKE_TYPE(ddepth, cn);
 487     bool haveMask = _mask.kind() != _InputArray::NONE;
 488     CV_Assert(!haveMask || _mask.type() == CV_8UC1);
 489
 490     int wgs2_aligned = 1;
 491     while (wgs2_aligned < (int)wgs)
 492         wgs2_aligned <<= 1;
 493     wgs2_aligned >>= 1;
 494
 495     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
 496     char cvt[40];
 497     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
 498                   format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
 499                          " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
 500                          ocl::typeToStr(type), ocl::typeToStr(depth),
 501                          ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
 502                          ocl::convertTypeStr(depth, ddepth, cn, cvt),
 503                          opMap[sum_op], (int)wgs, wgs2_aligned,
 504                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 505                          haveMask ? " -D HAVE_MASK" : ""));
 506     if (k.empty())
 507         return false;
 508
 509     UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat();
 510
 511     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
 512             dbarg = ocl::KernelArg::PtrWriteOnly(db),
 513             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
 514
 515     if (haveMask)
 516         k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg);
 517     else
 518         k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg);
 519
 520     size_t globalsize = dbsize * wgs;
 521     if (k.run(1, &globalsize, &wgs, false))
 522     {
 523         typedef Scalar (*part_sum)(Mat m);
 524         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
 525                 func = funcs[ddepth - CV_32S];
 526         res = func(db.getMat(ACCESS_READ));
 527         return true;
 528     }
 529     return false;
 530 }
 531
 532 #endif
 533
 534 }
 535
 536 cv::Scalar cv::sum( InputArray _src )
 537 {
 538 #ifdef HAVE_OPENCL
 539     Scalar _res;
 540     CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
 541                 ocl_sum(_src, _res, OCL_OP_SUM),
 542                 _res)
 543 #endif
 544
 545     Mat src = _src.getMat();
 546     int k, cn = src.channels(), depth = src.depth();
 547
 548 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 549     size_t total_size = src.total();
 550     int rows = src.size[0], cols = (int)(total_size/rows);
 551     if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
 552     {
 553         IppiSize sz = { cols, rows };
 554         int type = src.type();
 555         typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
 556         typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
 557         ippiSumFuncHint ippFuncHint =
 558             type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
 559             type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
 560             type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
 561             0;
 562         ippiSumFuncNoHint ippFuncNoHint =
 563             type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
 564             type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
 565             type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
 566             type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
 567             type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
 568             type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
 569             type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
 570             type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
 571             type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
 572             0;
 573         CV_Assert(!ippFuncHint || !ippFuncNoHint);
 574         if( ippFuncHint || ippFuncNoHint )
 575         {
 576             Ipp64f res[4];
 577             IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) :
 578                             ippFuncNoHint(src.data, (int)src.step[0], sz, res);
 579             if( ret >= 0 )
 580             {
 581                 Scalar sc;
 582                 for( int i = 0; i < cn; i++ )
 583                     sc[i] = res[i];
 584                 return sc;
 585             }
 586             setIppErrorStatus();
 587         }
 588     }
 589 #endif
 590
 591     SumFunc func = getSumFunc(depth);
 592
 593     CV_Assert( cn <= 4 && func != 0 );
 594
 595     const Mat* arrays[] = {&src, 0};
 596     uchar* ptrs[1];
 597     NAryMatIterator it(arrays, ptrs);
 598     Scalar s;
 599     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
 600     int j, count = 0;
 601     AutoBuffer<int> _buf;
 602     int* buf = (int*)&s[0];
 603     size_t esz = 0;
 604     bool blockSum = depth < CV_32S;
 605
 606     if( blockSum )
 607     {
 608         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
 609         blockSize = std::min(blockSize, intSumBlockSize);
 610         _buf.allocate(cn);
 611         buf = _buf;
 612
 613         for( k = 0; k < cn; k++ )
 614             buf[k] = 0;
 615         esz = src.elemSize();
 616     }
 617
 618     for( size_t i = 0; i < it.nplanes; i++, ++it )
 619     {
 620         for( j = 0; j < total; j += blockSize )
 621         {
 622             int bsz = std::min(total - j, blockSize);
 623             func( ptrs[0], 0, (uchar*)buf, bsz, cn );
 624             count += bsz;
 625             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
 626             {
 627                 for( k = 0; k < cn; k++ )
 628                 {
 629                     s[k] += buf[k];
 630                     buf[k] = 0;
 631                 }
 632                 count = 0;
 633             }
 634             ptrs[0] += bsz*esz;
 635         }
 636     }
 637     return s;
 638 }
 639
 640 #ifdef HAVE_OPENCL
 641
 642 namespace cv {
 643
 644 static bool ocl_countNonZero( InputArray _src, int & res )
 645 {
 646     int type = _src.type(), depth = CV_MAT_DEPTH(type);
 647     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 648
 649     if (depth == CV_64F && !doubleSupport)
 650         return false;
 651
 652     int dbsize = ocl::Device::getDefault().maxComputeUnits();
 653     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 654
 655     int wgs2_aligned = 1;
 656     while (wgs2_aligned < (int)wgs)
 657         wgs2_aligned <<= 1;
 658     wgs2_aligned >>= 1;
 659
 660     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
 661                   format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d -D WGS2_ALIGNED=%d%s",
 662                          ocl::typeToStr(type), (int)wgs,
 663                          wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
 664     if (k.empty())
 665         return false;
 666
 667     UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
 668     k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
 669            dbsize, ocl::KernelArg::PtrWriteOnly(db));
 670
 671     size_t globalsize = dbsize * wgs;
 672     if (k.run(1, &globalsize, &wgs, true))
 673         return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
 674     return false;
 675 }
 676
 677 }
 678
 679 #endif
 680
 681 int cv::countNonZero( InputArray _src )
 682 {
 683     int type = _src.type(), cn = CV_MAT_CN(type);
 684     CV_Assert( cn == 1 );
 685
 686 #ifdef HAVE_OPENCL
 687     int res = -1;
 688     CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
 689                 ocl_countNonZero(_src, res),
 690                 res)
 691 #endif
 692
 693     Mat src = _src.getMat();
 694
 695 #if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0
 696     if (src.dims <= 2 || src.isContinuous())
 697     {
 698         IppiSize roiSize = { src.cols, src.rows };
 699         Ipp32s count = 0, srcstep = (Ipp32s)src.step;
 700         IppStatus status = (IppStatus)-1;
 701
 702         if (src.isContinuous())
 703         {
 704             roiSize.width = (Ipp32s)src.total();
 705             roiSize.height = 1;
 706             srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
 707         }
 708
 709         int depth = CV_MAT_DEPTH(type);
 710         if (depth == CV_8U)
 711             status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
 712         else if (depth == CV_32F)
 713             status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
 714
 715         if (status >= 0)
 716             return (Ipp32s)src.total() - count;
 717         setIppErrorStatus();
 718     }
 719 #endif
 720
 721     CountNonZeroFunc func = getCountNonZeroTab(src.depth());
 722     CV_Assert( func != 0 );
 723
 724     const Mat* arrays[] = {&src, 0};
 725     uchar* ptrs[1];
 726     NAryMatIterator it(arrays, ptrs);
 727     int total = (int)it.size, nz = 0;
 728
 729     for( size_t i = 0; i < it.nplanes; i++, ++it )
 730         nz += func( ptrs[0], total );
 731
 732     return nz;
 733 }
 734
 735 cv::Scalar cv::mean( InputArray _src, InputArray _mask )
 736 {
 737     Mat src = _src.getMat(), mask = _mask.getMat();
 738     CV_Assert( mask.empty() || mask.type() == CV_8U );
 739
 740     int k, cn = src.channels(), depth = src.depth();
 741
 742 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 743     size_t total_size = src.total();
 744     int rows = src.size[0], cols = (int)(total_size/rows);
 745     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
 746     {
 747         IppiSize sz = { cols, rows };
 748         int type = src.type();
 749         if( !mask.empty() )
 750         {
 751             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, void *, int, IppiSize, Ipp64f *);
 752             ippiMaskMeanFuncC1 ippFuncC1 =
 753             type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
 754             type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
 755             type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
 756             0;
 757             if( ippFuncC1 )
 758             {
 759                 Ipp64f res;
 760                 if( ippFuncC1(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, &res) >= 0 )
 761                     return Scalar(res);
 762                 setIppErrorStatus();
 763             }
 764             typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, void *, int, IppiSize, int, Ipp64f *);
 765             ippiMaskMeanFuncC3 ippFuncC3 =
 766             type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
 767             type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
 768             type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
 769             0;
 770             if( ippFuncC3 )
 771             {
 772                 Ipp64f res1, res2, res3;
 773                 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &res1) >= 0 &&
 774                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &res2) >= 0 &&
 775                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &res3) >= 0 )
 776                 {
 777                     return Scalar(res1, res2, res3);
 778                 }
 779                 setIppErrorStatus();
 780             }
 781         }
 782         else
 783         {
 784             typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
 785             typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
 786             ippiMeanFuncHint ippFuncHint =
 787                 type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
 788                 type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
 789                 type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
 790                 0;
 791             ippiMeanFuncNoHint ippFuncNoHint =
 792                 type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
 793                 type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
 794                 type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
 795                 type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
 796                 type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
 797                 type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
 798                 type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
 799                 type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
 800                 type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
 801                 0;
 802             // Make sure only zero or one version of the function pointer is valid
 803             CV_Assert(!ippFuncHint || !ippFuncNoHint);
 804             if( ippFuncHint || ippFuncNoHint )
 805             {
 806                 Ipp64f res[4];
 807                 IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) :
 808                                 ippFuncNoHint(src.data, (int)src.step[0], sz, res);
 809                 if( ret >= 0 )
 810                 {
 811                     Scalar sc;
 812                     for( int i = 0; i < cn; i++ )
 813                         sc[i] = res[i];
 814                     return sc;
 815                 }
 816                 setIppErrorStatus();
 817             }
 818         }
 819     }
 820 #endif
 821
 822     SumFunc func = getSumFunc(depth);
 823
 824     CV_Assert( cn <= 4 && func != 0 );
 825
 826     const Mat* arrays[] = {&src, &mask, 0};
 827     uchar* ptrs[2];
 828     NAryMatIterator it(arrays, ptrs);
 829     Scalar s;
 830     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
 831     int j, count = 0;
 832     AutoBuffer<int> _buf;
 833     int* buf = (int*)&s[0];
 834     bool blockSum = depth <= CV_16S;
 835     size_t esz = 0, nz0 = 0;
 836
 837     if( blockSum )
 838     {
 839         intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
 840         blockSize = std::min(blockSize, intSumBlockSize);
 841         _buf.allocate(cn);
 842         buf = _buf;
 843
 844         for( k = 0; k < cn; k++ )
 845             buf[k] = 0;
 846         esz = src.elemSize();
 847     }
 848
 849     for( size_t i = 0; i < it.nplanes; i++, ++it )
 850     {
 851         for( j = 0; j < total; j += blockSize )
 852         {
 853             int bsz = std::min(total - j, blockSize);
 854             int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
 855             count += nz;
 856             nz0 += nz;
 857             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
 858             {
 859                 for( k = 0; k < cn; k++ )
 860                 {
 861                     s[k] += buf[k];
 862                     buf[k] = 0;
 863                 }
 864                 count = 0;
 865             }
 866             ptrs[0] += bsz*esz;
 867             if( ptrs[1] )
 868                 ptrs[1] += bsz;
 869         }
 870     }
 871     return s*(nz0 ? 1./nz0 : 0);
 872 }
 873
 874 #ifdef HAVE_OPENCL
 875
 876 namespace cv {
 877
 878 static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 879 {
 880     bool haveMask = _mask.kind() != _InputArray::NONE;
 881
 882     Scalar mean, stddev;
 883     if (!ocl_sum(_src, mean, OCL_OP_SUM, _mask))
 884         return false;
 885     if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR, _mask))
 886         return false;
 887
 888     int nz = haveMask ? countNonZero(_mask) : (int)_src.total();
 889     double total = nz != 0 ? 1.0 / nz : 0;
 890     int k, j, cn = _src.channels();
 891     for (int i = 0; i < cn; ++i)
 892     {
 893         mean[i] *= total;
 894         stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
 895     }
 896
 897     for( j = 0; j < 2; j++ )
 898     {
 899         const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
 900         _OutputArray _dst = j == 0 ? _mean : _sdv;
 901         if( !_dst.needed() )
 902             continue;
 903
 904         if( !_dst.fixedSize() )
 905             _dst.create(cn, 1, CV_64F, -1, true);
 906         Mat dst = _dst.getMat();
 907         int dcn = (int)dst.total();
 908         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
 909                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
 910         double* dptr = dst.ptr<double>();
 911         for( k = 0; k < cn; k++ )
 912             dptr[k] = sptr[k];
 913         for( ; k < dcn; k++ )
 914             dptr[k] = 0;
 915     }
 916
 917     return true;
 918 }
 919
 920 }
 921
 922 #endif
 923
 924 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 925 {
 926     CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2,
 927                ocl_meanStdDev(_src, _mean, _sdv, _mask))
 928
 929     Mat src = _src.getMat(), mask = _mask.getMat();
 930     CV_Assert( mask.empty() || mask.type() == CV_8U );
 931
 932     int k, cn = src.channels(), depth = src.depth();
 933
 934 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 935     size_t total_size = src.total();
 936     int rows = src.size[0], cols = (int)(total_size/rows);
 937     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
 938     {
 939         Ipp64f mean_temp[3];
 940         Ipp64f stddev_temp[3];
 941         Ipp64f *pmean = &mean_temp[0];
 942         Ipp64f *pstddev = &stddev_temp[0];
 943         Mat mean, stddev;
 944         int dcn_mean = -1;
 945         if( _mean.needed() )
 946         {
 947             if( !_mean.fixedSize() )
 948                 _mean.create(cn, 1, CV_64F, -1, true);
 949             mean = _mean.getMat();
 950             dcn_mean = (int)mean.total();
 951             pmean = (Ipp64f *)mean.data;
 952         }
 953         int dcn_stddev = -1;
 954         if( _sdv.needed() )
 955         {
 956             if( !_sdv.fixedSize() )
 957                 _sdv.create(cn, 1, CV_64F, -1, true);
 958             stddev = _sdv.getMat();
 959             dcn_stddev = (int)stddev.total();
 960             pstddev = (Ipp64f *)stddev.data;
 961         }
 962         for( int c = cn; c < dcn_mean; c++ )
 963             pmean[c] = 0;
 964         for( int c = cn; c < dcn_stddev; c++ )
 965             pstddev[c] = 0;
 966         IppiSize sz = { cols, rows };
 967         int type = src.type();
 968         if( !mask.empty() )
 969         {
 970             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, void *, int, IppiSize, Ipp64f *, Ipp64f *);
 971             ippiMaskMeanStdDevFuncC1 ippFuncC1 =
 972             type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
 973             type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
 974             type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
 975             0;
 976             if( ippFuncC1 )
 977             {
 978                 if( ippFuncC1(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, pmean, pstddev) >= 0 )
 979                     return;
 980                 setIppErrorStatus();
 981             }
 982             typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
 983             ippiMaskMeanStdDevFuncC3 ippFuncC3 =
 984             type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
 985             type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
 986             type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
 987             0;
 988             if( ippFuncC3 )
 989             {
 990                 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
 991                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
 992                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
 993                     return;
 994                 setIppErrorStatus();
 995             }
 996         }
 997         else
 998         {
 999             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1000             ippiMeanStdDevFuncC1 ippFuncC1 =
1001             type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
1002             type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
1003 #if (IPP_VERSION_X100 >= 801)
1004             type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
1005 #endif
1006             0;
1007             if( ippFuncC1 )
1008             {
1009                 if( ippFuncC1(src.data, (int)src.step[0], sz, pmean, pstddev) >= 0 )
1010                     return;
1011                 setIppErrorStatus();
1012             }
1013             typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1014             ippiMeanStdDevFuncC3 ippFuncC3 =
1015             type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
1016             type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
1017             type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
1018             0;
1019             if( ippFuncC3 )
1020             {
1021                 if( ippFuncC3(src.data, (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
1022                     ippFuncC3(src.data, (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
1023                     ippFuncC3(src.data, (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1024                     return;
1025                 setIppErrorStatus();
1026             }
1027         }
1028     }
1029 #endif
1030
1031
1032     SumSqrFunc func = getSumSqrTab(depth);
1033
1034     CV_Assert( func != 0 );
1035
1036     const Mat* arrays[] = {&src, &mask, 0};
1037     uchar* ptrs[2];
1038     NAryMatIterator it(arrays, ptrs);
1039     int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1040     int j, count = 0, nz0 = 0;
1041     AutoBuffer<double> _buf(cn*4);
1042     double *s = (double*)_buf, *sq = s + cn;
1043     int *sbuf = (int*)s, *sqbuf = (int*)sq;
1044     bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
1045     size_t esz = 0;
1046
1047     for( k = 0; k < cn; k++ )
1048         s[k] = sq[k] = 0;
1049
1050     if( blockSum )
1051     {
1052         intSumBlockSize = 1 << 15;
1053         blockSize = std::min(blockSize, intSumBlockSize);
1054         sbuf = (int*)(sq + cn);
1055         if( blockSqSum )
1056             sqbuf = sbuf + cn;
1057         for( k = 0; k < cn; k++ )
1058             sbuf[k] = sqbuf[k] = 0;
1059         esz = src.elemSize();
1060     }
1061
1062     for( size_t i = 0; i < it.nplanes; i++, ++it )
1063     {
1064         for( j = 0; j < total; j += blockSize )
1065         {
1066             int bsz = std::min(total - j, blockSize);
1067             int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
1068             count += nz;
1069             nz0 += nz;
1070             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1071             {
1072                 for( k = 0; k < cn; k++ )
1073                 {
1074                     s[k] += sbuf[k];
1075                     sbuf[k] = 0;
1076                 }
1077                 if( blockSqSum )
1078                 {
1079                     for( k = 0; k < cn; k++ )
1080                     {
1081                         sq[k] += sqbuf[k];
1082                         sqbuf[k] = 0;
1083                     }
1084                 }
1085                 count = 0;
1086             }
1087             ptrs[0] += bsz*esz;
1088             if( ptrs[1] )
1089                 ptrs[1] += bsz;
1090         }
1091     }
1092
1093     double scale = nz0 ? 1./nz0 : 0.;
1094     for( k = 0; k < cn; k++ )
1095     {
1096         s[k] *= scale;
1097         sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
1098     }
1099
1100     for( j = 0; j < 2; j++ )
1101     {
1102         const double* sptr = j == 0 ? s : sq;
1103         _OutputArray _dst = j == 0 ? _mean : _sdv;
1104         if( !_dst.needed() )
1105             continue;
1106
1107         if( !_dst.fixedSize() )
1108             _dst.create(cn, 1, CV_64F, -1, true);
1109         Mat dst = _dst.getMat();
1110         int dcn = (int)dst.total();
1111         CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1112                    (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1113         double* dptr = dst.ptr<double>();
1114         for( k = 0; k < cn; k++ )
1115             dptr[k] = sptr[k];
1116         for( ; k < dcn; k++ )
1117             dptr[k] = 0;
1118     }
1119 }
1120
1121 /****************************************************************************************\
1122 *                                       minMaxLoc                                        *
1123 \****************************************************************************************/
1124
1125 namespace cv
1126 {
1127
1128 template<typename T, typename WT> static void
1129 minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
1130             size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
1131 {
1132     WT minVal = *_minVal, maxVal = *_maxVal;
1133     size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
1134
1135     if( !mask )
1136     {
1137         for( int i = 0; i < len; i++ )
1138         {
1139             T val = src[i];
1140             if( val < minVal )
1141             {
1142                 minVal = val;
1143                 minIdx = startIdx + i;
1144             }
1145             if( val > maxVal )
1146             {
1147                 maxVal = val;
1148                 maxIdx = startIdx + i;
1149             }
1150         }
1151     }
1152     else
1153     {
1154         for( int i = 0; i < len; i++ )
1155         {
1156             T val = src[i];
1157             if( mask[i] && val < minVal )
1158             {
1159                 minVal = val;
1160                 minIdx = startIdx + i;
1161             }
1162             if( mask[i] && val > maxVal )
1163             {
1164                 maxVal = val;
1165                 maxIdx = startIdx + i;
1166             }
1167         }
1168     }
1169
1170     *_minIdx = minIdx;
1171     *_maxIdx = maxIdx;
1172     *_minVal = minVal;
1173     *_maxVal = maxVal;
1174 }
1175
1176 static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
1177                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1178 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1179
1180 static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
1181                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
1182 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1183
1184 static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
1185                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1186 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1187
1188 static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
1189                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1190 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1191
1192 static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
1193                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1194 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1195
1196 static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
1197                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1198 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1199
1200 static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
1201                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
1202 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
1203
1204 typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);
1205
1206 static MinMaxIdxFunc getMinmaxTab(int depth)
1207 {
1208     static MinMaxIdxFunc minmaxTab[] =
1209     {
1210         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
1211         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
1212         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
1213         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
1214         0
1215     };
1216
1217     return minmaxTab[depth];
1218 }
1219
1220 static void ofs2idx(const Mat& a, size_t ofs, int* idx)
1221 {
1222     int i, d = a.dims;
1223     if( ofs > 0 )
1224     {
1225         ofs--;
1226         for( i = d-1; i >= 0; i-- )
1227         {
1228             int sz = a.size[i];
1229             idx[i] = (int)(ofs % sz);
1230             ofs /= sz;
1231         }
1232     }
1233     else
1234     {
1235         for( i = d-1; i >= 0; i-- )
1236             idx[i] = -1;
1237     }
1238 }
1239
1240 #ifdef HAVE_OPENCL
1241
1242 template <typename T>
1243 void getMinMaxRes(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double* minVal,
1244                   double* maxVal, int* minLoc, int* maxLoc, const int groupnum, const int cn, const int cols)
1245 {
1246     T min = std::numeric_limits<T>::max();
1247     T max = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
1248     int minloc = INT_MAX, maxloc = INT_MAX;
1249     for (int i = 0; i < groupnum; i++)
1250     {
1251         T current_min = minv.at<T>(0,i);
1252         T current_max = maxv.at<T>(0,i);
1253         T oldmin = min, oldmax = max;
1254         min = std::min(min, current_min);
1255         max = std::max(max, current_max);
1256         if (cn == 1)
1257         {
1258             int current_minloc = minl.at<int>(0,i);
1259             int current_maxloc = maxl.at<int>(0,i);
1260             if(current_minloc < 0 || current_maxloc < 0) continue;
1261             minloc = (oldmin == current_min) ? std::min(minloc, current_minloc) : (oldmin < current_min) ? minloc : current_minloc;
1262             maxloc = (oldmax == current_max) ? std::min(maxloc, current_maxloc) : (oldmax > current_max) ? maxloc : current_maxloc;
1263         }
1264     }
1265     bool zero_mask = (maxloc == INT_MAX) || (minloc == INT_MAX);
1266     if (minVal)
1267         *minVal = zero_mask ? 0 : (double)min;
1268     if (maxVal)
1269         *maxVal = zero_mask ? 0 : (double)max;
1270     if (minLoc)
1271     {
1272         minLoc[0] = zero_mask ? -1 : minloc/cols;
1273         minLoc[1] = zero_mask ? -1 : minloc%cols;
1274     }
1275     if (maxLoc)
1276     {
1277         maxLoc[0] = zero_mask ? -1 : maxloc/cols;
1278         maxLoc[1] = zero_mask ? -1 : maxloc%cols;
1279     }
1280 }
1281
1282 typedef void (*getMinMaxResFunc)(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double *minVal,
1283                                  double *maxVal, int *minLoc, int *maxLoc, const int gropunum, const int cn, const int cols);
1284
1285 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask)
1286 {
1287     CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
1288         (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
1289
1290     int type = _src.type(), depth = CV_MAT_DEPTH(type);
1291     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
1292
1293     if (depth == CV_64F && !doubleSupport)
1294         return false;
1295
1296     int groupnum = ocl::Device::getDefault().maxComputeUnits();
1297     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
1298
1299     int wgs2_aligned = 1;
1300     while (wgs2_aligned < (int)wgs)
1301         wgs2_aligned <<= 1;
1302     wgs2_aligned >>= 1;
1303
1304     String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s",
1305                          depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
1306                          wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "");
1307
1308     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
1309     if (k.empty())
1310         return false;
1311
1312     UMat src = _src.getUMat(), minval(1, groupnum, src.type()),
1313         maxval(1, groupnum, src.type()), minloc( 1, groupnum, CV_32SC1),
1314         maxloc( 1, groupnum, CV_32SC1), mask;
1315     if (!_mask.empty())
1316         mask = _mask.getUMat();
1317
1318     if (src.channels() > 1)
1319         src = src.reshape(1);
1320
1321     if (mask.empty())
1322         k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
1323             groupnum, ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval),
1324             ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc));
1325     else
1326         k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), groupnum,
1327             ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval),
1328             ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc), ocl::KernelArg::ReadOnlyNoSize(mask));
1329
1330     size_t globalsize = groupnum * wgs;
1331     if (!k.run(1, &globalsize, &wgs, false))
1332         return false;
1333
1334     Mat minv = minval.getMat(ACCESS_READ), maxv = maxval.getMat(ACCESS_READ),
1335         minl = minloc.getMat(ACCESS_READ), maxl = maxloc.getMat(ACCESS_READ);
1336
1337     static getMinMaxResFunc functab[7] =
1338     {
1339         getMinMaxRes<uchar>,
1340         getMinMaxRes<char>,
1341         getMinMaxRes<ushort>,
1342         getMinMaxRes<short>,
1343         getMinMaxRes<int>,
1344         getMinMaxRes<float>,
1345         getMinMaxRes<double>
1346     };
1347
1348     getMinMaxResFunc func;
1349
1350     func = functab[depth];
1351     func(minv, maxv, minl, maxl, minVal, maxVal, minLoc, maxLoc, groupnum, src.channels(), src.cols);
1352
1353     return true;
1354 }
1355
1356 #endif
1357
1358 }
1359
1360 void cv::minMaxIdx(InputArray _src, double* minVal,
1361                    double* maxVal, int* minIdx, int* maxIdx,
1362                    InputArray _mask)
1363 {
1364     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1365     CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
1366         (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );
1367
1368     CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
1369                ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
1370
1371     Mat src = _src.getMat(), mask = _mask.getMat();
1372
1373 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
1374     size_t total_size = src.total();
1375     int rows = src.size[0], cols = (int)(total_size/rows);
1376     if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1377     {
1378         IppiSize sz = { cols * cn, rows };
1379
1380         if( !mask.empty() )
1381         {
1382             typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int,
1383                                                                      IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1384
1385             CV_SUPPRESS_DEPRECATED_START
1386             ippiMaskMinMaxIndxFuncC1 ippFuncC1 =
1387                 type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR :
1388                 type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR :
1389                 type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR :
1390                 type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0;
1391             CV_SUPPRESS_DEPRECATED_END
1392
1393             if( ippFuncC1 )
1394             {
1395                 Ipp32f min, max;
1396                 IppiPoint minp, maxp;
1397                 if( ippFuncC1(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1398                 {
1399                     if( minVal )
1400                         *minVal = (double)min;
1401                     if( maxVal )
1402                         *maxVal = (double)max;
1403                     if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.data[0] )
1404                         minp.x = maxp.x = -1;
1405                     if( minIdx )
1406                     {
1407                         size_t minidx = minp.y * cols + minp.x + 1;
1408                         ofs2idx(src, minidx, minIdx);
1409                     }
1410                     if( maxIdx )
1411                     {
1412                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1413                         ofs2idx(src, maxidx, maxIdx);
1414                     }
1415                     return;
1416                 }
1417                 setIppErrorStatus();
1418             }
1419         }
1420         else
1421         {
1422             typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
1423
1424             CV_SUPPRESS_DEPRECATED_START
1425             ippiMinMaxIndxFuncC1 ippFuncC1 =
1426                 depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R :
1427                 depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R :
1428                 depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R :
1429                 depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R : 0;
1430             CV_SUPPRESS_DEPRECATED_END
1431
1432             if( ippFuncC1 )
1433             {
1434                 Ipp32f min, max;
1435                 IppiPoint minp, maxp;
1436                 if( ippFuncC1(src.data, (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
1437                 {
1438                     if( minVal )
1439                         *minVal = (double)min;
1440                     if( maxVal )
1441                         *maxVal = (double)max;
1442                     if( minIdx )
1443                     {
1444                         size_t minidx = minp.y * cols + minp.x + 1;
1445                         ofs2idx(src, minidx, minIdx);
1446                     }
1447                     if( maxIdx )
1448                     {
1449                         size_t maxidx = maxp.y * cols + maxp.x + 1;
1450                         ofs2idx(src, maxidx, maxIdx);
1451                     }
1452                     return;
1453                 }
1454                 setIppErrorStatus();
1455             }
1456         }
1457     }
1458 #endif
1459
1460     MinMaxIdxFunc func = getMinmaxTab(depth);
1461     CV_Assert( func != 0 );
1462
1463     const Mat* arrays[] = {&src, &mask, 0};
1464     uchar* ptrs[2];
1465     NAryMatIterator it(arrays, ptrs);
1466
1467     size_t minidx = 0, maxidx = 0;
1468     int iminval = INT_MAX, imaxval = INT_MIN;
1469     float fminval = FLT_MAX, fmaxval = -FLT_MAX;
1470     double dminval = DBL_MAX, dmaxval = -DBL_MAX;
1471     size_t startidx = 1;
1472     int *minval = &iminval, *maxval = &imaxval;
1473     int planeSize = (int)it.size*cn;
1474
1475     if( depth == CV_32F )
1476         minval = (int*)&fminval, maxval = (int*)&fmaxval;
1477     else if( depth == CV_64F )
1478         minval = (int*)&dminval, maxval = (int*)&dmaxval;
1479
1480     for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
1481         func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );
1482
1483     if( minidx == 0 )
1484         dminval = dmaxval = 0;
1485     else if( depth == CV_32F )
1486         dminval = fminval, dmaxval = fmaxval;
1487     else if( depth <= CV_32S )
1488         dminval = iminval, dmaxval = imaxval;
1489
1490     if( minVal )
1491         *minVal = dminval;
1492     if( maxVal )
1493         *maxVal = dmaxval;
1494
1495     if( minIdx )
1496         ofs2idx(src, minidx, minIdx);
1497     if( maxIdx )
1498         ofs2idx(src, maxidx, maxIdx);
1499 }
1500
1501 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
1502                     Point* minLoc, Point* maxLoc, InputArray mask )
1503 {
1504     CV_Assert(_img.dims() <= 2);
1505
1506     minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
1507     if( minLoc )
1508         std::swap(minLoc->x, minLoc->y);
1509     if( maxLoc )
1510         std::swap(maxLoc->x, maxLoc->y);
1511 }
1512
1513 /****************************************************************************************\
1514 *                                         norm                                           *
1515 \****************************************************************************************/
1516
1517 namespace cv
1518 {
1519
1520 float normL2Sqr_(const float* a, const float* b, int n)
1521 {
1522     int j = 0; float d = 0.f;
1523 #if CV_SSE
1524     if( USE_SSE2 )
1525     {
1526         float CV_DECL_ALIGNED(16) buf[4];
1527         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
1528
1529         for( ; j <= n - 8; j += 8 )
1530         {
1531             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
1532             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
1533             d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
1534             d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
1535         }
1536         _mm_store_ps(buf, _mm_add_ps(d0, d1));
1537         d = buf[0] + buf[1] + buf[2] + buf[3];
1538     }
1539     else
1540 #endif
1541     {
1542         for( ; j <= n - 4; j += 4 )
1543         {
1544             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
1545             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
1546         }
1547     }
1548
1549     for( ; j < n; j++ )
1550     {
1551         float t = a[j] - b[j];
1552         d += t*t;
1553     }
1554     return d;
1555 }
1556
1557
1558 float normL1_(const float* a, const float* b, int n)
1559 {
1560     int j = 0; float d = 0.f;
1561 #if CV_SSE
1562     if( USE_SSE2 )
1563     {
1564         float CV_DECL_ALIGNED(16) buf[4];
1565         static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
1566         __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
1567         __m128 absmask = _mm_load_ps((const float*)absbuf);
1568
1569         for( ; j <= n - 8; j += 8 )
1570         {
1571             __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
1572             __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
1573             d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
1574             d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
1575         }
1576         _mm_store_ps(buf, _mm_add_ps(d0, d1));
1577         d = buf[0] + buf[1] + buf[2] + buf[3];
1578     }
1579     else
1580 #endif
1581     {
1582         for( ; j <= n - 4; j += 4 )
1583         {
1584             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
1585                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
1586         }
1587     }
1588
1589     for( ; j < n; j++ )
1590         d += std::abs(a[j] - b[j]);
1591     return d;
1592 }
1593
1594 int normL1_(const uchar* a, const uchar* b, int n)
1595 {
1596     int j = 0, d = 0;
1597 #if CV_SSE
1598     if( USE_SSE2 )
1599     {
1600         __m128i d0 = _mm_setzero_si128();
1601
1602         for( ; j <= n - 16; j += 16 )
1603         {
1604             __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
1605             __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
1606
1607             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
1608         }
1609
1610         for( ; j <= n - 4; j += 4 )
1611         {
1612             __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
1613             __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
1614
1615             d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
1616         }
1617         d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
1618     }
1619     else
1620 #endif
1621     {
1622         for( ; j <= n - 4; j += 4 )
1623         {
1624             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
1625                     std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
1626         }
1627     }
1628     for( ; j < n; j++ )
1629         d += std::abs(a[j] - b[j]);
1630     return d;
1631 }
1632
1633 static const uchar popCountTable[] =
1634 {
1635     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1636     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1637     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1638     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1639     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1640     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1641     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1642     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
1643 };
1644
1645 static const uchar popCountTable2[] =
1646 {
1647     0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1648     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1649     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1650     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1651     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1652     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1653     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1654     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
1655 };
1656
1657 static const uchar popCountTable4[] =
1658 {
1659     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1660     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1661     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1662     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1663     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1664     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1665     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1666     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
1667 };
1668
1669 static int normHamming(const uchar* a, int n)
1670 {
1671     int i = 0, result = 0;
1672 #if CV_NEON
1673     {
1674         uint32x4_t bits = vmovq_n_u32(0);
1675         for (; i <= n - 16; i += 16) {
1676             uint8x16_t A_vec = vld1q_u8 (a + i);
1677             uint8x16_t bitsSet = vcntq_u8 (A_vec);
1678             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
1679             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
1680             bits = vaddq_u32(bits, bitSet4);
1681         }
1682         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
1683         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
1684         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
1685     }
1686 #endif
1687         for( ; i <= n - 4; i += 4 )
1688             result += popCountTable[a[i]] + popCountTable[a[i+1]] +
1689             popCountTable[a[i+2]] + popCountTable[a[i+3]];
1690     for( ; i < n; i++ )
1691         result += popCountTable[a[i]];
1692     return result;
1693 }
1694
1695 int normHamming(const uchar* a, const uchar* b, int n)
1696 {
1697     int i = 0, result = 0;
1698 #if CV_NEON
1699     {
1700         uint32x4_t bits = vmovq_n_u32(0);
1701         for (; i <= n - 16; i += 16) {
1702             uint8x16_t A_vec = vld1q_u8 (a + i);
1703             uint8x16_t B_vec = vld1q_u8 (b + i);
1704             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
1705             uint8x16_t bitsSet = vcntq_u8 (AxorB);
1706             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
1707             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
1708             bits = vaddq_u32(bits, bitSet4);
1709         }
1710         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
1711         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
1712         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
1713     }
1714 #endif
1715         for( ; i <= n - 4; i += 4 )
1716             result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
1717                     popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
1718     for( ; i < n; i++ )
1719         result += popCountTable[a[i] ^ b[i]];
1720     return result;
1721 }
1722
1723 static int normHamming(const uchar* a, int n, int cellSize)
1724 {
1725     if( cellSize == 1 )
1726         return normHamming(a, n);
1727     const uchar* tab = 0;
1728     if( cellSize == 2 )
1729         tab = popCountTable2;
1730     else if( cellSize == 4 )
1731         tab = popCountTable4;
1732     else
1733         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
1734     int i = 0, result = 0;
1735 #if CV_ENABLE_UNROLLED
1736     for( ; i <= n - 4; i += 4 )
1737         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
1738 #endif
1739     for( ; i < n; i++ )
1740         result += tab[a[i]];
1741     return result;
1742 }
1743
1744 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
1745 {
1746     if( cellSize == 1 )
1747         return normHamming(a, b, n);
1748     const uchar* tab = 0;
1749     if( cellSize == 2 )
1750         tab = popCountTable2;
1751     else if( cellSize == 4 )
1752         tab = popCountTable4;
1753     else
1754         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
1755     int i = 0, result = 0;
1756     #if CV_ENABLE_UNROLLED
1757     for( ; i <= n - 4; i += 4 )
1758         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
1759                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
1760     #endif
1761     for( ; i < n; i++ )
1762         result += tab[a[i] ^ b[i]];
1763     return result;
1764 }
1765
1766
1767 template<typename T, typename ST> int
1768 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
1769 {
1770     ST result = *_result;
1771     if( !mask )
1772     {
1773         result = std::max(result, normInf<T, ST>(src, len*cn));
1774     }
1775     else
1776     {
1777         for( int i = 0; i < len; i++, src += cn )
1778             if( mask[i] )
1779             {
1780                 for( int k = 0; k < cn; k++ )
1781                     result = std::max(result, ST(std::abs(src[k])));
1782             }
1783     }
1784     *_result = result;
1785     return 0;
1786 }
1787
1788 template<typename T, typename ST> int
1789 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
1790 {
1791     ST result = *_result;
1792     if( !mask )
1793     {
1794         result += normL1<T, ST>(src, len*cn);
1795     }
1796     else
1797     {
1798         for( int i = 0; i < len; i++, src += cn )
1799             if( mask[i] )
1800             {
1801                 for( int k = 0; k < cn; k++ )
1802                     result += std::abs(src[k]);
1803             }
1804     }
1805     *_result = result;
1806     return 0;
1807 }
1808
1809 template<typename T, typename ST> int
1810 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
1811 {
1812     ST result = *_result;
1813     if( !mask )
1814     {
1815         result += normL2Sqr<T, ST>(src, len*cn);
1816     }
1817     else
1818     {
1819         for( int i = 0; i < len; i++, src += cn )
1820             if( mask[i] )
1821             {
1822                 for( int k = 0; k < cn; k++ )
1823                 {
1824                     T v = src[k];
1825                     result += (ST)v*v;
1826                 }
1827             }
1828     }
1829     *_result = result;
1830     return 0;
1831 }
1832
1833 template<typename T, typename ST> int
1834 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
1835 {
1836     ST result = *_result;
1837     if( !mask )
1838     {
1839         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
1840     }
1841     else
1842     {
1843         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
1844             if( mask[i] )
1845             {
1846                 for( int k = 0; k < cn; k++ )
1847                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
1848             }
1849     }
1850     *_result = result;
1851     return 0;
1852 }
1853
1854 template<typename T, typename ST> int
1855 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
1856 {
1857     ST result = *_result;
1858     if( !mask )
1859     {
1860         result += normL1<T, ST>(src1, src2, len*cn);
1861     }
1862     else
1863     {
1864         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
1865             if( mask[i] )
1866             {
1867                 for( int k = 0; k < cn; k++ )
1868                     result += std::abs(src1[k] - src2[k]);
1869             }
1870     }
1871     *_result = result;
1872     return 0;
1873 }
1874
1875 template<typename T, typename ST> int
1876 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
1877 {
1878     ST result = *_result;
1879     if( !mask )
1880     {
1881         result += normL2Sqr<T, ST>(src1, src2, len*cn);
1882     }
1883     else
1884     {
1885         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
1886             if( mask[i] )
1887             {
1888                 for( int k = 0; k < cn; k++ )
1889                 {
1890                     ST v = src1[k] - src2[k];
1891                     result += v*v;
1892                 }
1893             }
1894     }
1895     *_result = result;
1896     return 0;
1897 }
1898
1899
1900 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
1901     static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
1902 { return norm##L##_(src, mask, r, len, cn); } \
1903     static int normDiff##L##_##suffix(const type* src1, const type* src2, \
1904     const uchar* mask, ntype* r, int len, int cn) \
1905 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
1906
1907 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
1908     CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
1909     CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
1910     CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
1911
1912 CV_DEF_NORM_ALL(8u, uchar, int, int, int)
1913 CV_DEF_NORM_ALL(8s, schar, int, int, int)
1914 CV_DEF_NORM_ALL(16u, ushort, int, int, double)
1915 CV_DEF_NORM_ALL(16s, short, int, int, double)
1916 CV_DEF_NORM_ALL(32s, int, int, double, double)
1917 CV_DEF_NORM_ALL(32f, float, float, double, double)
1918 CV_DEF_NORM_ALL(64f, double, double, double, double)
1919
1920
1921 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
1922 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
1923
1924 static NormFunc getNormFunc(int normType, int depth)
1925 {
1926     static NormFunc normTab[3][8] =
1927     {
1928         {
1929             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
1930             (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
1931         },
1932         {
1933             (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
1934             (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
1935         },
1936         {
1937             (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
1938             (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
1939         }
1940     };
1941
1942     return normTab[normType][depth];
1943 }
1944
1945 static NormDiffFunc getNormDiffFunc(int normType, int depth)
1946 {
1947     static NormDiffFunc normDiffTab[3][8] =
1948     {
1949         {
1950             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
1951             (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
1952             (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
1953             (NormDiffFunc)normDiffInf_64f, 0
1954         },
1955         {
1956             (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
1957             (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
1958             (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
1959             (NormDiffFunc)normDiffL1_64f, 0
1960         },
1961         {
1962             (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
1963             (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
1964             (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
1965             (NormDiffFunc)normDiffL2_64f, 0
1966         }
1967     };
1968
1969     return normDiffTab[normType][depth];
1970 }
1971
1972 #ifdef HAVE_OPENCL
1973
1974 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
1975 {
1976     const ocl::Device & d = ocl::Device::getDefault();
1977     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1978     bool doubleSupport = d.doubleFPConfig() > 0,
1979             haveMask = _mask.kind() != _InputArray::NONE;
1980
1981     if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
1982          (!doubleSupport && depth == CV_64F))
1983         return false;
1984
1985     UMat src = _src.getUMat();
1986
1987     if (normType == NORM_INF)
1988     {
1989         if (cn == 1 || !haveMask)
1990         {
1991             UMat abssrc;
1992
1993             if (depth != CV_8U && depth != CV_16U)
1994             {
1995                 int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1;
1996                 char cvt[50];
1997
1998                 ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc,
1999                                  format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s"
2000                                         " -D convertToDT=%s -D rowsPerWI=%d%s",
2001                                         ocl::typeToStr(wdepth), ocl::typeToStr(depth),
2002                                         ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
2003                                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
2004                 if (kabs.empty())
2005                     return false;
2006
2007                 abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn));
2008                 kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn));
2009
2010                 size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI };
2011                 if (!kabs.run(2, globalsize, NULL, false))
2012                     return false;
2013             }
2014             else
2015                 abssrc = src;
2016
2017             cv::minMaxIdx(haveMask ? abssrc : abssrc.reshape(1), NULL, &result, NULL, NULL, _mask);
2018         }
2019         else
2020         {
2021             int dbsize = d.maxComputeUnits();
2022             size_t wgs = d.maxWorkGroupSize();
2023
2024             int wgs2_aligned = 1;
2025             while (wgs2_aligned < (int)wgs)
2026                 wgs2_aligned <<= 1;
2027             wgs2_aligned >>= 1;
2028
2029             ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
2030                           format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d"
2031                                  " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s",
2032                                  depth, ocl::typeToStr(type), ocl::typeToStr(depth),
2033                                  wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
2034             if (k.empty())
2035                 return false;
2036
2037             UMat db(1, dbsize, type), mask = _mask.getUMat();
2038             k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
2039                    dbsize, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
2040
2041             size_t globalsize = dbsize * wgs;
2042             if (!k.run(1, &globalsize, &wgs, true))
2043                 return false;
2044
2045             minMaxIdx(db.getMat(ACCESS_READ), NULL, &result, NULL, NULL, noArray());
2046         }
2047     }
2048     else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
2049     {
2050         Scalar sc;
2051         bool unstype = depth == CV_8U || depth == CV_16U;
2052
2053         if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
2054                     OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
2055             return false;
2056
2057         if (!haveMask)
2058             cn = 1;
2059
2060         double s = 0.0;
2061         for (int i = 0; i < cn; ++i)
2062             s += sc[i];
2063
2064         result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
2065     }
2066
2067     return true;
2068 }
2069
2070 #endif
2071
2072 }
2073
2074 double cv::norm( InputArray _src, int normType, InputArray _mask )
2075 {
2076     normType &= NORM_TYPE_MASK;
2077     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
2078                normType == NORM_L2 || normType == NORM_L2SQR ||
2079                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
2080
2081 #ifdef HAVE_OPENCL
2082     double _result = 0;
2083     CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
2084                 ocl_norm(_src, normType, _mask, _result),
2085                 _result)
2086 #endif
2087
2088     Mat src = _src.getMat(), mask = _mask.getMat();
2089     int depth = src.depth(), cn = src.channels();
2090
2091 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2092     size_t total_size = src.total();
2093     int rows = src.size[0], cols = (int)(total_size/rows);
2094
2095     if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
2096         && cols > 0 && (size_t)rows*cols == total_size
2097         && (normType == NORM_INF || normType == NORM_L1 ||
2098             normType == NORM_L2 || normType == NORM_L2SQR) )
2099     {
2100         IppiSize sz = { cols, rows };
2101         int type = src.type();
2102         if( !mask.empty() )
2103         {
2104             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2105             ippiMaskNormFuncC1 ippFuncC1 =
2106                 normType == NORM_INF ?
2107                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
2108                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR :
2109 //                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
2110                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
2111                 0) :
2112             normType == NORM_L1 ?
2113                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
2114                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR :
2115                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
2116                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
2117                 0) :
2118             normType == NORM_L2 || normType == NORM_L2SQR ?
2119                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
2120                 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR :
2121                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
2122                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
2123                 0) : 0;
2124             if( ippFuncC1 )
2125             {
2126                 Ipp64f norm;
2127                 if( ippFuncC1(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, &norm) >= 0 )
2128                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2129
2130                 setIppErrorStatus();
2131             }
2132             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
2133             ippiMaskNormFuncC3 ippFuncC3 =
2134                 normType == NORM_INF ?
2135                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
2136                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR :
2137                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
2138                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
2139                 0) :
2140             normType == NORM_L1 ?
2141                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
2142                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR :
2143                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
2144                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
2145                 0) :
2146             normType == NORM_L2 || normType == NORM_L2SQR ?
2147                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
2148                 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR :
2149                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
2150                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
2151                 0) : 0;
2152             if( ippFuncC3 )
2153             {
2154                 Ipp64f norm1, norm2, norm3;
2155                 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
2156                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
2157                     ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
2158                 {
2159                     Ipp64f norm =
2160                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
2161                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
2162                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
2163                         0;
2164                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2165                 }
2166                 setIppErrorStatus();
2167             }
2168         }
2169         else
2170         {
2171             typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2172             typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
2173             ippiNormFuncHint ippFuncHint =
2174                 normType == NORM_L1 ?
2175                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
2176                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
2177                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
2178                 0) :
2179                 normType == NORM_L2 || normType == NORM_L2SQR ?
2180                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
2181                 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
2182                 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
2183                 0) : 0;
2184             ippiNormFuncNoHint ippFuncNoHint =
2185                 normType == NORM_INF ?
2186                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
2187                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
2188                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
2189                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
2190                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
2191                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
2192                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
2193 #if (IPP_VERSION_X100 >= 801)
2194                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2195                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2196 #endif
2197                 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
2198                 type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
2199                 type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
2200                 0) :
2201                 normType == NORM_L1 ?
2202                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
2203                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
2204                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
2205                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
2206                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
2207                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
2208                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
2209                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
2210                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
2211                 0) :
2212                 normType == NORM_L2 || normType == NORM_L2SQR ?
2213                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
2214                 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
2215                 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
2216                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
2217                 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
2218                 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
2219                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
2220                 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
2221                 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
2222                 0) : 0;
2223             // Make sure only zero or one version of the function pointer is valid
2224             CV_Assert(!ippFuncHint || !ippFuncNoHint);
2225             if( ippFuncHint || ippFuncNoHint )
2226             {
2227                 Ipp64f norm_array[4];
2228                 IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
2229                                 ippFuncNoHint(src.data, (int)src.step[0], sz, norm_array);
2230                 if( ret >= 0 )
2231                 {
2232                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
2233                     for( int i = 1; i < cn; i++ )
2234                     {
2235                         norm =
2236                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
2237                             normType == NORM_L1 ? norm + norm_array[i] :
2238                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
2239                             0;
2240                     }
2241                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
2242                 }
2243                 setIppErrorStatus();
2244             }
2245         }
2246     }
2247 #endif
2248
2249     if( src.isContinuous() && mask.empty() )
2250     {
2251         size_t len = src.total()*cn;
2252         if( len == (size_t)(int)len )
2253         {
2254             if( depth == CV_32F )
2255             {
2256                 const float* data = src.ptr<float>();
2257
2258                 if( normType == NORM_L2 )
2259                 {
2260                     double result = 0;
2261                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2262                     return std::sqrt(result);
2263                 }
2264                 if( normType == NORM_L2SQR )
2265                 {
2266                     double result = 0;
2267                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
2268                     return result;
2269                 }
2270                 if( normType == NORM_L1 )
2271                 {
2272                     double result = 0;
2273                     GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
2274                     return result;
2275                 }
2276                 if( normType == NORM_INF )
2277                 {
2278                     float result = 0;
2279                     GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
2280                     return result;
2281                 }
2282             }
2283             if( depth == CV_8U )
2284             {
2285                 const uchar* data = src.ptr<uchar>();
2286
2287                 if( normType == NORM_HAMMING )
2288                     return normHamming(data, (int)len);
2289
2290                 if( normType == NORM_HAMMING2 )
2291                     return normHamming(data, (int)len, 2);
2292             }
2293         }
2294     }
2295
2296     CV_Assert( mask.empty() || mask.type() == CV_8U );
2297
2298     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
2299     {
2300         if( !mask.empty() )
2301         {
2302             Mat temp;
2303             bitwise_and(src, mask, temp);
2304             return norm(temp, normType);
2305         }
2306         int cellSize = normType == NORM_HAMMING ? 1 : 2;
2307
2308         const Mat* arrays[] = {&src, 0};
2309         uchar* ptrs[1];
2310         NAryMatIterator it(arrays, ptrs);
2311         int total = (int)it.size;
2312         int result = 0;
2313
2314         for( size_t i = 0; i < it.nplanes; i++, ++it )
2315             result += normHamming(ptrs[0], total, cellSize);
2316
2317         return result;
2318     }
2319
2320     NormFunc func = getNormFunc(normType >> 1, depth);
2321     CV_Assert( func != 0 );
2322
2323     const Mat* arrays[] = {&src, &mask, 0};
2324     uchar* ptrs[2];
2325     union
2326     {
2327         double d;
2328         int i;
2329         float f;
2330     }
2331     result;
2332     result.d = 0;
2333     NAryMatIterator it(arrays, ptrs);
2334     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
2335     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
2336             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
2337     int isum = 0;
2338     int *ibuf = &result.i;
2339     size_t esz = 0;
2340
2341     if( blockSum )
2342     {
2343         intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
2344         blockSize = std::min(blockSize, intSumBlockSize);
2345         ibuf = &isum;
2346         esz = src.elemSize();
2347     }
2348
2349     for( size_t i = 0; i < it.nplanes; i++, ++it )
2350     {
2351         for( j = 0; j < total; j += blockSize )
2352         {
2353             int bsz = std::min(total - j, blockSize);
2354             func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
2355             count += bsz;
2356             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
2357             {
2358                 result.d += isum;
2359                 isum = 0;
2360                 count = 0;
2361             }
2362             ptrs[0] += bsz*esz;
2363             if( ptrs[1] )
2364                 ptrs[1] += bsz;
2365         }
2366     }
2367
2368     if( normType == NORM_INF )
2369     {
2370         if( depth == CV_64F )
2371             ;
2372         else if( depth == CV_32F )
2373             result.d = result.f;
2374         else
2375             result.d = result.i;
2376     }
2377     else if( normType == NORM_L2 )
2378         result.d = std::sqrt(result.d);
2379
2380     return result.d;
2381 }
2382
2383 #ifdef HAVE_OPENCL
2384
2385 namespace cv {
2386
2387 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
2388 {
2389     const ocl::Device & d = ocl::Device::getDefault();
2390     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
2391     bool doubleSupport = d.doubleFPConfig() > 0;
2392     bool relative = (normType & NORM_RELATIVE) != 0;
2393     normType &= ~NORM_RELATIVE;
2394
2395     if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
2396          (!doubleSupport && depth == CV_64F))
2397         return false;
2398
2399     int wdepth = std::max(CV_32S, depth);
2400     char cvt[50];
2401     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
2402                   format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
2403                          " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
2404                          ocl::typeToStr(wdepth), ocl::typeToStr(depth),
2405                          ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
2406                          doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
2407     if (k.empty())
2408         return false;
2409
2410     UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn));
2411     k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
2412            ocl::KernelArg::WriteOnly(diff, cn));
2413
2414     size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
2415     if (!k.run(2, globalsize, NULL, false))
2416         return false;
2417
2418     result = cv::norm(diff, normType, _mask);
2419     if (relative)
2420         result /= cv::norm(src2, normType, _mask) + DBL_EPSILON;
2421
2422     return true;
2423 }
2424
2425 }
2426
2427 #endif
2428
2429 double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
2430 {
2431     CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );
2432
2433 #ifdef HAVE_OPENCL
2434     double _result = 0;
2435     CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() &&
2436                 _src1.dims() <= 2 && _src2.dims() <= 2,
2437                 ocl_norm(_src1, _src2, normType, _mask, _result),
2438                 _result)
2439 #endif
2440
2441     if( normType & CV_RELATIVE )
2442     {
2443 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2444         Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
2445
2446         normType &= NORM_TYPE_MASK;
2447         CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
2448                 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
2449         size_t total_size = src1.total();
2450         int rows = src1.size[0], cols = (int)(total_size/rows);
2451         if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
2452             && cols > 0 && (size_t)rows*cols == total_size
2453             && (normType == NORM_INF || normType == NORM_L1 ||
2454                 normType == NORM_L2 || normType == NORM_L2SQR) )
2455         {
2456             IppiSize sz = { cols, rows };
2457             int type = src1.type();
2458             if( !mask.empty() )
2459             {
2460                 typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
2461                 ippiMaskNormRelFuncC1 ippFuncC1 =
2462                     normType == NORM_INF ?
2463                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR :
2464                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR :
2465                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR :
2466                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR :
2467                     0) :
2468                     normType == NORM_L1 ?
2469                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR :
2470                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR :
2471                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR :
2472                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR :
2473                     0) :
2474                     normType == NORM_L2 || normType == NORM_L2SQR ?
2475                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR :
2476                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR :
2477                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR :
2478                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR :
2479                     0) : 0;
2480                 if( ippFuncC1 )
2481                 {
2482                     Ipp64f norm;
2483                     if( ippFuncC1(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, &norm) >= 0 )
2484                         return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2485                     setIppErrorStatus();
2486                 }
2487             }
2488             else
2489             {
2490                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2491                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2492                 ippiNormRelFuncNoHint ippFuncNoHint =
2493                     normType == NORM_INF ?
2494                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
2495                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
2496                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
2497                     type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
2498                     0) :
2499                     normType == NORM_L1 ?
2500                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
2501                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
2502                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
2503                     0) :
2504                     normType == NORM_L2 || normType == NORM_L2SQR ?
2505                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
2506                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
2507                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
2508                     0) : 0;
2509                 ippiNormRelFuncHint ippFuncHint =
2510                     normType == NORM_L1 ?
2511                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
2512                     0) :
2513                     normType == NORM_L2 || normType == NORM_L2SQR ?
2514                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
2515                     0) : 0;
2516                 if (ippFuncNoHint)
2517                 {
2518                     Ipp64f norm;
2519                     if( ippFuncNoHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, &norm) >= 0 )
2520                         return (double)norm;
2521                     setIppErrorStatus();
2522                 }
2523                 if (ippFuncHint)
2524                 {
2525                     Ipp64f norm;
2526                     if( ippFuncHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 )
2527                         return (double)norm;
2528                     setIppErrorStatus();
2529                 }
2530             }
2531         }
2532 #endif
2533         return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
2534     }
2535
2536     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
2537     int depth = src1.depth(), cn = src1.channels();
2538
2539     normType &= 7;
2540     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
2541                normType == NORM_L2 || normType == NORM_L2SQR ||
2542               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
2543
2544 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
2545     size_t total_size = src1.total();
2546     int rows = src1.size[0], cols = (int)(total_size/rows);
2547     if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
2548         && cols > 0 && (size_t)rows*cols == total_size
2549         && (normType == NORM_INF || normType == NORM_L1 ||
2550             normType == NORM_L2 || normType == NORM_L2SQR) )
2551     {
2552         IppiSize sz = { cols, rows };
2553         int type = src1.type();
2554         if( !mask.empty() )
2555         {
2556             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
2557             ippiMaskNormDiffFuncC1 ippFuncC1 =
2558                 normType == NORM_INF ?
2559                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
2560                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR :
2561                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
2562                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
2563                 0) :
2564                 normType == NORM_L1 ?
2565                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
2566                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
2567                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
2568                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
2569                 0) :
2570                 normType == NORM_L2 || normType == NORM_L2SQR ?
2571                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
2572                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR :
2573                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
2574                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
2575                 0) : 0;
2576             if( ippFuncC1 )
2577             {
2578                 Ipp64f norm;
2579                 if( ippFuncC1(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, &norm) >= 0 )
2580                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2581                 setIppErrorStatus();
2582             }
2583             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
2584             ippiMaskNormDiffFuncC3 ippFuncC3 =
2585                 normType == NORM_INF ?
2586                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
2587                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR :
2588                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
2589                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
2590                 0) :
2591                 normType == NORM_L1 ?
2592                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
2593                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR :
2594                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
2595                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
2596                 0) :
2597                 normType == NORM_L2 || normType == NORM_L2SQR ?
2598                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
2599                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR :
2600                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
2601                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
2602                 0) : 0;
2603             if( ippFuncC3 )
2604             {
2605                 Ipp64f norm1, norm2, norm3;
2606                 if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
2607                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
2608                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
2609                 {
2610                     Ipp64f norm =
2611                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
2612                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
2613                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
2614                         0;
2615                     return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
2616                 }
2617                 setIppErrorStatus();
2618             }
2619         }
2620         else
2621         {
2622             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
2623             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
2624             ippiNormDiffFuncHint ippFuncHint =
2625                 normType == NORM_L1 ?
2626                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
2627                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
2628                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
2629                 0) :
2630                 normType == NORM_L2 || normType == NORM_L2SQR ?
2631                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
2632                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
2633                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
2634                 0) : 0;
2635             ippiNormDiffFuncNoHint ippFuncNoHint =
2636                 normType == NORM_INF ?
2637                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
2638                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
2639                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
2640                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
2641                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
2642                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
2643                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
2644 #if (IPP_VERSION_X100 >= 801)
2645                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2646                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
2647 #endif
2648                 type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
2649                 type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
2650                 type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
2651                 0) :
2652                 normType == NORM_L1 ?
2653                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
2654                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
2655                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
2656                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
2657                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
2658                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
2659                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
2660                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
2661                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
2662                 0) :
2663                 normType == NORM_L2 || normType == NORM_L2SQR ?
2664                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
2665                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
2666                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
2667                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
2668                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
2669                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
2670                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
2671                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
2672                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
2673                 0) : 0;
2674             // Make sure only zero or one version of the function pointer is valid
2675             CV_Assert(!ippFuncHint || !ippFuncNoHint);
2676             if( ippFuncHint || ippFuncNoHint )
2677             {
2678                 Ipp64f norm_array[4];
2679                 IppStatus ret = ippFuncHint ? ippFuncHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
2680                                 ippFuncNoHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, norm_array);
2681                 if( ret >= 0 )
2682                 {
2683                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
2684                     for( int i = 1; i < src1.channels(); i++ )
2685                     {
2686                         norm =
2687                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
2688                             normType == NORM_L1 ? norm + norm_array[i] :
2689                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
2690                             0;
2691                     }
2692                     return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
2693                 }
2694                 setIppErrorStatus();
2695             }
2696         }
2697     }
2698 #endif
2699
2700     if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
2701     {
2702         size_t len = src1.total()*src1.channels();
2703         if( len == (size_t)(int)len )
2704         {
2705             if( src1.depth() == CV_32F )
2706             {
2707                 const float* data1 = src1.ptr<float>();
2708                 const float* data2 = src2.ptr<float>();
2709
2710                 if( normType == NORM_L2 )
2711                 {
2712                     double result = 0;
2713                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
2714                     return std::sqrt(result);
2715                 }
2716                 if( normType == NORM_L2SQR )
2717                 {
2718                     double result = 0;
2719                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
2720                     return result;
2721                 }
2722                 if( normType == NORM_L1 )
2723                 {
2724                     double result = 0;
2725                     GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
2726                     return result;
2727                 }
2728                 if( normType == NORM_INF )
2729                 {
2730                     float result = 0;
2731                     GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
2732                     return result;
2733                 }
2734             }
2735         }
2736     }
2737
2738     CV_Assert( mask.empty() || mask.type() == CV_8U );
2739
2740     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
2741     {
2742         if( !mask.empty() )
2743         {
2744             Mat temp;
2745             bitwise_xor(src1, src2, temp);
2746             bitwise_and(temp, mask, temp);
2747             return norm(temp, normType);
2748         }
2749         int cellSize = normType == NORM_HAMMING ? 1 : 2;
2750
2751         const Mat* arrays[] = {&src1, &src2, 0};
2752         uchar* ptrs[2];
2753         NAryMatIterator it(arrays, ptrs);
2754         int total = (int)it.size;
2755         int result = 0;
2756
2757         for( size_t i = 0; i < it.nplanes; i++, ++it )
2758             result += normHamming(ptrs[0], ptrs[1], total, cellSize);
2759
2760         return result;
2761     }
2762
2763     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
2764     CV_Assert( func != 0 );
2765
2766     const Mat* arrays[] = {&src1, &src2, &mask, 0};
2767     uchar* ptrs[3];
2768     union
2769     {
2770         double d;
2771         float f;
2772         int i;
2773         unsigned u;
2774     }
2775     result;
2776     result.d = 0;
2777     NAryMatIterator it(arrays, ptrs);
2778     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
2779     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
2780             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
2781     unsigned isum = 0;
2782     unsigned *ibuf = &result.u;
2783     size_t esz = 0;
2784
2785     if( blockSum )
2786     {
2787         intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
2788         blockSize = std::min(blockSize, intSumBlockSize);
2789         ibuf = &isum;
2790         esz = src1.elemSize();
2791     }
2792
2793     for( size_t i = 0; i < it.nplanes; i++, ++it )
2794     {
2795         for( j = 0; j < total; j += blockSize )
2796         {
2797             int bsz = std::min(total - j, blockSize);
2798             func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
2799             count += bsz;
2800             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
2801             {
2802                 result.d += isum;
2803                 isum = 0;
2804                 count = 0;
2805             }
2806             ptrs[0] += bsz*esz;
2807             ptrs[1] += bsz*esz;
2808             if( ptrs[2] )
2809                 ptrs[2] += bsz;
2810         }
2811     }
2812
2813     if( normType == NORM_INF )
2814     {
2815         if( depth == CV_64F )
2816             ;
2817         else if( depth == CV_32F )
2818             result.d = result.f;
2819         else
2820             result.d = result.u;
2821     }
2822     else if( normType == NORM_L2 )
2823         result.d = std::sqrt(result.d);
2824
2825     return result.d;
2826 }
2827
2828
2829 ///////////////////////////////////// batch distance ///////////////////////////////////////
2830
2831 namespace cv
2832 {
2833
2834 template<typename _Tp, typename _Rt>
2835 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
2836                   int nvecs, int len, _Rt* dist, const uchar* mask)
2837 {
2838     step2 /= sizeof(src2[0]);
2839     if( !mask )
2840     {
2841         for( int i = 0; i < nvecs; i++ )
2842             dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
2843     }
2844     else
2845     {
2846         _Rt val0 = std::numeric_limits<_Rt>::max();
2847         for( int i = 0; i < nvecs; i++ )
2848             dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
2849     }
2850 }
2851
2852 template<typename _Tp, typename _Rt>
2853 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
2854                      int nvecs, int len, _Rt* dist, const uchar* mask)
2855 {
2856     step2 /= sizeof(src2[0]);
2857     if( !mask )
2858     {
2859         for( int i = 0; i < nvecs; i++ )
2860             dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
2861     }
2862     else
2863     {
2864         _Rt val0 = std::numeric_limits<_Rt>::max();
2865         for( int i = 0; i < nvecs; i++ )
2866             dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
2867     }
2868 }
2869
2870 template<typename _Tp, typename _Rt>
2871 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
2872                   int nvecs, int len, _Rt* dist, const uchar* mask)
2873 {
2874     step2 /= sizeof(src2[0]);
2875     if( !mask )
2876     {
2877         for( int i = 0; i < nvecs; i++ )
2878             dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
2879     }
2880     else
2881     {
2882         _Rt val0 = std::numeric_limits<_Rt>::max();
2883         for( int i = 0; i < nvecs; i++ )
2884             dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
2885     }
2886 }
2887
2888 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
2889                              int nvecs, int len, int* dist, const uchar* mask)
2890 {
2891     step2 /= sizeof(src2[0]);
2892     if( !mask )
2893     {
2894         for( int i = 0; i < nvecs; i++ )
2895             dist[i] = normHamming(src1, src2 + step2*i, len);
2896     }
2897     else
2898     {
2899         int val0 = INT_MAX;
2900         for( int i = 0; i < nvecs; i++ )
2901             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len) : val0;
2902     }
2903 }
2904
2905 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
2906                               int nvecs, int len, int* dist, const uchar* mask)
2907 {
2908     step2 /= sizeof(src2[0]);
2909     if( !mask )
2910     {
2911         for( int i = 0; i < nvecs; i++ )
2912             dist[i] = normHamming(src1, src2 + step2*i, len, 2);
2913     }
2914     else
2915     {
2916         int val0 = INT_MAX;
2917         for( int i = 0; i < nvecs; i++ )
2918             dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len, 2) : val0;
2919     }
2920 }
2921
2922 static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
2923                                int nvecs, int len, int* dist, const uchar* mask)
2924 {
2925     batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
2926 }
2927
2928 static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
2929                                int nvecs, int len, float* dist, const uchar* mask)
2930 {
2931     batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
2932 }
2933
2934 static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
2935                                   int nvecs, int len, int* dist, const uchar* mask)
2936 {
2937     batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
2938 }
2939
2940 static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
2941                                   int nvecs, int len, float* dist, const uchar* mask)
2942 {
2943     batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
2944 }
2945
2946 static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
2947                                int nvecs, int len, float* dist, const uchar* mask)
2948 {
2949     batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
2950 }
2951
2952 static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
2953                              int nvecs, int len, float* dist, const uchar* mask)
2954 {
2955     batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
2956 }
2957
2958 static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
2959                                 int nvecs, int len, float* dist, const uchar* mask)
2960 {
2961     batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
2962 }
2963
2964 static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
2965                              int nvecs, int len, float* dist, const uchar* mask)
2966 {
2967     batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
2968 }
2969
2970 typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
2971                               int nvecs, int len, uchar* dist, const uchar* mask);
2972
2973
2974 struct BatchDistInvoker : public ParallelLoopBody
2975 {
2976     BatchDistInvoker( const Mat& _src1, const Mat& _src2,
2977                       Mat& _dist, Mat& _nidx, int _K,
2978                       const Mat& _mask, int _update,
2979                       BatchDistFunc _func)
2980     {
2981         src1 = &_src1;
2982         src2 = &_src2;
2983         dist = &_dist;
2984         nidx = &_nidx;
2985         K = _K;
2986         mask = &_mask;
2987         update = _update;
2988         func = _func;
2989     }
2990
2991     void operator()(const Range& range) const
2992     {
2993         AutoBuffer<int> buf(src2->rows);
2994         int* bufptr = buf;
2995
2996         for( int i = range.start; i < range.end; i++ )
2997         {
2998             func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
2999                  K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);
3000
3001             if( K > 0 )
3002             {
3003                 int* nidxptr = nidx->ptr<int>(i);
3004                 // since positive float's can be compared just like int's,
3005                 // we handle both CV_32S and CV_32F cases with a single branch
3006                 int* distptr = (int*)dist->ptr(i);
3007
3008                 int j, k;
3009
3010                 for( j = 0; j < src2->rows; j++ )
3011                 {
3012                     int d = bufptr[j];
3013                     if( d < distptr[K-1] )
3014                     {
3015                         for( k = K-2; k >= 0 && distptr[k] > d; k-- )
3016                         {
3017                             nidxptr[k+1] = nidxptr[k];
3018                             distptr[k+1] = distptr[k];
3019                         }
3020                         nidxptr[k+1] = j + update;
3021                         distptr[k+1] = d;
3022                     }
3023                 }
3024             }
3025         }
3026     }
3027
3028     const Mat *src1;
3029     const Mat *src2;
3030     Mat *dist;
3031     Mat *nidx;
3032     const Mat *mask;
3033     int K;
3034     int update;
3035     BatchDistFunc func;
3036 };
3037
3038 }
3039
3040 void cv::batchDistance( InputArray _src1, InputArray _src2,
3041                         OutputArray _dist, int dtype, OutputArray _nidx,
3042                         int normType, int K, InputArray _mask,
3043                         int update, bool crosscheck )
3044 {
3045     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3046     int type = src1.type();
3047     CV_Assert( type == src2.type() && src1.cols == src2.cols &&
3048                (type == CV_32F || type == CV_8U));
3049     CV_Assert( _nidx.needed() == (K > 0) );
3050
3051     if( dtype == -1 )
3052     {
3053         dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
3054     }
3055     CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);
3056
3057     K = std::min(K, src2.rows);
3058
3059     _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
3060     Mat dist = _dist.getMat(), nidx;
3061     if( _nidx.needed() )
3062     {
3063         _nidx.create(dist.size(), CV_32S);
3064         nidx = _nidx.getMat();
3065     }
3066
3067     if( update == 0 && K > 0 )
3068     {
3069         dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
3070         nidx = Scalar::all(-1);
3071     }
3072
3073     if( crosscheck )
3074     {
3075         CV_Assert( K == 1 && update == 0 && mask.empty() );
3076         Mat tdist, tidx;
3077         batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);
3078
3079         // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
3080         // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
3081         // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
3082         // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
3083         // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
3084         // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
3085         if( dtype == CV_32S )
3086         {
3087             for( int i = 0; i < tdist.rows; i++ )
3088             {
3089                 int idx = tidx.at<int>(i);
3090                 int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
3091                 if( d < d0 )
3092                 {
3093                     dist.at<int>(idx) = d;
3094                     nidx.at<int>(idx) = i + update;
3095                 }
3096             }
3097         }
3098         else
3099         {
3100             for( int i = 0; i < tdist.rows; i++ )
3101             {
3102                 int idx = tidx.at<int>(i);
3103                 float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
3104                 if( d < d0 )
3105                 {
3106                     dist.at<float>(idx) = d;
3107                     nidx.at<int>(idx) = i + update;
3108                 }
3109             }
3110         }
3111         return;
3112     }
3113
3114     BatchDistFunc func = 0;
3115     if( type == CV_8U )
3116     {
3117         if( normType == NORM_L1 && dtype == CV_32S )
3118             func = (BatchDistFunc)batchDistL1_8u32s;
3119         else if( normType == NORM_L1 && dtype == CV_32F )
3120             func = (BatchDistFunc)batchDistL1_8u32f;
3121         else if( normType == NORM_L2SQR && dtype == CV_32S )
3122             func = (BatchDistFunc)batchDistL2Sqr_8u32s;
3123         else if( normType == NORM_L2SQR && dtype == CV_32F )
3124             func = (BatchDistFunc)batchDistL2Sqr_8u32f;
3125         else if( normType == NORM_L2 && dtype == CV_32F )
3126             func = (BatchDistFunc)batchDistL2_8u32f;
3127         else if( normType == NORM_HAMMING && dtype == CV_32S )
3128             func = (BatchDistFunc)batchDistHamming;
3129         else if( normType == NORM_HAMMING2 && dtype == CV_32S )
3130             func = (BatchDistFunc)batchDistHamming2;
3131     }
3132     else if( type == CV_32F && dtype == CV_32F )
3133     {
3134         if( normType == NORM_L1 )
3135             func = (BatchDistFunc)batchDistL1_32f;
3136         else if( normType == NORM_L2SQR )
3137             func = (BatchDistFunc)batchDistL2Sqr_32f;
3138         else if( normType == NORM_L2 )
3139             func = (BatchDistFunc)batchDistL2_32f;
3140     }
3141
3142     if( func == 0 )
3143         CV_Error_(CV_StsUnsupportedFormat,
3144                   ("The combination of type=%d, dtype=%d and normType=%d is not supported",
3145                    type, dtype, normType));
3146
3147     parallel_for_(Range(0, src1.rows),
3148                   BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
3149 }
3150
3151
3152 void cv::findNonZero( InputArray _src, OutputArray _idx )
3153 {
3154     Mat src = _src.getMat();
3155     CV_Assert( src.type() == CV_8UC1 );
3156     int n = countNonZero(src);
3157     if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
3158         _idx.release();
3159     _idx.create(n, 1, CV_32SC2);
3160     Mat idx = _idx.getMat();
3161     CV_Assert(idx.isContinuous());
3162     Point* idx_ptr = (Point*)idx.data;
3163
3164     for( int i = 0; i < src.rows; i++ )
3165     {
3166         const uchar* bin_ptr = src.ptr(i);
3167         for( int j = 0; j < src.cols; j++ )
3168             if( bin_ptr[j] )
3169                 *idx_ptr++ = Point(j, i);
3170     }
3171 }
3172
3173 double cv::PSNR(InputArray _src1, InputArray _src2)
3174 {
3175     CV_Assert( _src1.depth() == CV_8U );
3176     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
3177     return 20*log10(255./(diff+DBL_EPSILON));
3178 }
3179
3180
3181 CV_IMPL CvScalar cvSum( const CvArr* srcarr )
3182 {
3183     cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
3184     if( CV_IS_IMAGE(srcarr) )
3185     {
3186         int coi = cvGetImageCOI((IplImage*)srcarr);
3187         if( coi )
3188         {
3189             CV_Assert( 0 < coi && coi <= 4 );
3190             sum = cv::Scalar(sum[coi-1]);
3191         }
3192     }
3193     return sum;
3194 }
3195
3196 CV_IMPL int cvCountNonZero( const CvArr* imgarr )
3197 {
3198     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3199     if( img.channels() > 1 )
3200         cv::extractImageCOI(imgarr, img);
3201     return countNonZero(img);
3202 }
3203
3204
3205 CV_IMPL  CvScalar
3206 cvAvg( const void* imgarr, const void* maskarr )
3207 {
3208     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
3209     cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
3210     if( CV_IS_IMAGE(imgarr) )
3211     {
3212         int coi = cvGetImageCOI((IplImage*)imgarr);
3213         if( coi )
3214         {
3215             CV_Assert( 0 < coi && coi <= 4 );
3216             mean = cv::Scalar(mean[coi-1]);
3217         }
3218     }
3219     return mean;
3220 }
3221
3222
3223 CV_IMPL  void
3224 cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
3225 {
3226     cv::Scalar mean, sdv;
3227
3228     cv::Mat mask;
3229     if( maskarr )
3230         mask = cv::cvarrToMat(maskarr);
3231
3232     cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );
3233
3234     if( CV_IS_IMAGE(imgarr) )
3235     {
3236         int coi = cvGetImageCOI((IplImage*)imgarr);
3237         if( coi )
3238         {
3239             CV_Assert( 0 < coi && coi <= 4 );
3240             mean = cv::Scalar(mean[coi-1]);
3241             sdv = cv::Scalar(sdv[coi-1]);
3242         }
3243     }
3244
3245     if( _mean )
3246         *(cv::Scalar*)_mean = mean;
3247     if( _sdv )
3248         *(cv::Scalar*)_sdv = sdv;
3249 }
3250
3251
3252 CV_IMPL void
3253 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
3254              CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
3255 {
3256     cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
3257     if( maskarr )
3258         mask = cv::cvarrToMat(maskarr);
3259     if( img.channels() > 1 )
3260         cv::extractImageCOI(imgarr, img);
3261
3262     cv::minMaxLoc( img, _minVal, _maxVal,
3263                    (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
3264 }
3265
3266
3267 CV_IMPL  double
3268 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
3269 {
3270     cv::Mat a, mask;
3271     if( !imgA )
3272     {
3273         imgA = imgB;
3274         imgB = 0;
3275     }
3276
3277     a = cv::cvarrToMat(imgA, false, true, 1);
3278     if( maskarr )
3279         mask = cv::cvarrToMat(maskarr);
3280
3281     if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
3282         cv::extractImageCOI(imgA, a);
3283
3284     if( !imgB )
3285         return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);
3286
3287     cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
3288     if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
3289         cv::extractImageCOI(imgB, b);
3290
3291     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
3292 }