modules/core/src/norm.cpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html
   4
   5
   6 #include "precomp.hpp"
   7 #include "opencl_kernels_core.hpp"
   8 #include "stat.hpp"
   9
  10 /****************************************************************************************\
  11 *                                         norm                                           *
  12 \****************************************************************************************/
  13
  14 namespace cv { namespace hal {
  15
  16 extern const uchar popCountTable[256] =
  17 {
  18     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  19     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  20     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  21     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  22     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  23     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  24     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  25     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
  26 };
  27
  28 static const uchar popCountTable2[] =
  29 {
  30     0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
  31     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
  32     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
  33     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
  34     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
  35     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
  36     1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
  37     2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
  38 };
  39
  40 static const uchar popCountTable4[] =
  41 {
  42     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  43     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  44     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  45     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  46     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  47     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  48     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  49     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  50 };
  51
  52
  53 int normHamming(const uchar* a, int n, int cellSize)
  54 {
  55     if( cellSize == 1 )
  56         return normHamming(a, n);
  57     const uchar* tab = 0;
  58     if( cellSize == 2 )
  59         tab = popCountTable2;
  60     else if( cellSize == 4 )
  61         tab = popCountTable4;
  62     else
  63         return -1;
  64     int i = 0;
  65     int result = 0;
  66 #if CV_SIMD
  67     v_uint64 t = vx_setzero_u64();
  68     if ( cellSize == 2)
  69     {
  70         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
  71         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
  72         {
  73             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
  74             t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
  75         }
  76     }
  77     else    // cellSize == 4
  78     {
  79         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
  80         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
  81         {
  82             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
  83             v_uint16 a1 = a0 | (a0 >> 2);
  84             t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
  85
  86         }
  87     }
  88     result += (int)v_reduce_sum(t);
  89     vx_cleanup();
  90 #elif CV_ENABLE_UNROLLED
  91     for( ; i <= n - 4; i += 4 )
  92         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
  93 #endif
  94     for( ; i < n; i++ )
  95         result += tab[a[i]];
  96     return result;
  97 }
  98
  99 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 100 {
 101     if( cellSize == 1 )
 102         return normHamming(a, b, n);
 103     const uchar* tab = 0;
 104     if( cellSize == 2 )
 105         tab = popCountTable2;
 106     else if( cellSize == 4 )
 107         tab = popCountTable4;
 108     else
 109         return -1;
 110     int i = 0;
 111     int result = 0;
 112 #if CV_SIMD
 113     v_uint64 t = vx_setzero_u64();
 114     if ( cellSize == 2)
 115     {
 116         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
 117         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
 118         {
 119             v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
 120             t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
 121         }
 122     }
 123     else    // cellSize == 4
 124     {
 125         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
 126         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
 127         {
 128             v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
 129             v_uint16 ab1 = ab0 | (ab0 >> 2);
 130             t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
 131         }
 132     }
 133     result += (int)v_reduce_sum(t);
 134     vx_cleanup();
 135 #elif CV_ENABLE_UNROLLED
 136     for( ; i <= n - 4; i += 4 )
 137         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
 138                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
 139 #endif
 140     for( ; i < n; i++ )
 141         result += tab[a[i] ^ b[i]];
 142     return result;
 143 }
 144
 145 float normL2Sqr_(const float* a, const float* b, int n)
 146 {
 147     int j = 0; float d = 0.f;
 148 #if CV_SIMD
 149     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
 150     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
 151     for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
 152     {
 153         v_float32 t0 = vx_load(a + j) - vx_load(b + j);
 154         v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
 155         v_d0 = v_muladd(t0, t0, v_d0);
 156         v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
 157         v_d1 = v_muladd(t1, t1, v_d1);
 158         v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
 159         v_d2 = v_muladd(t2, t2, v_d2);
 160         v_d3 = v_muladd(t3, t3, v_d3);
 161     }
 162     d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
 163 #endif
 164     for( ; j < n; j++ )
 165     {
 166         float t = a[j] - b[j];
 167         d += t*t;
 168     }
 169     return d;
 170 }
 171
 172
 173 float normL1_(const float* a, const float* b, int n)
 174 {
 175     int j = 0; float d = 0.f;
 176 #if CV_SIMD
 177     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
 178     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
 179     for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
 180     {
 181         v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
 182         v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
 183         v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
 184         v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
 185     }
 186     d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
 187 #endif
 188     for( ; j < n; j++ )
 189         d += std::abs(a[j] - b[j]);
 190     return d;
 191 }
 192
 193 int normL1_(const uchar* a, const uchar* b, int n)
 194 {
 195     int j = 0, d = 0;
 196 #if CV_SIMD
 197     for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
 198         d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
 199              v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
 200              v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
 201              v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
 202 #endif
 203     for( ; j < n; j++ )
 204         d += std::abs(a[j] - b[j]);
 205     return d;
 206 }
 207
 208 } //cv::hal
 209
 210 //==================================================================================================
 211
 212 template<typename T, typename ST> int
 213 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 214 {
 215     ST result = *_result;
 216     if( !mask )
 217     {
 218         result = std::max(result, normInf<T, ST>(src, len*cn));
 219     }
 220     else
 221     {
 222         for( int i = 0; i < len; i++, src += cn )
 223             if( mask[i] )
 224             {
 225                 for( int k = 0; k < cn; k++ )
 226                     result = std::max(result, ST(cv_abs(src[k])));
 227             }
 228     }
 229     *_result = result;
 230     return 0;
 231 }
 232
 233 template<typename T, typename ST> int
 234 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 235 {
 236     ST result = *_result;
 237     if( !mask )
 238     {
 239         result += normL1<T, ST>(src, len*cn);
 240     }
 241     else
 242     {
 243         for( int i = 0; i < len; i++, src += cn )
 244             if( mask[i] )
 245             {
 246                 for( int k = 0; k < cn; k++ )
 247                     result += cv_abs(src[k]);
 248             }
 249     }
 250     *_result = result;
 251     return 0;
 252 }
 253
 254 template<typename T, typename ST> int
 255 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 256 {
 257     ST result = *_result;
 258     if( !mask )
 259     {
 260         result += normL2Sqr<T, ST>(src, len*cn);
 261     }
 262     else
 263     {
 264         for( int i = 0; i < len; i++, src += cn )
 265             if( mask[i] )
 266             {
 267                 for( int k = 0; k < cn; k++ )
 268                 {
 269                     T v = src[k];
 270                     result += (ST)v*v;
 271                 }
 272             }
 273     }
 274     *_result = result;
 275     return 0;
 276 }
 277
 278 template<typename T, typename ST> int
 279 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
 280 {
 281     ST result = *_result;
 282     if( !mask )
 283     {
 284         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
 285     }
 286     else
 287     {
 288         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
 289             if( mask[i] )
 290             {
 291                 for( int k = 0; k < cn; k++ )
 292                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
 293             }
 294     }
 295     *_result = result;
 296     return 0;
 297 }
 298
 299 template<typename T, typename ST> int
 300 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
 301 {
 302     ST result = *_result;
 303     if( !mask )
 304     {
 305         result += normL1<T, ST>(src1, src2, len*cn);
 306     }
 307     else
 308     {
 309         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
 310             if( mask[i] )
 311             {
 312                 for( int k = 0; k < cn; k++ )
 313                     result += std::abs(src1[k] - src2[k]);
 314             }
 315     }
 316     *_result = result;
 317     return 0;
 318 }
 319
 320 template<typename T, typename ST> int
 321 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
 322 {
 323     ST result = *_result;
 324     if( !mask )
 325     {
 326         result += normL2Sqr<T, ST>(src1, src2, len*cn);
 327     }
 328     else
 329     {
 330         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
 331             if( mask[i] )
 332             {
 333                 for( int k = 0; k < cn; k++ )
 334                 {
 335                     ST v = src1[k] - src2[k];
 336                     result += v*v;
 337                 }
 338             }
 339     }
 340     *_result = result;
 341     return 0;
 342 }
 343
 344 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
 345     static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
 346 { return norm##L##_(src, mask, r, len, cn); } \
 347     static int normDiff##L##_##suffix(const type* src1, const type* src2, \
 348     const uchar* mask, ntype* r, int len, int cn) \
 349 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
 350
 351 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
 352     CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
 353     CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
 354     CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
 355
 356 CV_DEF_NORM_ALL(8u, uchar, int, int, int)
 357 CV_DEF_NORM_ALL(8s, schar, int, int, int)
 358 CV_DEF_NORM_ALL(16u, ushort, int, int, double)
 359 CV_DEF_NORM_ALL(16s, short, int, int, double)
 360 CV_DEF_NORM_ALL(32s, int, int, double, double)
 361 CV_DEF_NORM_ALL(32f, float, float, double, double)
 362 CV_DEF_NORM_ALL(64f, double, double, double, double)
 363
 364
 365 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
 366 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
 367
 368 static NormFunc getNormFunc(int normType, int depth)
 369 {
 370     static NormFunc normTab[3][8] =
 371     {
 372         {
 373             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
 374             (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
 375         },
 376         {
 377             (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
 378             (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
 379         },
 380         {
 381             (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
 382             (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
 383         }
 384     };
 385
 386     return normTab[normType][depth];
 387 }
 388
 389 static NormDiffFunc getNormDiffFunc(int normType, int depth)
 390 {
 391     static NormDiffFunc normDiffTab[3][8] =
 392     {
 393         {
 394             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
 395             (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
 396             (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
 397             (NormDiffFunc)normDiffInf_64f, 0
 398         },
 399         {
 400             (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
 401             (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
 402             (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
 403             (NormDiffFunc)normDiffL1_64f, 0
 404         },
 405         {
 406             (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
 407             (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
 408             (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
 409             (NormDiffFunc)normDiffL2_64f, 0
 410         }
 411     };
 412
 413     return normDiffTab[normType][depth];
 414 }
 415
 416 #ifdef HAVE_OPENCL
 417
 418 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
 419 {
 420     const ocl::Device & d = ocl::Device::getDefault();
 421
 422 #ifdef __ANDROID__
 423     if (d.isNVidia())
 424         return false;
 425 #endif
 426     const int cn = _src.channels();
 427     if (cn > 4)
 428         return false;
 429     int type = _src.type(), depth = CV_MAT_DEPTH(type);
 430     bool doubleSupport = d.doubleFPConfig() > 0,
 431             haveMask = _mask.kind() != _InputArray::NONE;
 432
 433     if (depth >= CV_16F)
 434         return false;  // TODO: support FP16
 435
 436     if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
 437          (!doubleSupport && depth == CV_64F))
 438         return false;
 439
 440     UMat src = _src.getUMat();
 441
 442     if (normType == NORM_INF)
 443     {
 444         if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
 445                            std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
 446             return false;
 447     }
 448     else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
 449     {
 450         Scalar sc;
 451         bool unstype = depth == CV_8U || depth == CV_16U;
 452
 453         if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
 454                     OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
 455             return false;
 456
 457         double s = 0.0;
 458         for (int i = 0; i < (haveMask ? cn : 1); ++i)
 459             s += sc[i];
 460
 461         result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
 462     }
 463
 464     return true;
 465 }
 466
 467 #endif
 468
 469 #ifdef HAVE_IPP
 470 static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
 471 {
 472     CV_INSTRUMENT_REGION_IPP();
 473
 474 #if IPP_VERSION_X100 >= 700
 475     size_t total_size = src.total();
 476     int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
 477
 478     if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
 479         && cols > 0 && (size_t)rows*cols == total_size )
 480     {
 481         if( !mask.empty() )
 482         {
 483             IppiSize sz = { cols, rows };
 484             int type = src.type();
 485
 486             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
 487             ippiMaskNormFuncC1 ippiNorm_C1MR =
 488                 normType == NORM_INF ?
 489                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
 490                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
 491                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
 492                 0) :
 493             normType == NORM_L1 ?
 494                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
 495                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
 496                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
 497                 0) :
 498             normType == NORM_L2 || normType == NORM_L2SQR ?
 499                 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
 500                 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
 501                 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
 502                 0) : 0;
 503             if( ippiNorm_C1MR )
 504             {
 505                 Ipp64f norm;
 506                 if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
 507                 {
 508                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
 509                     return true;
 510                 }
 511             }
 512             typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
 513             ippiMaskNormFuncC3 ippiNorm_C3CMR =
 514                 normType == NORM_INF ?
 515                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
 516                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
 517                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
 518                 0) :
 519             normType == NORM_L1 ?
 520                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
 521                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
 522                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
 523                 0) :
 524             normType == NORM_L2 || normType == NORM_L2SQR ?
 525                 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
 526                 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
 527                 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
 528                 0) : 0;
 529             if( ippiNorm_C3CMR )
 530             {
 531                 Ipp64f norm1, norm2, norm3;
 532                 if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
 533                     CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
 534                     CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
 535                 {
 536                     Ipp64f norm =
 537                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
 538                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
 539                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
 540                         0;
 541                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
 542                     return true;
 543                 }
 544             }
 545         }
 546         else
 547         {
 548             IppiSize sz = { cols*src.channels(), rows };
 549             int type = src.depth();
 550
 551             typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
 552             typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
 553             ippiNormFuncHint ippiNormHint =
 554                 normType == NORM_L1 ?
 555                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
 556                 0) :
 557                 normType == NORM_L2 || normType == NORM_L2SQR ?
 558                 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
 559                 0) : 0;
 560             ippiNormFuncNoHint ippiNorm =
 561                 normType == NORM_INF ?
 562                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
 563                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
 564                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
 565                 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
 566                 0) :
 567                 normType == NORM_L1 ?
 568                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
 569                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
 570                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
 571                 0) :
 572                 normType == NORM_L2 || normType == NORM_L2SQR ?
 573                 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
 574                 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
 575                 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
 576                 0) : 0;
 577             if( ippiNormHint || ippiNorm )
 578             {
 579                 Ipp64f norm;
 580                 IppStatus ret = ippiNormHint ? CV_INSTRUMENT_FUN_IPP(ippiNormHint, src.ptr(), (int)src.step[0], sz, &norm, ippAlgHintAccurate) :
 581                                 CV_INSTRUMENT_FUN_IPP(ippiNorm, src.ptr(), (int)src.step[0], sz, &norm);
 582                 if( ret >= 0 )
 583                 {
 584                     result = (normType == NORM_L2SQR) ? norm * norm : norm;
 585                     return true;
 586                 }
 587             }
 588         }
 589     }
 590 #else
 591     CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
 592 #endif
 593     return false;
 594 }  // ipp_norm()
 595 #endif  // HAVE_IPP
 596
 597 double norm( InputArray _src, int normType, InputArray _mask )
 598 {
 599     CV_INSTRUMENT_REGION();
 600
 601     normType &= NORM_TYPE_MASK;
 602     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
 603                normType == NORM_L2 || normType == NORM_L2SQR ||
 604                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
 605
 606 #if defined HAVE_OPENCL || defined HAVE_IPP
 607     double _result = 0;
 608 #endif
 609
 610 #ifdef HAVE_OPENCL
 611     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
 612                 ocl_norm(_src, normType, _mask, _result),
 613                 _result)
 614 #endif
 615
 616     Mat src = _src.getMat(), mask = _mask.getMat();
 617     CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);
 618
 619     int depth = src.depth(), cn = src.channels();
 620     if( src.isContinuous() && mask.empty() )
 621     {
 622         size_t len = src.total()*cn;
 623         if( len == (size_t)(int)len )
 624         {
 625             if( depth == CV_32F )
 626             {
 627                 const float* data = src.ptr<float>();
 628
 629                 if( normType == NORM_L2 )
 630                 {
 631                     double result = 0;
 632                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
 633                     return std::sqrt(result);
 634                 }
 635                 if( normType == NORM_L2SQR )
 636                 {
 637                     double result = 0;
 638                     GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
 639                     return result;
 640                 }
 641                 if( normType == NORM_L1 )
 642                 {
 643                     double result = 0;
 644                     GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
 645                     return result;
 646                 }
 647                 if( normType == NORM_INF )
 648                 {
 649                     float result = 0;
 650                     GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
 651                     return result;
 652                 }
 653             }
 654             if( depth == CV_8U )
 655             {
 656                 const uchar* data = src.ptr<uchar>();
 657
 658                 if( normType == NORM_HAMMING )
 659                 {
 660                     return hal::normHamming(data, (int)len);
 661                 }
 662
 663                 if( normType == NORM_HAMMING2 )
 664                 {
 665                     return hal::normHamming(data, (int)len, 2);
 666                 }
 667             }
 668         }
 669     }
 670
 671     CV_Assert( mask.empty() || mask.type() == CV_8U );
 672
 673     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
 674     {
 675         if( !mask.empty() )
 676         {
 677             Mat temp;
 678             bitwise_and(src, mask, temp);
 679             return norm(temp, normType);
 680         }
 681         int cellSize = normType == NORM_HAMMING ? 1 : 2;
 682
 683         const Mat* arrays[] = {&src, 0};
 684         uchar* ptrs[1] = {};
 685         NAryMatIterator it(arrays, ptrs);
 686         int total = (int)it.size;
 687         int result = 0;
 688
 689         for( size_t i = 0; i < it.nplanes; i++, ++it )
 690         {
 691             result += hal::normHamming(ptrs[0], total, cellSize);
 692         }
 693
 694         return result;
 695     }
 696
 697     NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
 698     CV_Assert( func != 0 );
 699
 700     const Mat* arrays[] = {&src, &mask, 0};
 701     uchar* ptrs[2] = {};
 702     union
 703     {
 704         double d;
 705         int i;
 706         float f;
 707     }
 708     result;
 709     result.d = 0;
 710     NAryMatIterator it(arrays, ptrs);
 711     CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
 712
 713     if ((normType == NORM_L1 && depth <= CV_16S) ||
 714         ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
 715     {
 716         // special case to handle "integer" overflow in accumulator
 717         const size_t esz = src.elemSize();
 718         const int total = (int)it.size;
 719         const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
 720         const int blockSize = std::min(total, intSumBlockSize);
 721         int isum = 0;
 722         int count = 0;
 723
 724         for (size_t i = 0; i < it.nplanes; i++, ++it)
 725         {
 726             for (int j = 0; j < total; j += blockSize)
 727             {
 728                 int bsz = std::min(total - j, blockSize);
 729                 func(ptrs[0], ptrs[1], (uchar*)&isum, bsz, cn);
 730                 count += bsz;
 731                 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
 732                 {
 733                     result.d += isum;
 734                     isum = 0;
 735                     count = 0;
 736                 }
 737                 ptrs[0] += bsz*esz;
 738                 if (ptrs[1])
 739                     ptrs[1] += bsz;
 740             }
 741         }
 742     }
 743     else if (depth == CV_16F)
 744     {
 745         const size_t esz = src.elemSize();
 746         const int total = (int)it.size;
 747         const int blockSize = std::min(total, divUp(1024, cn));
 748         AutoBuffer<float, 1026/*divUp(1024,3)*3*/> fltbuf(blockSize * cn);
 749         float* data0 = fltbuf.data();
 750         for (size_t i = 0; i < it.nplanes; i++, ++it)
 751         {
 752             for (int j = 0; j < total; j += blockSize)
 753             {
 754                 int bsz = std::min(total - j, blockSize);
 755                 hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
 756                 func((uchar*)data0, ptrs[1], (uchar*)&result.d, bsz, cn);
 757                 ptrs[0] += bsz*esz;
 758                 if (ptrs[1])
 759                     ptrs[1] += bsz;
 760             }
 761         }
 762     }
 763     else
 764     {
 765         // generic implementation
 766         for (size_t i = 0; i < it.nplanes; i++, ++it)
 767         {
 768             func(ptrs[0], ptrs[1], (uchar*)&result, (int)it.size, cn);
 769         }
 770     }
 771
 772     if( normType == NORM_INF )
 773     {
 774         if(depth == CV_64F || depth == CV_16F)
 775             return result.d;
 776         else if (depth == CV_32F)
 777             return result.f;
 778         else
 779             return result.i;
 780     }
 781     else if( normType == NORM_L2 )
 782         return std::sqrt(result.d);
 783
 784     return result.d;
 785 }
 786
 787 //==================================================================================================
 788
 789 #ifdef HAVE_OPENCL
 790 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
 791 {
 792 #ifdef __ANDROID__
 793     if (ocl::Device::getDefault().isNVidia())
 794         return false;
 795 #endif
 796
 797     Scalar sc1, sc2;
 798     int cn = _src1.channels();
 799     if (cn > 4)
 800         return false;
 801     int type = _src1.type(), depth = CV_MAT_DEPTH(type);
 802     bool relative = (normType & NORM_RELATIVE) != 0;
 803     normType &= ~NORM_RELATIVE;
 804     bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
 805
 806 #ifdef __APPLE__
 807     if(normType == NORM_L1 && type == CV_16UC3 && !_mask.empty())
 808         return false;
 809 #endif
 810
 811     if (normsum)
 812     {
 813         if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
 814                      OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
 815             return false;
 816     }
 817     else
 818     {
 819         if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
 820                            false, _src2, relative ? &sc2[0] : NULL))
 821             return false;
 822         cn = 1;
 823     }
 824
 825     double s2 = 0;
 826     for (int i = 0; i < cn; ++i)
 827     {
 828         result += sc1[i];
 829         if (relative)
 830             s2 += sc2[i];
 831     }
 832
 833     if (normType == NORM_L2)
 834     {
 835         result = std::sqrt(result);
 836         if (relative)
 837             s2 = std::sqrt(s2);
 838     }
 839
 840     if (relative)
 841         result /= (s2 + DBL_EPSILON);
 842
 843     return true;
 844 }  // ocl_norm()
 845 #endif  // HAVE_OPENCL
 846
 847 #ifdef HAVE_IPP
 848 static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
 849 {
 850     CV_INSTRUMENT_REGION_IPP();
 851
 852 #if IPP_VERSION_X100 >= 700
 853     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
 854
 855     if( normType & CV_RELATIVE )
 856     {
 857         normType &= NORM_TYPE_MASK;
 858
 859         size_t total_size = src1.total();
 860         int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
 861         if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
 862             && cols > 0 && (size_t)rows*cols == total_size )
 863         {
 864             if( !mask.empty() )
 865             {
 866                 IppiSize sz = { cols, rows };
 867                 int type = src1.type();
 868
 869                 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
 870                 ippiMaskNormDiffFuncC1 ippiNormRel_C1MR =
 871                     normType == NORM_INF ?
 872                     (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_8u_C1MR :
 873                     type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_16u_C1MR :
 874                     type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_32f_C1MR :
 875                     0) :
 876                     normType == NORM_L1 ?
 877                     (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_8u_C1MR :
 878                     type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_16u_C1MR :
 879                     type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_32f_C1MR :
 880                     0) :
 881                     normType == NORM_L2 || normType == NORM_L2SQR ?
 882                     (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_8u_C1MR :
 883                     type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_16u_C1MR :
 884                     type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_32f_C1MR :
 885                     0) : 0;
 886                 if( ippiNormRel_C1MR )
 887                 {
 888                     Ipp64f norm;
 889                     if( CV_INSTRUMENT_FUN_IPP(ippiNormRel_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
 890                     {
 891                         result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
 892                         return true;
 893                     }
 894                 }
 895             }
 896             else
 897             {
 898                 IppiSize sz = { cols*src1.channels(), rows };
 899                 int type = src1.depth();
 900
 901                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
 902                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
 903                 ippiNormRelFuncHint ippiNormRelHint =
 904                     normType == NORM_L1 ?
 905                     (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
 906                     0) :
 907                     normType == NORM_L2 || normType == NORM_L2SQR ?
 908                     (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
 909                     0) : 0;
 910                 ippiNormRelFuncNoHint ippiNormRel =
 911                     normType == NORM_INF ?
 912                     (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
 913                     type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
 914                     type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
 915                     type == CV_32F ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
 916                     0) :
 917                     normType == NORM_L1 ?
 918                     (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
 919                     type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
 920                     type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
 921                     0) :
 922                     normType == NORM_L2 || normType == NORM_L2SQR ?
 923                     (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
 924                     type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
 925                     type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
 926                     0) : 0;
 927                 if( ippiNormRelHint || ippiNormRel )
 928                 {
 929                     Ipp64f norm;
 930                     IppStatus ret = ippiNormRelHint ? CV_INSTRUMENT_FUN_IPP(ippiNormRelHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
 931                                     CV_INSTRUMENT_FUN_IPP(ippiNormRel, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
 932                     if( ret >= 0 )
 933                     {
 934                         result = (normType == NORM_L2SQR) ? norm * norm : norm;
 935                         return true;
 936                     }
 937                 }
 938             }
 939         }
 940         return false;
 941     }
 942
 943     normType &= NORM_TYPE_MASK;
 944
 945     size_t total_size = src1.total();
 946     int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
 947     if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
 948         && cols > 0 && (size_t)rows*cols == total_size )
 949     {
 950         if( !mask.empty() )
 951         {
 952             IppiSize sz = { cols, rows };
 953             int type = src1.type();
 954
 955             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
 956             ippiMaskNormDiffFuncC1 ippiNormDiff_C1MR =
 957                 normType == NORM_INF ?
 958                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
 959                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
 960                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
 961                 0) :
 962                 normType == NORM_L1 ?
 963                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
 964                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
 965                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
 966                 0) :
 967                 normType == NORM_L2 || normType == NORM_L2SQR ?
 968                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
 969                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
 970                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
 971                 0) : 0;
 972             if( ippiNormDiff_C1MR )
 973             {
 974                 Ipp64f norm;
 975                 if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
 976                 {
 977                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
 978                     return true;
 979                 }
 980             }
 981             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
 982             ippiMaskNormDiffFuncC3 ippiNormDiff_C3CMR =
 983                 normType == NORM_INF ?
 984                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
 985                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
 986                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
 987                 0) :
 988                 normType == NORM_L1 ?
 989                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
 990                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
 991                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
 992                 0) :
 993                 normType == NORM_L2 || normType == NORM_L2SQR ?
 994                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
 995                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
 996                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
 997                 0) : 0;
 998             if (cv::ipp::getIppTopFeatures() & (
 999 #if IPP_VERSION_X100 >= 201700
1000                     ippCPUID_AVX512F |
1001 #endif
1002                     ippCPUID_AVX2)
1003             ) // IPP_DISABLE_NORM_16UC3_mask_small (#11399)
1004             {
1005                 if (normType == NORM_L1 && type == CV_16UC3 && sz.width < 16)
1006                     return false;
1007             }
1008             if( ippiNormDiff_C3CMR )
1009             {
1010                 Ipp64f norm1, norm2, norm3;
1011                 if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
1012                     CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
1013                     CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
1014                 {
1015                     Ipp64f norm =
1016                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
1017                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
1018                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
1019                         0;
1020                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
1021                     return true;
1022                 }
1023             }
1024         }
1025         else
1026         {
1027             IppiSize sz = { cols*src1.channels(), rows };
1028             int type = src1.depth();
1029
1030             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
1031             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
1032             ippiNormDiffFuncHint ippiNormDiffHint =
1033                 normType == NORM_L1 ?
1034                 (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
1035                 0) :
1036                 normType == NORM_L2 || normType == NORM_L2SQR ?
1037                 (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
1038                 0) : 0;
1039             ippiNormDiffFuncNoHint ippiNormDiff =
1040                 normType == NORM_INF ?
1041                 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
1042                 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
1043                 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
1044                 type == CV_32F ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
1045                 0) :
1046                 normType == NORM_L1 ?
1047                 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
1048                 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
1049                 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
1050                 0) :
1051                 normType == NORM_L2 || normType == NORM_L2SQR ?
1052                 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
1053                 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
1054                 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
1055                 0) : 0;
1056             if( ippiNormDiffHint || ippiNormDiff )
1057             {
1058                 Ipp64f norm;
1059                 IppStatus ret = ippiNormDiffHint ? CV_INSTRUMENT_FUN_IPP(ippiNormDiffHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
1060                                 CV_INSTRUMENT_FUN_IPP(ippiNormDiff, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
1061                 if( ret >= 0 )
1062                 {
1063                     result = (normType == NORM_L2SQR) ? norm * norm : norm;
1064                     return true;
1065                 }
1066             }
1067         }
1068     }
1069 #else
1070     CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
1071 #endif
1072     return false;
1073 }  // ipp_norm
1074 #endif  // HAVE_IPP
1075
1076
1077 double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
1078 {
1079     CV_INSTRUMENT_REGION();
1080
1081     CV_CheckTypeEQ(_src1.type(), _src2.type(), "Input type mismatch");
1082     CV_Assert(_src1.sameSize(_src2));
1083
1084 #if defined HAVE_OPENCL || defined HAVE_IPP
1085     double _result = 0;
1086 #endif
1087
1088 #ifdef HAVE_OPENCL
1089     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
1090                 ocl_norm(_src1, _src2, normType, _mask, _result),
1091                 _result)
1092 #endif
1093
1094     CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(_src1, _src2, normType, _mask, _result), _result);
1095
1096     if( normType & CV_RELATIVE )
1097     {
1098         return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
1099     }
1100
1101     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
1102     int depth = src1.depth(), cn = src1.channels();
1103
1104     normType &= 7;
1105     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
1106                normType == NORM_L2 || normType == NORM_L2SQR ||
1107               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
1108
1109     if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
1110     {
1111         size_t len = src1.total()*src1.channels();
1112         if( len == (size_t)(int)len )
1113         {
1114             if( src1.depth() == CV_32F )
1115             {
1116                 const float* data1 = src1.ptr<float>();
1117                 const float* data2 = src2.ptr<float>();
1118
1119                 if( normType == NORM_L2 )
1120                 {
1121                     double result = 0;
1122                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
1123                     return std::sqrt(result);
1124                 }
1125                 if( normType == NORM_L2SQR )
1126                 {
1127                     double result = 0;
1128                     GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
1129                     return result;
1130                 }
1131                 if( normType == NORM_L1 )
1132                 {
1133                     double result = 0;
1134                     GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
1135                     return result;
1136                 }
1137                 if( normType == NORM_INF )
1138                 {
1139                     float result = 0;
1140                     GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
1141                     return result;
1142                 }
1143             }
1144         }
1145     }
1146
1147     CV_Assert( mask.empty() || mask.type() == CV_8U );
1148
1149     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
1150     {
1151         if( !mask.empty() )
1152         {
1153             Mat temp;
1154             bitwise_xor(src1, src2, temp);
1155             bitwise_and(temp, mask, temp);
1156             return norm(temp, normType);
1157         }
1158         int cellSize = normType == NORM_HAMMING ? 1 : 2;
1159
1160         const Mat* arrays[] = {&src1, &src2, 0};
1161         uchar* ptrs[2] = {};
1162         NAryMatIterator it(arrays, ptrs);
1163         int total = (int)it.size;
1164         int result = 0;
1165
1166         for( size_t i = 0; i < it.nplanes; i++, ++it )
1167         {
1168             result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
1169         }
1170
1171         return result;
1172     }
1173
1174     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
1175     CV_Assert( func != 0 );
1176
1177     const Mat* arrays[] = {&src1, &src2, &mask, 0};
1178     uchar* ptrs[3] = {};
1179     union
1180     {
1181         double d;
1182         float f;
1183         int i;
1184         unsigned u;
1185     }
1186     result;
1187     result.d = 0;
1188     NAryMatIterator it(arrays, ptrs);
1189     CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
1190
1191     if ((normType == NORM_L1 && depth <= CV_16S) ||
1192         ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
1193     {
1194         // special case to handle "integer" overflow in accumulator
1195         const size_t esz = src1.elemSize();
1196         const int total = (int)it.size;
1197         const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
1198         const int blockSize = std::min(total, intSumBlockSize);
1199         int isum = 0;
1200         int count = 0;
1201
1202         for (size_t i = 0; i < it.nplanes; i++, ++it)
1203         {
1204             for (int j = 0; j < total; j += blockSize)
1205             {
1206                 int bsz = std::min(total - j, blockSize);
1207                 func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&isum, bsz, cn);
1208                 count += bsz;
1209                 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
1210                 {
1211                     result.d += isum;
1212                     isum = 0;
1213                     count = 0;
1214                 }
1215                 ptrs[0] += bsz*esz;
1216                 ptrs[1] += bsz*esz;
1217                 if (ptrs[2])
1218                     ptrs[2] += bsz;
1219             }
1220         }
1221     }
1222     else if (depth == CV_16F)
1223     {
1224         const size_t esz = src1.elemSize();
1225         const int total = (int)it.size;
1226         const int blockSize = std::min(total, divUp(512, cn));
1227         AutoBuffer<float, 1026/*divUp(512,3)*3*2*/> fltbuf(blockSize * cn * 2);
1228         float* data0 = fltbuf.data();
1229         float* data1 = fltbuf.data() + blockSize * cn;
1230         for (size_t i = 0; i < it.nplanes; i++, ++it)
1231         {
1232             for (int j = 0; j < total; j += blockSize)
1233             {
1234                 int bsz = std::min(total - j, blockSize);
1235                 hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
1236                 hal::cvt16f32f((const float16_t*)ptrs[1], data1, bsz * cn);
1237                 func((uchar*)data0, (uchar*)data1, ptrs[2], (uchar*)&result.d, bsz, cn);
1238                 ptrs[0] += bsz*esz;
1239                 ptrs[1] += bsz*esz;
1240                 if (ptrs[2])
1241                     ptrs[2] += bsz;
1242             }
1243         }
1244     }
1245     else
1246     {
1247         // generic implementation
1248         for (size_t i = 0; i < it.nplanes; i++, ++it)
1249         {
1250             func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&result, (int)it.size, cn);
1251         }
1252     }
1253
1254     if( normType == NORM_INF )
1255     {
1256         if (depth == CV_64F || depth == CV_16F)
1257             return result.d;
1258         else if (depth == CV_32F)
1259             return result.f;
1260         else
1261             return result.u;
1262     }
1263     else if( normType == NORM_L2 )
1264         return std::sqrt(result.d);
1265
1266     return result.d;
1267 }
1268
1269 cv::Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
1270 {
1271     return cv::hal::normHamming(a, b, size);
1272 }
1273
1274 double PSNR(InputArray _src1, InputArray _src2, double R)
1275 {
1276     CV_INSTRUMENT_REGION();
1277
1278     //Input arrays must have depth CV_8U
1279     CV_Assert( _src1.type() == _src2.type() );
1280
1281     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
1282     return 20*log10(R/(diff+DBL_EPSILON));
1283 }
1284
1285
1286 #ifdef HAVE_OPENCL
1287 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
1288                            double scale, double delta )
1289 {
1290     UMat src = _src.getUMat();
1291
1292     if( _mask.empty() )
1293         src.convertTo( _dst, dtype, scale, delta );
1294     else if (src.channels() <= 4)
1295     {
1296         const ocl::Device & dev = ocl::Device::getDefault();
1297
1298         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
1299                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
1300                 rowsPerWI = dev.isIntel() ? 4 : 1;
1301
1302         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
1303         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
1304                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
1305                 haveDelta = std::fabs(delta) > DBL_EPSILON,
1306                 doubleSupport = dev.doubleFPConfig() > 0;
1307
1308         if (!haveScale && !haveDelta && stype == dtype)
1309         {
1310             _src.copyTo(_dst, _mask);
1311             return true;
1312         }
1313         if (haveZeroScale)
1314         {
1315             _dst.setTo(Scalar(delta), _mask);
1316             return true;
1317         }
1318
1319         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
1320             return false;
1321
1322         char cvt[2][40];
1323         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
1324                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
1325                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
1326                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
1327                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
1328                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
1329                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1330                              haveScale ? " -D HAVE_SCALE" : "",
1331                              haveDelta ? " -D HAVE_DELTA" : "",
1332                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
1333
1334         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
1335         if (k.empty())
1336             return false;
1337
1338         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
1339
1340         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
1341                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
1342                 dstarg = ocl::KernelArg::ReadWrite(dst);
1343
1344         if (haveScale)
1345         {
1346             if (haveDelta)
1347                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
1348             else
1349                 k.args(srcarg, maskarg, dstarg, fscale);
1350         }
1351         else
1352         {
1353             if (haveDelta)
1354                 k.args(srcarg, maskarg, dstarg, fdelta);
1355             else
1356                 k.args(srcarg, maskarg, dstarg);
1357         }
1358
1359         size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
1360         return k.run(2, globalsize, NULL, false);
1361     }
1362     else
1363     {
1364         UMat temp;
1365         src.convertTo( temp, dtype, scale, delta );
1366         temp.copyTo( _dst, _mask );
1367     }
1368
1369     return true;
1370 }  // ocl_normalize
1371 #endif  // HAVE_OPENCL
1372
1373 void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
1374                int norm_type, int rtype, InputArray _mask)
1375 {
1376     CV_INSTRUMENT_REGION();
1377
1378     double scale = 1, shift = 0;
1379     int type = _src.type(), depth = CV_MAT_DEPTH(type);
1380
1381     if( rtype < 0 )
1382         rtype = _dst.fixedType() ? _dst.depth() : depth;
1383
1384     if( norm_type == CV_MINMAX )
1385     {
1386         double smin = 0, smax = 0;
1387         double dmin = MIN( a, b ), dmax = MAX( a, b );
1388         minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
1389         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
1390         if( rtype == CV_32F )
1391         {
1392             scale = (float)scale;
1393             shift = (float)dmin - (float)(smin*scale);
1394         }
1395         else
1396             shift = dmin - smin*scale;
1397     }
1398     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
1399     {
1400         scale = norm( _src, norm_type, _mask );
1401         scale = scale > DBL_EPSILON ? a/scale : 0.;
1402         shift = 0;
1403     }
1404     else
1405         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
1406
1407     CV_OCL_RUN(_dst.isUMat(),
1408                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
1409
1410     Mat src = _src.getMat();
1411     if( _mask.empty() )
1412         src.convertTo( _dst, rtype, scale, shift );
1413     else
1414     {
1415         Mat temp;
1416         src.convertTo( temp, rtype, scale, shift );
1417         temp.copyTo( _dst, _mask );
1418     }
1419 }
1420
1421 }  // namespace