1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
7 #include "opencl_kernels_core.hpp"
10 /****************************************************************************************\
12 \****************************************************************************************/
14 namespace cv { namespace hal {
// Bit-count lookup table: entry i holds the number of set bits in the byte value i
// (256 entries, one for every possible byte). The `extern` specifier on this
// initialized const array gives it external linkage.
16 extern const uchar popCountTable[256] =
18 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
19 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
20 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
21 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
22 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
23 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
24 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
25 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
// Lookup table for 2-bit cells: entry i holds how many of the four 2-bit groups
// of the byte value i are non-zero (e.g. index 3 = 00000011b -> 1, index 5 = 00000101b -> 2).
28 static const uchar popCountTable2[] =
30 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
31 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
32 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
33 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
34 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
35 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
36 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
37 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
// Lookup table for 4-bit cells: entry i holds how many of the two nibbles of the
// byte value i are non-zero (0 only for index 0, 1 when exactly one nibble is set, else 2).
40 static const uchar popCountTable4[] =
42 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
43 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
44 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
45 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
46 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
// Hamming-type norm of the n bytes at `a`, counted in cells of `cellSize` bits:
// the visible code counts cells that contain at least one set bit.
// One case is forwarded to the two-argument normHamming(a, n) overload and there is a
// dedicated branch for cellSize == 4.
// NOTE(review): the guarding conditions of the other branches and the declarations of
// `i`, `result` and `tab` are not visible here - confirm against the full file.
53 int normHamming(const uchar* a, int n, int cellSize)
56 return normHamming(a, n);
60 else if( cellSize == 4 )
// SIMD path: `t` accumulates per-vector popcounts in 64-bit lanes.
67 v_uint64 t = vx_setzero_u64();
// 2-bit cells: OR every bit with its upper neighbour, then keep only the low bit of
// each pair (0x55 = 01010101b), so a non-zero pair contributes exactly one set bit.
70 v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
71 for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
73 v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
74 t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
// 4-bit cells: fold each nibble into its lowest bit (shift by 2, then by 1) and keep
// only bits 0 and 4 of every byte (0x11 = 00010001b).
79 v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
80 for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
82 v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
83 v_uint16 a1 = a0 | (a0 >> 2);
84 t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
// Horizontal sum of the lane accumulators is added to the scalar result.
88 result += (int)v_reduce_sum(t);
// Non-SIMD alternative: table lookup, four bytes per iteration.
90 #elif CV_ENABLE_UNROLLED
91 for( ; i <= n - 4; i += 4 )
92 result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
// Hamming-type distance between the n-byte arrays `a` and `b`, counted in cells of
// `cellSize` bits: the arrays are XOR-ed and the cells with at least one differing bit
// are counted. One case is forwarded to the three-argument normHamming(a, b, n) overload.
// NOTE(review): the condition guarding that forward and the condition selecting
// popCountTable2 are not visible here - presumably cellSize == 1 and cellSize == 2.
99 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
102 return normHamming(a, b, n);
// Lookup table used by the scalar code below; chosen according to the cell size.
103 const uchar* tab = 0;
105 tab = popCountTable2;
106 else if( cellSize == 4 )
107 tab = popCountTable4;
// SIMD path: `t` accumulates per-vector popcounts in 64-bit lanes.
113 v_uint64 t = vx_setzero_u64();
// Collapse every 2-bit pair of (a ^ b) into its low bit and count those bits
// (0x55 = 01010101b).
116 v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
117 for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
119 v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
120 t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
123 else // cellSize == 4
// Collapse every nibble of (a ^ b) into its lowest bit and count those bits
// (0x11 = 00010001b).
125 v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
126 for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
128 v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
129 v_uint16 ab1 = ab0 | (ab0 >> 2);
130 t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
// Horizontal sum of the lane accumulators is added to the scalar result.
133 result += (int)v_reduce_sum(t);
// Non-SIMD alternative: table lookup on a[i] ^ b[i], four bytes per iteration.
135 #elif CV_ENABLE_UNROLLED
136 for( ; i <= n - 4; i += 4 )
137 result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
138 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
// Per-byte table lookup for the remaining elements.
141 result += tab[a[i] ^ b[i]];
// Squared L2 distance between two float arrays of length n: accumulates (a[j] - b[j])^2.
145 float normL2Sqr_(const float* a, const float* b, int n)
147 int j = 0; float d = 0.f;
// SIMD path: four independent accumulators, 4 * nlanes floats consumed per iteration.
149 v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
150 v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
151 for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
// Loads/subtractions are interleaved with the multiply-adds of the previous difference.
153 v_float32 t0 = vx_load(a + j) - vx_load(b + j);
154 v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
155 v_d0 = v_muladd(t0, t0, v_d0);
156 v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
157 v_d1 = v_muladd(t1, t1, v_d1);
158 v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
159 v_d2 = v_muladd(t2, t2, v_d2);
160 v_d3 = v_muladd(t3, t3, v_d3);
// Combine the four accumulators and reduce across lanes.
162 d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
// Scalar difference for the elements the SIMD loop did not cover
// (the accumulation statement itself is not visible here).
166 float t = a[j] - b[j];
// L1 distance between two float arrays of length n: accumulates |a[j] - b[j]|.
173 float normL1_(const float* a, const float* b, int n)
175 int j = 0; float d = 0.f;
// SIMD path: four independent accumulators, 4 * nlanes floats consumed per iteration.
177 v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
178 v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
179 for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
181 v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
182 v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
183 v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
184 v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
// Combine the four accumulators and reduce across lanes.
186 d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
// Scalar accumulation for the elements the SIMD loop did not cover.
189 d += std::abs(a[j] - b[j]);
// L1 distance between two byte arrays of length n: accumulates |a[j] - b[j]| into `d`.
193 int normL1_(const uchar* a, const uchar* b, int n)
// SIMD path: four sum-of-absolute-differences reductions per iteration, each over
// one vector of v_uint8::nlanes bytes.
197 for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
198 d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
199 v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
200 v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
201 v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
// Scalar accumulation for the elements the SIMD loop did not cover
// (the uchar operands are promoted to int before the subtraction).
204 d += std::abs(a[j] - b[j]);
210 //==================================================================================================
// Infinity-norm kernel: folds max(|src[k]|) into the running value stored in *_result.
//   src     - `len` elements of `cn` channels each
//   mask    - optional per-element mask
//   _result - running maximum; read on entry
// NOTE(review): the mask test, the write-back to *_result and the returned value are
// not visible here - confirm against the full file.
212 template<typename T, typename ST> int
213 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
215 ST result = *_result;
// One visible path handles the whole buffer as a flat array of len*cn values.
218 result = std::max(result, normInf<T, ST>(src, len*cn));
// The other path walks element by element (src advances by cn) and scans every channel.
222 for( int i = 0; i < len; i++, src += cn )
225 for( int k = 0; k < cn; k++ )
226 result = std::max(result, ST(cv_abs(src[k])));
// L1-norm kernel: adds sum(|src[k]|) to the running value stored in *_result.
//   src     - `len` elements of `cn` channels each
//   mask    - optional per-element mask
//   _result - running sum; read on entry
// NOTE(review): the mask test, the write-back to *_result and the returned value are
// not visible here - confirm against the full file.
233 template<typename T, typename ST> int
234 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
236 ST result = *_result;
// One visible path handles the whole buffer as a flat array of len*cn values.
239 result += normL1<T, ST>(src, len*cn);
// The other path walks element by element (src advances by cn) and scans every channel.
243 for( int i = 0; i < len; i++, src += cn )
246 for( int k = 0; k < cn; k++ )
247 result += cv_abs(src[k]);
// L2-norm kernel: adds a squared-L2 sum (normL2Sqr) of `src` to the running value in
// *_result; no square root is taken in the visible code.
//   src     - `len` elements of `cn` channels each
//   mask    - optional per-element mask
//   _result - running sum; read on entry
// NOTE(review): the per-channel accumulation statement, the mask test, the write-back
// to *_result and the returned value are not visible here - confirm against the full file.
254 template<typename T, typename ST> int
255 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
257 ST result = *_result;
// One visible path handles the whole buffer as a flat array of len*cn values.
260 result += normL2Sqr<T, ST>(src, len*cn);
// The other path walks element by element (src advances by cn) and scans every channel.
264 for( int i = 0; i < len; i++, src += cn )
267 for( int k = 0; k < cn; k++ )
// Infinity-norm-of-difference kernel: folds max(|src1[k] - src2[k]|) into the running
// value stored in *_result.
//   src1, src2 - `len` elements of `cn` channels each
//   mask       - optional per-element mask
//   _result    - running maximum; read on entry
// NOTE(review): the mask test, the write-back to *_result and the returned value are
// not visible here - confirm against the full file.
278 template<typename T, typename ST> int
279 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
281 ST result = *_result;
// One visible path handles both buffers as flat arrays of len*cn values.
284 result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
// The other path walks element by element (both pointers advance by cn).
288 for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
291 for( int k = 0; k < cn; k++ )
292 result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
// L1-norm-of-difference kernel: adds sum(|src1[k] - src2[k]|) to the running value
// stored in *_result.
//   src1, src2 - `len` elements of `cn` channels each
//   mask       - optional per-element mask
//   _result    - running sum; read on entry
// NOTE(review): the mask test, the write-back to *_result and the returned value are
// not visible here - confirm against the full file.
299 template<typename T, typename ST> int
300 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
302 ST result = *_result;
// One visible path handles both buffers as flat arrays of len*cn values.
305 result += normL1<T, ST>(src1, src2, len*cn);
// The other path walks element by element (both pointers advance by cn).
309 for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
312 for( int k = 0; k < cn; k++ )
313 result += std::abs(src1[k] - src2[k]);
// L2-norm-of-difference kernel: adds a squared-L2 sum (normL2Sqr) of the difference to
// the running value in *_result; no square root is taken in the visible code.
//   src1, src2 - `len` elements of `cn` channels each
//   mask       - optional per-element mask
//   _result    - running sum; read on entry
// NOTE(review): the accumulation of `v`, the mask test, the write-back to *_result and
// the returned value are not visible here - confirm against the full file.
320 template<typename T, typename ST> int
321 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
323 ST result = *_result;
// One visible path handles both buffers as flat arrays of len*cn values.
326 result += normL2Sqr<T, ST>(src1, src2, len*cn);
// The other path walks element by element (both pointers advance by cn).
330 for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
333 for( int k = 0; k < cn; k++ )
// The per-channel difference is converted to the accumulator type ST.
335 ST v = src1[k] - src2[k];
// Defines two static, non-template wrappers for norm kind L (Inf, L1 or L2) and one
// element type: norm<L>_<suffix> forwards to the norm<L>_ template and
// normDiff<L>_<suffix> forwards to the normDiff<L>_ template.
//   type  - element type of the source data
//   ntype - accumulator/result type
344 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
345 static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
346 { return norm##L##_(src, mask, r, len, cn); } \
347 static int normDiff##L##_##suffix(const type* src1, const type* src2, \
348 const uchar* mask, ntype* r, int len, int cn) \
349 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
// Expands CV_DEF_NORM_FUNC for the Inf, L1 and L2 kinds of one element type, each with
// its own accumulator type (inftype, l1type, l2type respectively).
351 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
352 CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
353 CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
354 CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
// Wrapper instantiations per element type.
// Columns: suffix, element type, Inf accumulator, L1 accumulator, L2 accumulator.
// 8-bit types accumulate everything in int; 16-bit types switch to double for L2;
// 32-bit int uses double for L1 and L2; float keeps float only for Inf.
356 CV_DEF_NORM_ALL(8u, uchar, int, int, int)
357 CV_DEF_NORM_ALL(8s, schar, int, int, int)
358 CV_DEF_NORM_ALL(16u, ushort, int, int, double)
359 CV_DEF_NORM_ALL(16s, short, int, int, double)
360 CV_DEF_NORM_ALL(32s, int, int, double, double)
361 CV_DEF_NORM_ALL(32f, float, float, double, double)
362 CV_DEF_NORM_ALL(64f, double, double, double, double)
// Type-erased kernel signatures: the typed wrappers generated by CV_DEF_NORM_FUNC are
// cast to these so they can be stored in one table.
// NormFunc:     (src, mask, result, len, cn)
// NormDiffFunc: (src1, src2, mask, result, len, cn)
365 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
366 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
// Returns the single-array norm kernel for a (normType, depth) pair.
//   normType - row index into the table: 0 = Inf, 1 = L1, 2 = L2 kernels
//   depth    - column index; columns follow 8u, 8s, 16u, 16s, 32s, 32f, 64f, and the
//              eighth column holds 0 (no kernel)
// No bounds checking is done on either index in the visible code.
368 static NormFunc getNormFunc(int normType, int depth)
370 static NormFunc normTab[3][8] =
// Inf kernels.
373 (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
374 (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
// L1 kernels.
377 (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
378 (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
// L2 kernels (these accumulate squared sums).
381 (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
382 (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
386 return normTab[normType][depth];
// Returns the two-array (difference) norm kernel for a (normType, depth) pair.
//   normType - row index into the table: 0 = Inf, 1 = L1, 2 = L2 kernels
//   depth    - column index; columns follow 8u, 8s, 16u, 16s, 32s, 32f, 64f, and the
//              eighth column holds 0 (no kernel)
// Only the 8u and 32f entries go through GET_OPTIMIZED here.
// No bounds checking is done on either index in the visible code.
389 static NormDiffFunc getNormDiffFunc(int normType, int depth)
391 static NormDiffFunc normDiffTab[3][8] =
// Inf kernels.
394 (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
395 (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
396 (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
397 (NormDiffFunc)normDiffInf_64f, 0
// L1 kernels.
400 (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
401 (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
402 (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
403 (NormDiffFunc)normDiffL1_64f, 0
// L2 kernels (these accumulate squared sums).
406 (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
407 (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
408 (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
409 (NormDiffFunc)normDiffL2_64f, 0
413 return normDiffTab[normType][depth];
// OpenCL implementation of the single-array norm. Writes the norm to `result`;
// the bool return reports whether this path handled the request (false is returned
// for the unsupported FP16 case below).
//   _src     - input array
//   normType - NORM_INF, NORM_L1, NORM_L2 or NORM_L2SQR
//   _mask    - optional mask
418 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
420 const ocl::Device & d = ocl::Device::getDefault();
426 const int cn = _src.channels();
429 int type = _src.type(), depth = CV_MAT_DEPTH(type);
430 bool doubleSupport = d.doubleFPConfig() > 0,
431 haveMask = _mask.kind() != _InputArray::NONE;
434 return false; // TODO: support FP16
// Unsupported norm types, and CV_64F data on a device without double support, are
// rejected here (presumably by returning false - the statement is not visible).
436 if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
437 (!doubleSupport && depth == CV_64F))
440 UMat src = _src.getUMat();
// NORM_INF is delegated to ocl_minMaxIdx, which writes straight into `result`.
// The last argument is true for every depth other than CV_8U / CV_16U
// (presumably it requests absolute values - confirm against ocl_minMaxIdx).
442 if (normType == NORM_INF)
444 if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
445 std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
448 else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
// Unsigned depths use the plain sum for L1; other depths use the sum of absolute values.
451 bool unstype = depth == CV_8U || depth == CV_16U;
// Without a mask the data is reshaped to one channel, so a single sum covers all
// channels; with a mask the per-channel sums are visited by the loop below.
453 if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
454 OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
458 for (int i = 0; i < (haveMask ? cn : 1); ++i)
// L1 and L2SQR use the accumulated sum directly; L2 takes its square root.
461 result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
// Intel IPP implementation of the single-array norm. Writes the norm to `result`.
//   src      - input matrix
//   normType - NORM_INF, NORM_L1, NORM_L2 or NORM_L2SQR
//   mask     - optional mask
// NOTE(review): the return statements and the test that separates the masked from the
// unmasked section are not visible here - confirm against the full file.
470 static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
472 CV_INSTRUMENT_REGION_IPP();
474 #if IPP_VERSION_X100 >= 700
// View the data as a rows x cols 2D image: rows is the first dimension and cols is
// everything else folded together.
475 size_t total_size = src.total();
476 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
// Requires a genuinely 2D matrix or continuous data, and that rows*cols covers every element.
478 if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
479 && cols > 0 && (size_t)rows*cols == total_size )
// ---- Section that passes the mask to IPP ----
483 IppiSize sz = { cols, rows };
484 int type = src.type();
// Single-channel masked functions, selected by norm type and element type
// (8UC1, 16UC1 and 32FC1 are the visible candidates).
486 typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
487 ippiMaskNormFuncC1 ippiNorm_C1MR =
488 normType == NORM_INF ?
489 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
490 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
491 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
493 normType == NORM_L1 ?
494 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
495 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
496 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
// NORM_L2 and NORM_L2SQR share the same IPP L2 function.
498 normType == NORM_L2 || normType == NORM_L2SQR ?
499 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
500 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
501 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
// A non-negative IPP status is treated as success.
506 if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
// NORM_L2SQR is derived by squaring the value returned by the L2 function.
508 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
// Three-channel masked functions: the extra int argument selects the channel to
// process (called below with 1, 2 and 3).
512 typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
513 ippiMaskNormFuncC3 ippiNorm_C3CMR =
514 normType == NORM_INF ?
515 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
516 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
517 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
519 normType == NORM_L1 ?
520 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
521 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
522 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
524 normType == NORM_L2 || normType == NORM_L2SQR ?
525 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
526 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
527 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
// One IPP call per channel; all three must succeed.
531 Ipp64f norm1, norm2, norm3;
532 if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
533 CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
534 CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
// Combine the per-channel norms: maximum for Inf, sum for L1, and the root of the sum
// of squares for L2 / L2SQR.
537 normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
538 normType == NORM_L1 ? norm1 + norm2 + norm3 :
539 normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
541 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
// ---- Section that calls IPP without a mask ----
// All channels are folded into the row width so single-channel (C1R) functions can be
// used; note that `type` holds only the depth in this section.
548 IppiSize sz = { cols*src.channels(), rows };
549 int type = src.depth();
// Two signatures are needed: the 32f L1/L2 functions take an extra algorithm hint.
551 typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
552 typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
553 ippiNormFuncHint ippiNormHint =
554 normType == NORM_L1 ?
555 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
557 normType == NORM_L2 || normType == NORM_L2SQR ?
558 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
560 ippiNormFuncNoHint ippiNorm =
561 normType == NORM_INF ?
562 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
563 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
564 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
565 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
567 normType == NORM_L1 ?
568 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
569 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
570 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
572 normType == NORM_L2 || normType == NORM_L2SQR ?
573 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
574 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
575 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
577 if( ippiNormHint || ippiNorm )
// The hinted variant takes precedence when available and is invoked with ippAlgHintAccurate.
580 IppStatus ret = ippiNormHint ? CV_INSTRUMENT_FUN_IPP(ippiNormHint, src.ptr(), (int)src.step[0], sz, &norm, ippAlgHintAccurate) :
581 CV_INSTRUMENT_FUN_IPP(ippiNorm, src.ptr(), (int)src.step[0], sz, &norm);
// NORM_L2SQR is derived by squaring the value returned by the L2 function.
584 result = (normType == NORM_L2SQR) ? norm * norm : norm;
// Silences unused-parameter warnings on the code path where none of the arguments are used.
591 CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
// Computes the norm of a single array.
//   _src     - input array
//   normType - NORM_INF, NORM_L1, NORM_L2, NORM_L2SQR, or NORM_HAMMING / NORM_HAMMING2
//              (the Hamming variants are accepted only for CV_8U single-channel input)
//   _mask    - optional mask; must be empty or of type CV_8U
// Returns the norm as a double. Accelerated OpenCL / IPP paths are tried first, then
// fast paths for continuous unmasked data, then the generic plane-by-plane kernels.
// NOTE(review): several declarations and branch bodies are not visible here.
597 double norm( InputArray _src, int normType, InputArray _mask )
599 CV_INSTRUMENT_REGION();
// Strip any flag bits outside NORM_TYPE_MASK before validating the norm type.
601 normType &= NORM_TYPE_MASK;
602 CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
603 normType == NORM_L2 || normType == NORM_L2SQR ||
604 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
606 #if defined HAVE_OPENCL || defined HAVE_IPP
// OpenCL path: only attempted for UMat input with at most 2 dimensions.
611 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
612 ocl_norm(_src, normType, _mask, _result),
616 Mat src = _src.getMat(), mask = _mask.getMat();
617 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);
619 int depth = src.depth(), cn = src.channels();
// Fast path: continuous, unmasked data is processed as one flat array, provided the
// element count fits in an int.
620 if( src.isContinuous() && mask.empty() )
622 size_t len = src.total()*cn;
623 if( len == (size_t)(int)len )
625 if( depth == CV_32F )
627 const float* data = src.ptr<float>();
// The L2 kernel accumulates the squared sum; the root is taken only for NORM_L2.
629 if( normType == NORM_L2 )
632 GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
633 return std::sqrt(result);
635 if( normType == NORM_L2SQR )
638 GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
641 if( normType == NORM_L1 )
644 GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
647 if( normType == NORM_INF )
650 GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
656 const uchar* data = src.ptr<uchar>();
658 if( normType == NORM_HAMMING )
660 return hal::normHamming(data, (int)len);
// NORM_HAMMING2 counts 2-bit cells.
663 if( normType == NORM_HAMMING2 )
665 return hal::normHamming(data, (int)len, 2);
671 CV_Assert( mask.empty() || mask.type() == CV_8U );
673 if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
// The masked bits are cleared with a bitwise AND and the norm is recomputed on the
// temporary without a mask.
678 bitwise_and(src, mask, temp);
679 return norm(temp, normType);
681 int cellSize = normType == NORM_HAMMING ? 1 : 2;
// Iterate over the planes of `src` and accumulate the Hamming norm of each.
683 const Mat* arrays[] = {&src, 0};
685 NAryMatIterator it(arrays, ptrs);
686 int total = (int)it.size;
689 for( size_t i = 0; i < it.nplanes; i++, ++it )
691 result += hal::normHamming(ptrs[0], total, cellSize);
// Generic path: normType >> 1 is used as the row index of the kernel table, and
// CV_16F input uses the CV_32F kernels (it is converted to float in blocks below).
697 NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
698 CV_Assert( func != 0 );
700 const Mat* arrays[] = {&src, &mask, 0};
710 NAryMatIterator it(arrays, ptrs);
711 CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
713 if ((normType == NORM_L1 && depth <= CV_16S) ||
714 ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
716 // special case to handle "integer" overflow in accumulator
717 const size_t esz = src.elemSize();
718 const int total = (int)it.size;
// Elements processed between flushes of the integer accumulator: 2^23 for L1 on
// 8-bit data, 2^15 otherwise, divided by the channel count.
719 const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
720 const int blockSize = std::min(total, intSumBlockSize);
724 for (size_t i = 0; i < it.nplanes; i++, ++it)
726 for (int j = 0; j < total; j += blockSize)
728 int bsz = std::min(total - j, blockSize);
729 func(ptrs[0], ptrs[1], (uchar*)&isum, bsz, cn);
// True once another block would reach the block-size limit, or on the final block of
// the final plane (presumably the integer sum is flushed to the result here).
731 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
743 else if (depth == CV_16F)
745 const size_t esz = src.elemSize();
746 const int total = (int)it.size;
// FP16 data is converted to float in blocks of about 1024 values and fed to the
// float kernel; the on-stack buffer holds up to 1026 floats.
747 const int blockSize = std::min(total, divUp(1024, cn));
748 AutoBuffer<float, 1026/*divUp(1024,3)*3*/> fltbuf(blockSize * cn);
749 float* data0 = fltbuf.data();
750 for (size_t i = 0; i < it.nplanes; i++, ++it)
752 for (int j = 0; j < total; j += blockSize)
754 int bsz = std::min(total - j, blockSize);
755 hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
756 func((uchar*)data0, ptrs[1], (uchar*)&result.d, bsz, cn);
765 // generic implementation
766 for (size_t i = 0; i < it.nplanes; i++, ++it)
768 func(ptrs[0], ptrs[1], (uchar*)&result, (int)it.size, cn);
// Which field of `result` is read for NORM_INF depends on the depth.
772 if( normType == NORM_INF )
774 if(depth == CV_64F || depth == CV_16F)
776 else if (depth == CV_32F)
// The kernels accumulate the squared sum, so NORM_L2 takes the root at the end.
781 else if( normType == NORM_L2 )
782 return std::sqrt(result.d);
787 //==================================================================================================
// OpenCL implementation of the two-array norm, including the relative form selected by
// NORM_RELATIVE. Writes the norm to `result`.
//   _src1, _src2 - input arrays
//   normType     - norm type, optionally combined with NORM_RELATIVE
//   _mask        - optional mask
// NOTE(review): the early-exit statements (including the one for NVIDIA devices) and
// the returned values are not visible here.
790 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
793 if (ocl::Device::getDefault().isNVidia())
798 int cn = _src1.channels();
801 int type = _src1.type(), depth = CV_MAT_DEPTH(type);
// Split off the relative flag; `normsum` marks the norm types computed via ocl_sum.
802 bool relative = (normType & NORM_RELATIVE) != 0;
803 normType &= ~NORM_RELATIVE;
804 bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
// Special-cased combination: L1 on 16UC3 data with a mask.
807 if(normType == NORM_L1 && type == CV_16UC3 && !_mask.empty())
// Sum-based norms: the squared sum for L2 / L2SQR, the plain sum otherwise;
// `sc2` is also passed, and is used as the denominator for relative norms.
813 if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
814 OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
// The remaining case goes through ocl_minMaxIdx; the second result is requested
// only for relative norms.
819 if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
820 false, _src2, relative ? &sc2[0] : NULL))
826 for (int i = 0; i < cn; ++i)
// NORM_L2 takes the square root of the accumulated value.
833 if (normType == NORM_L2)
835 result = std::sqrt(result);
// DBL_EPSILON is added to the denominator to avoid dividing by zero.
841 result /= (s2 + DBL_EPSILON);
845 #endif // HAVE_OPENCL
// Intel IPP implementation of the two-array norm. It has a section for relative norms
// (ippiNormRel_* functions) and a section for plain difference norms (ippiNormDiff_*).
// Writes the norm to `result`.
//   _src1, _src2 - input arrays
//   normType     - norm type, optionally combined with CV_RELATIVE
//   _mask        - optional mask
// NOTE(review): the return statements and the tests that separate the masked from the
// unmasked sections are not visible here - confirm against the full file.
848 static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
850 CV_INSTRUMENT_REGION_IPP();
852 #if IPP_VERSION_X100 >= 700
853 Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
// ================= Relative norm =================
855 if( normType & CV_RELATIVE )
857 normType &= NORM_TYPE_MASK;
// View the data as a rows x cols 2D image; requires 2D or continuous inputs and that
// rows*cols covers every element.
859 size_t total_size = src1.total();
860 int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
861 if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
862 && cols > 0 && (size_t)rows*cols == total_size )
// Section that passes the mask to IPP: single-channel 8U / 16U / 32F only.
866 IppiSize sz = { cols, rows };
867 int type = src1.type();
869 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
870 ippiMaskNormDiffFuncC1 ippiNormRel_C1MR =
871 normType == NORM_INF ?
872 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_8u_C1MR :
873 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_16u_C1MR :
874 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_32f_C1MR :
876 normType == NORM_L1 ?
877 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_8u_C1MR :
878 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_16u_C1MR :
879 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_32f_C1MR :
// NORM_L2 and NORM_L2SQR share the same IPP L2 function.
881 normType == NORM_L2 || normType == NORM_L2SQR ?
882 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_8u_C1MR :
883 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_16u_C1MR :
884 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_32f_C1MR :
886 if( ippiNormRel_C1MR )
// A non-negative IPP status is treated as success.
889 if( CV_INSTRUMENT_FUN_IPP(ippiNormRel_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
// NORM_L2SQR is derived by squaring the value returned by the L2 function.
891 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
// Section that calls IPP without a mask: channels are folded into the row width so
// single-channel (C1R) functions can be used; `type` holds only the depth here.
898 IppiSize sz = { cols*src1.channels(), rows };
899 int type = src1.depth();
// Two signatures are needed: the 32f L1/L2 functions take an extra algorithm hint.
901 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
902 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
903 ippiNormRelFuncHint ippiNormRelHint =
904 normType == NORM_L1 ?
905 (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
907 normType == NORM_L2 || normType == NORM_L2SQR ?
908 (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
910 ippiNormRelFuncNoHint ippiNormRel =
911 normType == NORM_INF ?
912 (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
913 type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
914 type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
915 type == CV_32F ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
917 normType == NORM_L1 ?
918 (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
919 type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
920 type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
922 normType == NORM_L2 || normType == NORM_L2SQR ?
923 (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
924 type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
925 type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
927 if( ippiNormRelHint || ippiNormRel )
// The hinted variant takes precedence when available and is invoked with ippAlgHintAccurate.
930 IppStatus ret = ippiNormRelHint ? CV_INSTRUMENT_FUN_IPP(ippiNormRelHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
931 CV_INSTRUMENT_FUN_IPP(ippiNormRel, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
934 result = (normType == NORM_L2SQR) ? norm * norm : norm;
// ================= Plain difference norm =================
943 normType &= NORM_TYPE_MASK;
// Same 2D-view requirements as in the relative section.
945 size_t total_size = src1.total();
946 int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
947 if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
948 && cols > 0 && (size_t)rows*cols == total_size )
// Section that passes the mask to IPP: single-channel functions first.
952 IppiSize sz = { cols, rows };
953 int type = src1.type();
955 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
956 ippiMaskNormDiffFuncC1 ippiNormDiff_C1MR =
957 normType == NORM_INF ?
958 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
959 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
960 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
962 normType == NORM_L1 ?
963 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
964 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
965 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
967 normType == NORM_L2 || normType == NORM_L2SQR ?
968 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
969 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
970 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
972 if( ippiNormDiff_C1MR )
975 if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
977 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
// Three-channel masked functions: the extra int argument selects the channel to
// process (called below with 1, 2 and 3).
981 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
982 ippiMaskNormDiffFuncC3 ippiNormDiff_C3CMR =
983 normType == NORM_INF ?
984 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
985 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
986 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
988 normType == NORM_L1 ?
989 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
990 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
991 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
993 normType == NORM_L2 || normType == NORM_L2SQR ?
994 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
995 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
996 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
// Workaround keyed on the detected CPU feature set (the feature flags tested are not
// visible here): masked L1 on 16UC3 images narrower than 16 pixels is special-cased.
998 if (cv::ipp::getIppTopFeatures() & (
999 #if IPP_VERSION_X100 >= 201700
1003 ) // IPP_DISABLE_NORM_16UC3_mask_small (#11399)
1005 if (normType == NORM_L1 && type == CV_16UC3 && sz.width < 16)
1008 if( ippiNormDiff_C3CMR )
// One IPP call per channel; all three must succeed.
1010 Ipp64f norm1, norm2, norm3;
1011 if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
1012 CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
1013 CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
// Combine the per-channel norms: maximum for Inf, sum for L1, and the root of the sum
// of squares for L2 / L2SQR.
1016 normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
1017 normType == NORM_L1 ? norm1 + norm2 + norm3 :
1018 normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
1020 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
// Section that calls IPP without a mask: channels are folded into the row width so
// single-channel (C1R) functions can be used; `type` holds only the depth here.
1027 IppiSize sz = { cols*src1.channels(), rows };
1028 int type = src1.depth();
1030 typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
1031 typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
1032 ippiNormDiffFuncHint ippiNormDiffHint =
1033 normType == NORM_L1 ?
1034 (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
1036 normType == NORM_L2 || normType == NORM_L2SQR ?
1037 (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
1039 ippiNormDiffFuncNoHint ippiNormDiff =
1040 normType == NORM_INF ?
1041 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
1042 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
1043 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
1044 type == CV_32F ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
1046 normType == NORM_L1 ?
1047 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
1048 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
1049 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
1051 normType == NORM_L2 || normType == NORM_L2SQR ?
1052 (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
1053 type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
1054 type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
1056 if( ippiNormDiffHint || ippiNormDiff )
// The hinted variant takes precedence when available and is invoked with ippAlgHintAccurate.
1059 IppStatus ret = ippiNormDiffHint ? CV_INSTRUMENT_FUN_IPP(ippiNormDiffHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
1060 CV_INSTRUMENT_FUN_IPP(ippiNormDiff, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
// NORM_L2SQR is derived by squaring the value returned by the L2 function.
1063 result = (normType == NORM_L2SQR) ? norm * norm : norm;
// Silences unused-parameter warnings on the code path where none of the arguments are used.
1070 CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
// Two-array norm: evaluates the requested norm over the pair (_src1, _src2) using the
// normDiff* helper family, or a relative norm when the CV_RELATIVE bit is set in normType.
//   _src1, _src2 : inputs; must have identical type and size (checked below).
//   normType     : NORM_INF, NORM_L1, NORM_L2, NORM_L2SQR, NORM_HAMMING or NORM_HAMMING2
//                  (the Hamming variants are accepted only for CV_8U data), optionally
//                  combined with CV_RELATIVE.
//   _mask        : optional mask; when non-empty it must be of type CV_8U.
// NOTE(review): this listing omits several lines of the function (braces, local
// declarations, some returns); the comments describe only the statements that are visible.
1077 double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
1079 CV_INSTRUMENT_REGION();
// Reject mismatched inputs before any dispatch.
1081 CV_CheckTypeEQ(_src1.type(), _src2.type(), "Input type mismatch");
1082 CV_Assert(_src1.sameSize(_src2));
// Accelerated back-ends are tried first: ocl_norm (gated on _src1 being a UMat) and then
// ipp_norm; both receive _result as their output argument.
1084 #if defined HAVE_OPENCL || defined HAVE_IPP
1089 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
1090 ocl_norm(_src1, _src2, normType, _mask, _result),
1094 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(_src1, _src2, normType, _mask, _result), _result);
// Relative norm: the pair norm with CV_RELATIVE cleared, divided by the single-array norm
// of _src2. DBL_EPSILON is added to the denominator, presumably to avoid division by zero.
1096 if( normType & CV_RELATIVE )
1098 return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
1101 Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
1102 int depth = src1.depth(), cn = src1.channels();
// Only these norm types are supported; Hamming norms additionally require CV_8U input.
1105 CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
1106 normType == NORM_L2 || normType == NORM_L2SQR ||
1107 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
// Fast path: continuous, unmasked CV_32F data is handed to the optimized kernels as one
// flat array (channel count passed as 1, null mask).
1109 if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
1111 size_t len = src1.total()*src1.channels();
// The kernels take an int length, so the fast path is taken only when len survives an int round-trip.
1112 if( len == (size_t)(int)len )
1114 if( src1.depth() == CV_32F )
1116 const float* data1 = src1.ptr<float>();
1117 const float* data2 = src2.ptr<float>();
// NORM_L2 uses the L2 kernel's accumulated value and returns its square root.
1119 if( normType == NORM_L2 )
1122 GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
1123 return std::sqrt(result);
// NORM_L2SQR uses the same kernel; no square root appears in the visible lines of this branch.
1125 if( normType == NORM_L2SQR )
1128 GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
1131 if( normType == NORM_L1 )
1134 GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
1137 if( normType == NORM_INF )
1140 GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
1147 CV_Assert( mask.empty() || mask.type() == CV_8U );
// Hamming norms.
1149 if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
// XOR the inputs, AND with the mask, then reuse the single-array norm on the result.
// NOTE(review): the condition guarding these three lines is not visible here; presumably
// this is the non-empty-mask case - confirm.
1154 bitwise_xor(src1, src2, temp);
1155 bitwise_and(temp, mask, temp);
1156 return norm(temp, normType);
// cellSize passed to hal::normHamming: 1 for NORM_HAMMING, 2 for NORM_HAMMING2.
1158 int cellSize = normType == NORM_HAMMING ? 1 : 2;
1160 const Mat* arrays[] = {&src1, &src2, 0};
1161 uchar* ptrs[2] = {};
1162 NAryMatIterator it(arrays, ptrs);
1163 int total = (int)it.size;
// Accumulate the Hamming distance plane by plane.
1166 for( size_t i = 0; i < it.nplanes; i++, ++it )
1168 result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
// Generic path: pick the kernel by (normType >> 1) and depth. CV_16F data uses the CV_32F
// kernel and is converted to float block by block further down.
1174 NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
1175 CV_Assert( func != 0 );
1177 const Mat* arrays[] = {&src1, &src2, &mask, 0};
1178 uchar* ptrs[3] = {};
1188 NAryMatIterator it(arrays, ptrs);
// The plane size is cast to int below, so it must be smaller than INT_MAX.
1189 CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
1191 if ((normType == NORM_L1 && depth <= CV_16S) ||
1192 ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
1194 // special case to handle "integer" overflow in accumulator
1195 const size_t esz = src1.elemSize();
1196 const int total = (int)it.size;
// Block cap: 2^23 for NORM_L1 on depth <= CV_8S, otherwise 2^15, divided by the channel count.
1197 const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
1198 const int blockSize = std::min(total, intSumBlockSize);
1202 for (size_t i = 0; i < it.nplanes; i++, ++it)
1204 for (int j = 0; j < total; j += blockSize)
1206 int bsz = std::min(total - j, blockSize);
// The kernel accumulates into isum (declared on a line not shown in this listing).
1207 func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&isum, bsz, cn);
// True when the running count is about to reach the cap, or this is the last block of the
// last plane. NOTE(review): the body is not visible; presumably isum is flushed into the
// floating-point result here - confirm.
1209 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
1222 else if (depth == CV_16F)
1224 const size_t esz = src1.elemSize();
1225 const int total = (int)it.size;
// Process at most divUp(512, cn) elements per step.
1226 const int blockSize = std::min(total, divUp(512, cn));
// One scratch buffer holds both converted blocks back to back (blockSize*cn floats each).
1227 AutoBuffer<float, 1026/*divUp(512,3)*3*2*/> fltbuf(blockSize * cn * 2);
1228 float* data0 = fltbuf.data();
1229 float* data1 = fltbuf.data() + blockSize * cn;
1230 for (size_t i = 0; i < it.nplanes; i++, ++it)
1232 for (int j = 0; j < total; j += blockSize)
1234 int bsz = std::min(total - j, blockSize);
// Convert the current half-float block of each input to float, then run the CV_32F kernel on the copies.
1235 hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
1236 hal::cvt16f32f((const float16_t*)ptrs[1], data1, bsz * cn);
1237 func((uchar*)data0, (uchar*)data1, ptrs[2], (uchar*)&result.d, bsz, cn);
1247 // generic implementation
1248 for (size_t i = 0; i < it.nplanes; i++, ++it)
1250 func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&result, (int)it.size, cn);
// Final conversion of the accumulator. For NORM_INF the branch taken depends on depth
// (the branch bodies are not visible in this listing).
1254 if( normType == NORM_INF )
1256 if (depth == CV_64F || depth == CV_16F)
1258 else if (depth == CV_32F)
// NORM_L2 returns the square root of the accumulated double value.
1263 else if( normType == NORM_L2 )
1264 return std::sqrt(result.d);
// Function-call operator of cv::Hamming: forwards the two unsigned char buffers a and b,
// together with size, to cv::hal::normHamming and returns its result.
1269 cv::Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
1271 return cv::hal::normHamming(a, b, size);
// Returns 20*log10(R / (diff + DBL_EPSILON)), where diff is the square root of the
// NORM_L2SQR norm of the pair divided by the element count (total() * channels()).
//   _src1, _src2 : inputs; must have the same type (asserted below).
//   R            : numerator of the ratio inside the logarithm.
1274 double PSNR(InputArray _src1, InputArray _src2, double R)
1276 CV_INSTRUMENT_REGION();
1278 // Both inputs must have the same type. NOTE(review): this comment used to say the inputs must have depth CV_8U, but the assertion below only checks type equality.
1279 CV_Assert( _src1.type() == _src2.type() );
// The element count is taken from _src1 only.
1281 double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
// DBL_EPSILON is added to the denominator, presumably so identical inputs (diff == 0) do not divide by zero.
1282 return 20*log10(R/(diff+DBL_EPSILON));
// OpenCL helper for normalize(): writes src*scale + delta, converted to dtype, into _dst,
// with _mask passed to the masked copy/set/kernel steps.
// NOTE(review): several guarding conditions and local declarations (e.g. cvt, temp) are
// on lines not visible in this listing; the comments describe only what is shown.
1287 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
1288 double scale, double delta )
1290 UMat src = _src.getUMat();
// Direct conversion straight into _dst. NOTE(review): the condition selecting this branch
// is not visible; presumably it is the empty-mask case - confirm.
1293 src.convertTo( _dst, dtype, scale, delta );
// Kernel path, taken only for sources with at most 4 channels.
1294 else if (src.channels() <= 4)
1296 const ocl::Device & dev = ocl::Device::getDefault();
// wdepth is the working depth: at least CV_32F, and no lower than the source or destination depth.
// rowsPerWI is 4 on Intel devices and 1 otherwise.
1298 int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
1299 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
1300 rowsPerWI = dev.isIntel() ? 4 : 1;
// The kernel receives scale and delta as floats.
1302 float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
// haveScale: scale differs from 1 by more than DBL_EPSILON.
// haveZeroScale: |scale| is not greater than DBL_EPSILON.
// haveDelta: |delta| exceeds DBL_EPSILON.
1303 bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
1304 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
1305 haveDelta = std::fabs(delta) > DBL_EPSILON,
1306 doubleSupport = dev.doubleFPConfig() > 0;
// Identity transform with unchanged type: a masked copy is enough.
1308 if (!haveScale && !haveDelta && stype == dtype)
1310 _src.copyTo(_dst, _mask);
// Fill the masked region with delta. NOTE(review): the guard is not visible; presumably
// this is the haveZeroScale case - confirm.
1315 _dst.setTo(Scalar(delta), _mask);
// 64-bit float data on a device without double support (the branch body is not visible here).
1319 if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
// Build options: source/destination/working types, conversion functions, channel count,
// rows per work-item, and the optional DOUBLE_SUPPORT / HAVE_SCALE / HAVE_DELTA defines.
1323 String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
1324 " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
1325 ocl::typeToStr(stype), ocl::typeToStr(dtype),
1326 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
1327 rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
1328 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
1329 doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1330 haveScale ? " -D HAVE_SCALE" : "",
1331 haveDelta ? " -D HAVE_DELTA" : "",
1332 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
1334 ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
1338 UMat mask = _mask.getUMat(), dst = _dst.getUMat();
1340 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
1341 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
1342 dstarg = ocl::KernelArg::ReadWrite(dst);
// Four argument lists follow: scale and delta, scale only, delta only, neither.
// NOTE(review): the selecting conditions are not visible; presumably they follow
// haveScale / haveDelta, matching the HAVE_SCALE / HAVE_DELTA build defines - confirm.
1347 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
1349 k.args(srcarg, maskarg, dstarg, fscale);
1354 k.args(srcarg, maskarg, dstarg, fdelta);
1356 k.args(srcarg, maskarg, dstarg);
// Global size: one item per column, and rows divided by rowsPerWI, rounded up.
1359 size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
1360 return k.run(2, globalsize, NULL, false);
// Remaining case: convert into a temporary, then copy into _dst through the mask.
1365 src.convertTo( temp, dtype, scale, delta );
1366 temp.copyTo( _dst, _mask );
1371 #endif // HAVE_OPENCL
// Scales and shifts _src into _dst as dst = src*scale + shift, converted to depth rtype.
//   norm_type == CV_MINMAX       : scale is (dmax - dmin) / (smax - smin), where
//                                  [dmin, dmax] is the ordered pair (a, b) and
//                                  [smin, smax] is the source range reported by minMaxIdx.
//   norm_type == CV_L2/CV_L1/CV_C: scale is a / norm(_src, norm_type, _mask); shift stays 0.
//   any other norm_type          : raises CV_StsBadArg.
//   _mask                        : forwarded to minMaxIdx / norm and to the final copy.
// NOTE(review): some guards and declarations are on lines not visible in this listing.
1373 void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
1374 int norm_type, int rtype, InputArray _mask)
1376 CV_INSTRUMENT_REGION();
1378 double scale = 1, shift = 0;
1379 int type = _src.type(), depth = CV_MAT_DEPTH(type);
// Output depth: the destination's depth when its type is fixed, otherwise the source depth.
// NOTE(review): the guard for this assignment is not visible; presumably it applies only
// when the caller passed no explicit rtype - confirm.
1382 rtype = _dst.fixedType() ? _dst.depth() : depth;
1384 if( norm_type == CV_MINMAX )
1386 double smin = 0, smax = 0;
// a and b may be given in either order.
1387 double dmin = MIN( a, b ), dmax = MAX( a, b );
1388 minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
// A source range no wider than DBL_EPSILON yields scale 0 instead of dividing by it.
1389 scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
// For CV_32F output, scale and shift are rounded through float before use.
1390 if( rtype == CV_32F )
1392 scale = (float)scale;
1393 shift = (float)dmin - (float)(smin*scale);
// Otherwise the shift is computed in double precision.
1396 shift = dmin - smin*scale;
1398 else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
1400 scale = norm( _src, norm_type, _mask );
// A norm no larger than DBL_EPSILON yields scale 0 instead of dividing by it.
1401 scale = scale > DBL_EPSILON ? a/scale : 0.;
1405 CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
// Try the OpenCL implementation when the destination is a UMat.
1407 CV_OCL_RUN(_dst.isUMat(),
1408 ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
1410 Mat src = _src.getMat();
// Two output paths follow: a direct convertTo into _dst, or a convertTo into a temporary
// followed by a masked copy. NOTE(review): the condition choosing between them is not
// visible; presumably it tests whether _mask is empty - confirm.
1412 src.convertTo( _dst, rtype, scale, shift );
1416 src.convertTo( temp, rtype, scale, shift );
1417 temp.copyTo( _dst, _mask );