1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
7 #include "opencl_kernels_core.hpp"
8 #include "opencv2/core/openvx/ovx_defs.hpp"
// IPP-accelerated backend for cv::mean(). Returns true when IPP handled the
// computation (result in 'ret'); a false return lets the caller fall back to
// the generic implementation.
// NOTE(review): this view of the file is elided (non-contiguous original line
// numbers) — braces, early returns and fallthrough cases between the shown
// lines are not visible here.
14 static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
16 CV_INSTRUMENT_REGION_IPP()
18 #if IPP_VERSION_X100 >= 700
19 size_t total_size = src.total();
20 int cn = src.channels();
// Collapse an N-dim continuous matrix into a rows x cols 2D view so the 2D
// IPP primitives can process it in one call.
23 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
24 if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
26 IppiSize sz = { cols, rows };
27 int type = src.type();
// --- Masked path, 1-channel: dispatch on depth to ippiMean_*_C1MR. ---
30 typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
31 ippiMaskMeanFuncC1 ippiMean_C1MR =
32 type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
33 type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
34 type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
39 if( CV_INSTRUMENT_FUN_IPP(ippiMean_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
// --- Masked path, 3-channel: the C3CMR variants take a 1-based channel-of-
// interest index, so the mean is computed one channel at a time. ---
45 typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
46 ippiMaskMeanFuncC3 ippiMean_C3MR =
47 type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
48 type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
49 type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
53 Ipp64f res1, res2, res3;
// All three per-channel calls must succeed before the result is committed.
54 if( CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
55 CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
56 CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
58 ret = Scalar(res1, res2, res3);
// --- Unmasked path: some depths use a "hint" variant (accuracy/speed
// algorithm selector), others a plain variant; exactly one pointer may be set.
65 typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
66 typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
67 ippiMeanFuncHint ippiMeanHint =
68 type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
69 type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
70 type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
72 ippiMeanFuncNoHint ippiMean =
73 type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
74 type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
75 type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
76 type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
77 type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
78 type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
79 type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
80 type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
81 type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
83 // Make sure only zero or one version of the function pointer is valid
84 CV_Assert(!ippiMeanHint || !ippiMean);
85 if( ippiMeanHint || ippiMean )
// The hint variant is called with ippAlgHintAccurate to favor precision.
88 IppStatus status = ippiMeanHint ? CV_INSTRUMENT_FUN_IPP(ippiMeanHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
89 CV_INSTRUMENT_FUN_IPP(ippiMean, src.ptr(), (int)src.step[0], sz, res);
// Copy the per-channel results into the Scalar (loop body elided in this view).
92 for( int i = 0; i < cn; i++ )
// Generic implementation of cv::mean(): per-channel arithmetic mean of src
// over the (optional) 8-bit mask. Tries the IPP fast path first via CV_IPP_RUN.
// NOTE(review): elided view — braces and several statements between the shown
// lines are missing here.
107 cv::Scalar cv::mean( InputArray _src, InputArray _mask )
109 CV_INSTRUMENT_REGION()
111 Mat src = _src.getMat(), mask = _mask.getMat();
112 CV_Assert( mask.empty() || mask.type() == CV_8U );
114 int k, cn = src.channels(), depth = src.depth();
// Delegate to the IPP backend when available; on success 's' is returned.
117 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_mean(src, mask, s), s)
119 SumFunc func = getSumFunc(depth);
121 CV_Assert( cn <= 4 && func != 0 );
// NAryMatIterator walks src and mask plane-by-plane so non-continuous and
// multi-dimensional matrices are handled uniformly.
123 const Mat* arrays[] = {&src, &mask, 0};
125 NAryMatIterator it(arrays, ptrs);
126 int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
128 AutoBuffer<int> _buf;
129 int* buf = (int*)&s[0];
// For depths up to 16S the per-block sums are accumulated in int and flushed
// into the double accumulator periodically to avoid int overflow.
130 bool blockSum = depth <= CV_16S;
131 size_t esz = 0, nz0 = 0;
// Block size bound: 2^23 elements for 8-bit data, 2^15 for 16-bit, chosen so
// an int accumulator cannot overflow within one block.
135 intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
136 blockSize = std::min(blockSize, intSumBlockSize);
140 for( k = 0; k < cn; k++ )
142 esz = src.elemSize();
145 for( size_t i = 0; i < it.nplanes; i++, ++it )
147 for( j = 0; j < total; j += blockSize )
149 int bsz = std::min(total - j, blockSize);
// func returns the number of counted (unmasked) pixels in this block.
150 int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
// Flush the int block sums into the double accumulator when the block budget
// is about to be exceeded, or on the very last block.
153 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
155 for( k = 0; k < cn; k++ )
// Mean = sum / count; an all-masked input (nz0 == 0) yields Scalar(0).
167 return s*(nz0 ? 1./nz0 : 0);
170 //==================================================================================================
// Primary template of the SIMD sum/sum-of-squares helper: the generic version
// processes nothing (returns before any element, per the specializations'
// pattern) so scalar code handles everything; SSE2 specializations below
// override it for uchar/schar. (struct header line elided in this view.)
174 template <typename T, typename ST, typename SQT>
177 int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
// Fold an 8-element lane buffer (buf[0..3] = sums, buf[4..7] = squared sums)
// into the per-channel accumulators; the i % cn indexing interleaves the four
// lanes across cn channels (cn must divide the lane layout — callers pass
// cn == 1 or 2). Closing braces elided in this view.
183 template <typename T>
184 inline void addSqrChannels(T * sum, T * sqsum, T * buf, int cn)
186 for (int i = 0; i < 4; ++i)
188 sum[i % cn] += buf[i];
189 sqsum[i % cn] += buf[4 + i];
// SSE2 specialization for 8-bit unsigned data: vectorized sum and
// sum-of-squares for cn == 1 or 2, unmasked only. Returns the number of
// elements processed (remaining tail is handled by the scalar caller).
// NOTE(review): elided view — early-return body, x declaration and closing
// braces between the shown lines are not visible.
196 struct SumSqr_SIMD<uchar, int, int>
198 int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
// Bail out to scalar code for masked input, unsupported channel counts, or
// when SSE2 is unavailable.
200 if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
204 __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
205 const int len_16 = len & ~15;
207 for ( ; x <= len_16 - 16; )
// Inner chunks of at most 2048 elements keep the 16-bit partial sums
// (v_sum_tmp) from overflowing before they are widened to 32 bits.
209 const int len_tmp = min(x + 2048, len_16);
210 __m128i v_sum_tmp = v_zero;
211 for ( ; x <= len_tmp - 16; x += 16)
213 __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
// Zero-extend the 16 bytes into two 8x16-bit halves.
214 __m128i v_half_0 = _mm_unpacklo_epi8(v_src, v_zero);
215 __m128i v_half_1 = _mm_unpackhi_epi8(v_src, v_zero);
216 v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
// madd(v,v) multiplies adjacent 16-bit lanes by themselves and adds pairs,
// producing 32-bit squared sums.
217 __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
218 __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
219 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
220 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
// Widen the 16-bit chunk sums to 32 bits and accumulate.
222 v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
223 v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_sum_tmp, v_zero));
// 8-element tail loop (same scheme on a single loaded half).
226 for ( ; x <= len - 8; x += 8)
228 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
229 __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
230 __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
231 __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);
233 v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
234 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
// Spill both vectors to an aligned scratch array and fold into sum/sqsum.
237 int CV_DECL_ALIGNED(16) ar[8];
238 _mm_store_si128((__m128i*)ar, v_sum);
239 _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
241 addSqrChannels(sum, sqsum, ar, cn);
// SSE2 specialization for 8-bit signed data — same structure as the uchar
// version, but widening is sign-preserving: unpack into the HIGH byte/word and
// arithmetic-shift right, instead of zero-extension.
// NOTE(review): elided view — early-return body, x declaration and closing
// braces between the shown lines are not visible.
248 struct SumSqr_SIMD<schar, int, int>
250 int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
252 if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
256 __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
257 const int len_16 = len & ~15;
259 for ( ; x <= len_16 - 16; )
// 2048-element sub-chunks keep the signed 16-bit partial sums in range.
261 const int len_tmp = min(x + 2048, len_16);
262 __m128i v_sum_tmp = v_zero;
263 for ( ; x <= len_tmp - 16; x += 16)
265 __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
// unpack into high byte then >>8 arithmetically = sign-extend s8 -> s16.
266 __m128i v_half_0 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
267 __m128i v_half_1 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
268 v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
269 __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
270 __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
// Squares are non-negative, so madd accumulates them safely in 32 bits.
271 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
272 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
// Sign-extend s16 -> s32 (unpack into high word, then >>16) and accumulate.
274 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
275 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_sum_tmp), 16));
// 8-element tail loop.
278 for ( ; x <= len - 8; x += 8)
280 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
281 __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
282 __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
283 __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);
285 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
286 v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
// Spill lanes and fold into the per-channel accumulators.
289 int CV_DECL_ALIGNED(16) ar[8];
290 _mm_store_si128((__m128i*)ar, v_sum);
291 _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
293 addSqrChannels(sum, sqsum, ar, cn);
// Scalar core of meanStdDev accumulation: adds per-channel sum and
// sum-of-squares of 'len' pixels into sum[]/sqsum[], returning the count of
// accumulated pixels. The SIMD functor handles a vectorizable prefix (i);
// the scalar loops below finish the remainder, with unrolled cases for
// cn == 1/2/3 and a 4-at-a-time loop for larger channel counts; the second
// half of the function is the masked variant.
// NOTE(review): elided view — the if(!mask) dispatch, src pointer setup,
// masked-pixel tests and several braces are not visible between these lines.
301 template<typename T, typename ST, typename SQT>
302 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
308 SumSqr_SIMD<T, ST, SQT> vop;
// i = number of elements already consumed by the SIMD path; k = cn % 4 picks
// the unrolled case below.
309 int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
// --- unmasked, cn % 4 == 1 ---
316 for( ; i < len; i++, src += cn )
319 s0 += v; sq0 += (SQT)v*v;
// --- unmasked, cn % 4 == 2 ---
326 ST s0 = sum[0], s1 = sum[1];
327 SQT sq0 = sqsum[0], sq1 = sqsum[1];
328 for( ; i < len; i++, src += cn )
330 T v0 = src[0], v1 = src[1];
331 s0 += v0; sq0 += (SQT)v0*v0;
332 s1 += v1; sq1 += (SQT)v1*v1;
334 sum[0] = s0; sum[1] = s1;
335 sqsum[0] = sq0; sqsum[1] = sq1;
// --- unmasked, cn % 4 == 3 ---
339 ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
340 SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
341 for( ; i < len; i++, src += cn )
343 T v0 = src[0], v1 = src[1], v2 = src[2];
344 s0 += v0; sq0 += (SQT)v0*v0;
345 s1 += v1; sq1 += (SQT)v1*v1;
346 s2 += v2; sq2 += (SQT)v2*v2;
348 sum[0] = s0; sum[1] = s1; sum[2] = s2;
349 sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
// --- unmasked, remaining channels in groups of 4 ---
352 for( ; k < cn; k += 4 )
355 ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
356 SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
357 for( ; i < len; i++, src += cn )
360 v0 = src[0], v1 = src[1];
361 s0 += v0; sq0 += (SQT)v0*v0;
362 s1 += v1; sq1 += (SQT)v1*v1;
363 v0 = src[2], v1 = src[3];
364 s2 += v0; sq2 += (SQT)v0*v0;
365 s3 += v1; sq3 += (SQT)v1*v1;
367 sum[k] = s0; sum[k+1] = s1;
368 sum[k+2] = s2; sum[k+3] = s3;
369 sqsum[k] = sq0; sqsum[k+1] = sq1;
370 sqsum[k+2] = sq2; sqsum[k+3] = sq3;
// --- masked path (mask test per pixel elided in this view), cn == 1 ---
381 for( i = 0; i < len; i++ )
385 s0 += v; sq0 += (SQT)v*v;
// --- masked path, cn == 3 ---
393 ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
394 SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
395 for( i = 0; i < len; i++, src += 3 )
398 T v0 = src[0], v1 = src[1], v2 = src[2];
399 s0 += v0; sq0 += (SQT)v0*v0;
400 s1 += v1; sq1 += (SQT)v1*v1;
401 s2 += v2; sq2 += (SQT)v2*v2;
404 sum[0] = s0; sum[1] = s1; sum[2] = s2;
405 sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
// --- masked path, generic channel count ---
409 for( i = 0; i < len; i++, src += cn )
412 for( int k = 0; k < cn; k++ )
416 SQT sq = sqsum[k] + (SQT)v*v;
417 sum[k] = s; sqsum[k] = sq;
426 static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
427 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
429 static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
430 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
432 static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
433 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
435 static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
436 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
438 static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
439 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
441 static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
442 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
444 static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
445 { return sumsqr_(src, mask, sum, sqsum, len, cn); }
// Type-erased kernel signature: (src, mask, sum, sqsum, len, cn) -> pixel count.
447 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
// Depth-indexed dispatch table for the sqsum* kernels; the trailing 0 entry
// covers unsupported depths. Indexing follows the CV_8U..CV_64F depth codes.
// (Table braces elided in this view.)
449 static SumSqrFunc getSumSqrTab(int depth)
451 static SumSqrFunc sumSqrTab[] =
453 (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
454 (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
457 return sumSqrTab[depth];
// OpenCL backend for cv::meanStdDev(): a reduction kernel produces per-group
// partial sums / squared sums (and masked pixel counts), which are reduced on
// the host and converted to mean and stddev. Returns false to fall back to
// the CPU path.
// NOTE(review): elided view — several statements (e.g. the wgs2_aligned shift,
// cvt declaration, kernel.empty() checks, braces) are not visible here.
461 static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
463 CV_INSTRUMENT_REGION_OPENCL()
465 bool haveMask = _mask.kind() != _InputArray::NONE;
// Without a mask the pixel count is known up front; with a mask it is
// computed by the kernel (-1 = not yet known).
466 int nz = haveMask ? -1 : (int)_src.total();
467 Scalar mean(0), stddev(0);
468 const int cn = _src.channels();
473 int type = _src.type(), depth = CV_MAT_DEPTH(type);
474 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
475 isContinuous = _src.isContinuous(),
476 isMaskContinuous = _mask.isContinuous();
477 const ocl::Device &defDev = ocl::Device::getDefault();
478 int groups = defDev.maxComputeUnits();
// Intel-specific heuristic: scale work-group count by sub-slice EU count.
479 if (defDev.isIntel())
481 static const int subSliceEUCount = 10;
482 groups = (groups / subSliceEUCount) * 2;
484 size_t wgs = defDev.maxWorkGroupSize();
// Accumulator types: at least 32-bit int for sums, at least float for
// squared sums, widened to the source depth when it is larger.
486 int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
487 dtype = CV_MAKE_TYPE(ddepth, cn),
488 sqdtype = CV_MAKETYPE(sqddepth, cn);
489 CV_Assert(!haveMask || _mask.type() == CV_8UC1);
// Round the work-group size down to a power of two for the reduction.
491 int wgs2_aligned = 1;
492 while (wgs2_aligned < (int)wgs)
// CV_64F requires device double support; otherwise bail out to CPU.
496 if ( (!doubleSupport && depth == CV_64F) )
// Build-time options parameterize the kernel for type, channel count,
// continuity, mask presence and double support.
500 String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
501 " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
502 " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
503 ocl::typeToStr(type), ocl::typeToStr(depth),
504 ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
505 ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
506 ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
507 cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
508 isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
509 ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
510 (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
511 doubleSupport ? " -D DOUBLE_SUPPORT" : "");
513 ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
// Scratch buffer layout per group: [sums][sqsums][pixel count if masked].
517 int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
518 CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
519 UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
521 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
522 dbarg = ocl::KernelArg::PtrWriteOnly(db),
523 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
// Masked and unmasked variants differ only in the trailing mask argument.
526 k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
528 k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);
530 size_t globalsize = groups * wgs;
532 if(!k.run(1, &globalsize, &wgs, false))
// Host-side reduction of the per-group partials, dispatched on accumulator
// depth (int/float/double).
535 typedef Scalar (* part_sum)(Mat m);
536 part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
537 Mat dbm = db.getMat(ACCESS_READ);
539 mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
540 stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));
// With a mask, the true pixel count comes from the kernel's counters.
543 nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
544 groups * (CV_ELEM_SIZE(dtype) +
545 CV_ELEM_SIZE(sqdtype))))[0]);
548 double total = nz != 0 ? 1.0 / nz : 0;
// stddev = sqrt(E[x^2] - E[x]^2), clamped at 0 against rounding error.
550 for (int i = 0; i < cn; ++i)
553 stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
// Write mean (j == 0) then stddev (j == 1) into the caller's output arrays,
// creating a cn x 1 CV_64F array when the destination is not fixed-size and
// zero-padding any extra destination channels.
556 for( j = 0; j < 2; j++ )
558 const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
559 _OutputArray _dst = j == 0 ? _mean : _sdv;
563 if( !_dst.fixedSize() )
564 _dst.create(cn, 1, CV_64F, -1, true);
565 Mat dst = _dst.getMat();
566 int dcn = (int)dst.total();
567 CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
568 (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
569 double* dptr = dst.ptr<double>();
570 for( k = 0; k < cn; k++ )
572 for( ; k < dcn; k++ )
// OpenVX backend for cv::meanStdDev(); only handles unmasked CV_8UC1 input
// (2D, or continuous and reshapeable to 2D). Returns false to fall back.
// NOTE(review): elided view — the early "return false", try block, image
// release and final return are not visible between these lines.
581 static bool openvx_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
583 size_t total_size = src.total();
584 int rows = src.size[0], cols = rows ? (int)(total_size / rows) : 0;
// Reject everything the vxuMeanStdDev node cannot represent.
585 if (src.type() != CV_8UC1|| !mask.empty() ||
586 (src.dims != 2 && !(src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size))
592 ivx::Context ctx = ovx::getOpenVXContext();
593 #ifndef VX_VERSION_1_1
594 if (ctx.vendorID() == VX_ID_KHRONOS)
595 return false; // Do not use OpenVX meanStdDev estimation for sample 1.0.1 implementation due to lack of accuracy
// Wrap the existing Mat memory as a VX image (no copy) and run the kernel.
599 ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
600 ivx::Image::createAddressing(cols, rows, 1, (vx_int32)(src.step[0])), src.ptr());
602 vx_float32 mean_temp, stddev_temp;
603 ivx::IVX_CHECK_STATUS(vxuMeanStdDev(ctx, ia, &mean_temp, &stddev_temp));
// Publish the scalar mean into _mean (1x1 CV_64F unless fixed-size), zeroing
// any extra output elements (loop body elided in this view).
607 if (!_mean.fixedSize())
608 _mean.create(1, 1, CV_64F, -1, true);
609 Mat mean = _mean.getMat();
610 CV_Assert(mean.type() == CV_64F && mean.isContinuous() &&
611 (mean.cols == 1 || mean.rows == 1) && mean.total() >= 1);
612 double *pmean = mean.ptr<double>();
613 pmean[0] = mean_temp;
614 for (int c = 1; c < (int)mean.total(); c++)
// Same for the standard deviation output.
620 if (!_sdv.fixedSize())
621 _sdv.create(1, 1, CV_64F, -1, true);
622 Mat stddev = _sdv.getMat();
623 CV_Assert(stddev.type() == CV_64F && stddev.isContinuous() &&
624 (stddev.cols == 1 || stddev.rows == 1) && stddev.total() >= 1);
625 double *pstddev = stddev.ptr<double>();
626 pstddev[0] = stddev_temp;
627 for (int c = 1; c < (int)stddev.total(); c++)
// OpenVX wrapper errors are converted into OpenCV debug throws.
631 catch (ivx::RuntimeError & e)
633 VX_DbgThrow(e.what());
635 catch (ivx::WrapperError & e)
637 VX_DbgThrow(e.what());
// IPP backend for cv::meanStdDev(). Writes results directly into the caller's
// output arrays (or local temporaries when an output is not requested).
// Returns true when IPP handled the computation.
// NOTE(review): elided view — braces, returns, output-presence checks and the
// mean_temp declaration are not visible between these lines.
645 static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
647 CV_INSTRUMENT_REGION_IPP()
649 #if IPP_VERSION_X100 >= 700
650 int cn = src.channels();
652 #if IPP_VERSION_X100 < 201801
653 // IPP_DISABLE: C3C functions can read outside of allocated memory
// Flatten continuous N-dim input to a 2D view for the IPP primitives.
658 size_t total_size = src.total();
659 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
660 if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
// pmean/pstddev start pointing at local temporaries and are redirected into
// the caller's buffers below when the respective output is needed.
663 Ipp64f stddev_temp[3];
664 Ipp64f *pmean = &mean_temp[0];
665 Ipp64f *pstddev = &stddev_temp[0];
670 if( !_mean.fixedSize() )
671 _mean.create(cn, 1, CV_64F, -1, true);
672 mean = _mean.getMat();
673 dcn_mean = (int)mean.total();
674 pmean = mean.ptr<Ipp64f>();
679 if( !_sdv.fixedSize() )
680 _sdv.create(cn, 1, CV_64F, -1, true);
681 stddev = _sdv.getMat();
682 dcn_stddev = (int)stddev.total();
683 pstddev = stddev.ptr<Ipp64f>();
// Zero any destination elements beyond the source channel count.
685 for( int c = cn; c < dcn_mean; c++ )
687 for( int c = cn; c < dcn_stddev; c++ )
689 IppiSize sz = { cols, rows };
690 int type = src.type();
// --- Masked 1-channel path: ippiMean_StdDev_*_C1MR by depth. ---
693 typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
694 ippiMaskMeanStdDevFuncC1 ippiMean_StdDev_C1MR =
695 type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
696 type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
697 type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
699 if( ippiMean_StdDev_C1MR )
701 if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
// --- Masked 3-channel path: C3CMR processes one channel-of-interest (1-based)
// per call, so all three calls must succeed. ---
706 typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
707 ippiMaskMeanStdDevFuncC3 ippiMean_StdDev_C3CMR =
708 type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
709 type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
710 type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
712 if( ippiMean_StdDev_C3CMR )
714 if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
715 CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
716 CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
// --- Unmasked 1-channel path. 32F excluded before IPP 8.1 (known bug, see
// comment below). ---
724 typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
725 ippiMeanStdDevFuncC1 ippiMean_StdDev_C1R =
726 type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
727 type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
728 #if (IPP_VERSION_X100 >= 810)
729 type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
732 if( ippiMean_StdDev_C1R )
734 if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1R, src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
// --- Unmasked 3-channel path, again channel-by-channel. ---
739 typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
740 ippiMeanStdDevFuncC3 ippiMean_StdDev_C3CR =
741 type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
742 type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
743 type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
745 if( ippiMean_StdDev_C3CR )
747 if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
748 CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
749 CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
// Non-IPP builds: silence unused-parameter warnings.
757 CV_UNUSED(src); CV_UNUSED(_mean); CV_UNUSED(_sdv); CV_UNUSED(mask);
// Public cv::meanStdDev(): per-channel mean and standard deviation of src over
// the optional 8-bit mask. Tries, in order, the OpenCL, OpenVX and IPP
// backends before the generic block-accumulation loop below.
// NOTE(review): elided view — the definition continues past the last visible
// line, and braces / several statements between the shown lines are missing.
765 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
767 CV_INSTRUMENT_REGION()
769 CV_Assert(!_src.empty());
770 CV_Assert( _mask.empty() || _mask.type() == CV_8UC1 );
// Accelerated backends: each macro runs its condition + backend and returns
// early on success.
772 CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
773 ocl_meanStdDev(_src, _mean, _sdv, _mask))
775 Mat src = _src.getMat(), mask = _mask.getMat();
777 CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
778 openvx_meanStdDev(src, _mean, _sdv, mask))
780 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask));
782 int k, cn = src.channels(), depth = src.depth();
784 SumSqrFunc func = getSumSqrTab(depth);
786 CV_Assert( func != 0 );
788 const Mat* arrays[] = {&src, &mask, 0};
790 NAryMatIterator it(arrays, ptrs);
791 int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
792 int j, count = 0, nz0 = 0;
// Scratch layout: s[cn] running sums, sq[cn] running squared sums (doubles),
// with int aliases used for the overflow-safe block accumulation below.
793 AutoBuffer<double> _buf(cn*4);
794 double *s = (double*)_buf.data(), *sq = s + cn;
795 int *sbuf = (int*)s, *sqbuf = (int*)sq;
// Sums fit in int for depths up to 16S; squared sums only up to 8S.
796 bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
799 for( k = 0; k < cn; k++ )
// 2^15-element blocks keep the int accumulators from overflowing.
804 intSumBlockSize = 1 << 15;
805 blockSize = std::min(blockSize, intSumBlockSize);
806 sbuf = (int*)(sq + cn);
809 for( k = 0; k < cn; k++ )
810 sbuf[k] = sqbuf[k] = 0;
811 esz = src.elemSize();
// Plane-by-plane, block-by-block accumulation via the depth-specific kernel.
814 for( size_t i = 0; i < it.nplanes; i++, ++it )
816 for( j = 0; j < total; j += blockSize )
818 int bsz = std::min(total - j, blockSize);
819 int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
// Flush int block sums into the double accumulators before they can
// overflow, and on the final block.
822 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
824 for( k = 0; k < cn; k++ )
831 for( k = 0; k < cn; k++ )
// mean = sum/count; stddev = sqrt(E[x^2] - mean^2), clamped at 0 to absorb
// floating-point rounding.
845 double scale = nz0 ? 1./nz0 : 0.;
846 for( k = 0; k < cn; k++ )
849 sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
// Write mean (j == 0) then stddev (j == 1) into the caller's outputs,
// creating cn x 1 CV_64F arrays when not fixed-size and zero-padding any
// extra destination elements.
852 for( j = 0; j < 2; j++ )
854 const double* sptr = j == 0 ? s : sq;
855 _OutputArray _dst = j == 0 ? _mean : _sdv;
859 if( !_dst.fixedSize() )
860 _dst.create(cn, 1, CV_64F, -1, true);
861 Mat dst = _dst.getMat();
862 int dcn = (int)dst.total();
863 CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
864 (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
865 double* dptr = dst.ptr<double>();
866 for( k = 0; k < cn; k++ )
868 for( ; k < dcn; k++ )