1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
7 #include "opencl_kernels_core.hpp"
10 #include "count_non_zero.simd.hpp"
11 #include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
15 static CountNonZeroFunc getCountNonZeroTab(int depth)
17 CV_INSTRUMENT_REGION();
18 CV_CPU_DISPATCH(getCountNonZeroTab, (depth),
19 CV_CPU_DISPATCH_MODES_ALL);
23 static bool ocl_countNonZero( InputArray _src, int & res )
25 int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
26 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
28 if (depth == CV_64F && !doubleSupport)
31 int dbsize = ocl::Device::getDefault().maxComputeUnits();
32 size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
35 while (wgs2_aligned < (int)wgs)
39 ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
40 format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
41 " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
42 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
43 ocl::typeToStr(depth), (int)wgs, kercn,
44 wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
45 _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
49 UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
50 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
51 dbsize, ocl::KernelArg::PtrWriteOnly(db));
53 size_t globalsize = dbsize * wgs;
54 if (k.run(1, &globalsize, &wgs, true))
55 return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
61 static bool ipp_countNonZero( Mat &src, int &res )
63 CV_INSTRUMENT_REGION_IPP();
65 // see https://github.com/opencv/opencv/issues/17453
66 if (src.dims <= 2 && src.step > 520000 && cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
69 #if IPP_VERSION_X100 < 201801
70 // Poor performance of SSE42
71 if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
76 int depth = src.depth();
81 IppiSize size = {src.cols*src.channels(), src.rows};
84 status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
85 else if(depth == CV_32F)
86 status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
93 res = size.width*size.height - count;
98 const Mat *arrays[] = {&src, NULL};
100 NAryMatIterator it(arrays, planes, 1);
101 IppiSize size = {(int)it.size*src.channels(), 1};
103 for (size_t i = 0; i < it.nplanes; i++, ++it)
106 status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
107 else if(depth == CV_32F)
108 status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
112 if(status < 0 || (int)it.planes->total()*src.channels() < count)
115 res += (int)it.planes->total()*src.channels() - count;
123 int countNonZero(InputArray _src)
125 CV_INSTRUMENT_REGION();
127 int type = _src.type(), cn = CV_MAT_CN(type);
128 CV_Assert( cn == 1 );
130 #if defined HAVE_OPENCL || defined HAVE_IPP
135 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
136 ocl_countNonZero(_src, res),
140 Mat src = _src.getMat();
141 CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
143 CountNonZeroFunc func = getCountNonZeroTab(src.depth());
144 CV_Assert( func != 0 );
146 const Mat* arrays[] = {&src, 0};
148 NAryMatIterator it(arrays, ptrs);
149 int total = (int)it.size, nz = 0;
151 for( size_t i = 0; i < it.nplanes; i++, ++it )
152 nz += func( ptrs[0], total );
157 void findNonZero(InputArray _src, OutputArray _idx)
159 Mat src = _src.getMat();
160 CV_Assert( src.channels() == 1 && src.dims == 2 );
162 int depth = src.depth();
163 std::vector<Point> idxvec;
164 int rows = src.rows, cols = src.cols;
165 AutoBuffer<int> buf_(cols + 1);
166 int* buf = buf_.data();
168 for( int i = 0; i < rows; i++ )
171 const uchar* ptr8 = src.ptr(i);
172 if( depth == CV_8U || depth == CV_8S )
174 for( j = 0; j < cols; j++ )
175 if( ptr8[j] != 0 ) buf[k++] = j;
177 else if( depth == CV_16U || depth == CV_16S )
179 const ushort* ptr16 = (const ushort*)ptr8;
180 for( j = 0; j < cols; j++ )
181 if( ptr16[j] != 0 ) buf[k++] = j;
183 else if( depth == CV_32S )
185 const int* ptr32s = (const int*)ptr8;
186 for( j = 0; j < cols; j++ )
187 if( ptr32s[j] != 0 ) buf[k++] = j;
189 else if( depth == CV_32F )
191 const float* ptr32f = (const float*)ptr8;
192 for( j = 0; j < cols; j++ )
193 if( ptr32f[j] != 0 ) buf[k++] = j;
197 const double* ptr64f = (const double*)ptr8;
198 for( j = 0; j < cols; j++ )
199 if( ptr64f[j] != 0 ) buf[k++] = j;
204 size_t sz = idxvec.size();
205 idxvec.resize(sz + k);
206 for( j = 0; j < k; j++ )
207 idxvec[sz + j] = Point(buf[j], i);
211 if( idxvec.empty() || (_idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous()) )
214 if( !idxvec.empty() )
215 Mat(idxvec).copyTo(_idx);