opencv.git: modules/imgproc/src/thresh.cpp (instrumentation for OpenCV API regions and IPP functions)
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "precomp.hpp"
44 #include "opencl_kernels_imgproc.hpp"
45
46 #if CV_NEON && defined(__aarch64__)
47 #include <arm_neon.h>
48 namespace cv {
49 // Workaround for missing definitions of vreinterpretq_u64_f64/vreinterpretq_f64_u64
50 template <typename T> static inline
51 uint64x2_t vreinterpretq_u64_f64(T a)
52 {
53     return (uint64x2_t) a;
54 }
55 template <typename T> static inline
56 float64x2_t vreinterpretq_f64_u64(T a)
57 {
58     return (float64x2_t) a;
59 }
60 } // namespace cv
61 #endif
62
63 namespace cv
64 {
65
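// thresh_8u: thresholding kernel for 8-bit data. It tries the Tegra and IPP
// fast paths first, then SSE2/NEON vectorized loops, and finishes the columns
// those loops did not cover (or the whole image) through a 256-entry LUT.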
66 static void
67 thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
68 {
69     Size roi = _src.size();
70     roi.width *= _src.channels();
71     size_t src_step = _src.step;
72     size_t dst_step = _dst.step;
73
74     if( _src.isContinuous() && _dst.isContinuous() )
75     {
76         roi.width *= roi.height;
77         roi.height = 1;
78         src_step = dst_step = roi.width;
79     }
80
81 #ifdef HAVE_TEGRA_OPTIMIZATION
82     if (tegra::useTegra() && tegra::thresh_8u(_src, _dst, roi.width, roi.height, thresh, maxval, type))
83         return;
84 #endif
85
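    // Only THRESH_TRUNC, THRESH_TOZERO and THRESH_TOZERO_INV have IPP fast paths
    // here (in-place variants are used when src and dst share the same buffer);
    // THRESH_BINARY / THRESH_BINARY_INV fall through to the SIMD/scalar code below.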
86 #if defined(HAVE_IPP)
87     CV_IPP_CHECK()
88     {
89         IppiSize sz = { roi.width, roi.height };
90         CV_SUPPRESS_DEPRECATED_START
91         switch( type )
92         {
93         case THRESH_TRUNC:
94             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_GT_8u_C1IR, _dst.ptr(), (int)dst_step, sz, thresh) >= 0)
95             {
96                 CV_IMPL_ADD(CV_IMPL_IPP);
97                 return;
98             }
99             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_GT_8u_C1R, _src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh) >= 0)
100             {
101                 CV_IMPL_ADD(CV_IMPL_IPP);
102                 return;
103             }
104             setIppErrorStatus();
105             break;
106         case THRESH_TOZERO:
107             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_8u_C1IR, _dst.ptr(), (int)dst_step, sz, thresh+1, 0) >= 0)
108             {
109                 CV_IMPL_ADD(CV_IMPL_IPP);
110                 return;
111             }
112             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_8u_C1R, _src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh + 1, 0) >= 0)
113             {
114                 CV_IMPL_ADD(CV_IMPL_IPP);
115                 return;
116             }
117             setIppErrorStatus();
118             break;
119         case THRESH_TOZERO_INV:
120             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_8u_C1IR, _dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
121             {
122                 CV_IMPL_ADD(CV_IMPL_IPP);
123                 return;
124             }
125             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_8u_C1R, _src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
126             {
127                 CV_IMPL_ADD(CV_IMPL_IPP);
128                 return;
129             }
130             setIppErrorStatus();
131             break;
132         }
133         CV_SUPPRESS_DEPRECATED_END
134     }
135 #endif
136
137     int j = 0;
138     const uchar* src = _src.ptr();
139     uchar* dst = _dst.ptr();
140 #if CV_SSE2
141     if( (roi.width >= 8) && checkHardwareSupport(CV_CPU_SSE2) )
142     {
143         __m128i _x80 = _mm_set1_epi8('\x80');
144         __m128i thresh_u = _mm_set1_epi8(thresh);
145         __m128i thresh_s = _mm_set1_epi8(thresh ^ 0x80);
146         __m128i maxval_ = _mm_set1_epi8(maxval);
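        // _mm_cmpgt_epi8 compares signed bytes, so both the pixels and the
        // threshold are XOR'ed with 0x80 to remap the unsigned range [0,255]
        // onto the signed range; the comparison then matches an unsigned '>'
        // against 'thresh'.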
147
148         switch( type )
149         {
150         case THRESH_BINARY:
151             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
152             {
153                 for( j = 0; j <= roi.width - 32; j += 32 )
154                 {
155                     __m128i v0, v1;
156                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
157                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
158                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
159                     v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
160                     v0 = _mm_and_si128( v0, maxval_ );
161                     v1 = _mm_and_si128( v1, maxval_ );
162                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
163                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
164                 }
165
166                 for( ; j <= roi.width - 8; j += 8 )
167                 {
168                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
169                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
170                     v0 = _mm_and_si128( v0, maxval_ );
171                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
172                 }
173             }
174             break;
175
176         case THRESH_BINARY_INV:
177             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
178             {
179                 for( j = 0; j <= roi.width - 32; j += 32 )
180                 {
181                     __m128i v0, v1;
182                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
183                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
184                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
185                     v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
186                     v0 = _mm_andnot_si128( v0, maxval_ );
187                     v1 = _mm_andnot_si128( v1, maxval_ );
188                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
189                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
190                 }
191
192                 for( ; j <= roi.width - 8; j += 8 )
193                 {
194                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
195                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
196                     v0 = _mm_andnot_si128( v0, maxval_ );
197                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
198                 }
199             }
200             break;
201
202         case THRESH_TRUNC:
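            // min(v, thresh) via unsigned saturating subtraction: the inner
            // subs gives max(v - thresh, 0); subtracting that back from v
            // yields min(v, thresh) without needing an unsigned compare.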
203             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
204             {
205                 for( j = 0; j <= roi.width - 32; j += 32 )
206                 {
207                     __m128i v0, v1;
208                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
209                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
210                     v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
211                     v1 = _mm_subs_epu8( v1, _mm_subs_epu8( v1, thresh_u ));
212                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
213                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
214                 }
215
216                 for( ; j <= roi.width - 8; j += 8 )
217                 {
218                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
219                     v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
220                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
221                 }
222             }
223             break;
224
225         case THRESH_TOZERO:
226             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
227             {
228                 for( j = 0; j <= roi.width - 32; j += 32 )
229                 {
230                     __m128i v0, v1;
231                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
232                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
233                     v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
234                     v1 = _mm_and_si128( v1, _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ));
235                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
236                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
237                 }
238
239                 for( ; j <= roi.width - 8; j += 8 )
240                 {
241                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
242                     v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
243                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
244                 }
245             }
246             break;
247
248         case THRESH_TOZERO_INV:
249             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
250             {
251                 for( j = 0; j <= roi.width - 32; j += 32 )
252                 {
253                     __m128i v0, v1;
254                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
255                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
256                     v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
257                     v1 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ), v1 );
258                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
259                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
260                 }
261
262                 for( ; j <= roi.width - 8; j += 8 )
263                 {
264                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
265                     v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
266                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
267                 }
268             }
269             break;
270         }
271     }
272 #elif CV_NEON
273     if( roi.width >= 16 )
274     {
275         uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval);
276
277         switch( type )
278         {
279         case THRESH_BINARY:
280             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
281             {
282                 for ( j = 0; j <= roi.width - 16; j += 16)
283                     vst1q_u8(dst + j, vandq_u8(vcgtq_u8(vld1q_u8(src + j), v_thresh), v_maxval));
284             }
285             break;
286
287         case THRESH_BINARY_INV:
288             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
289             {
290                 for ( j = 0; j <= roi.width - 16; j += 16)
291                     vst1q_u8(dst + j, vandq_u8(vcleq_u8(vld1q_u8(src + j), v_thresh), v_maxval));
292             }
293             break;
294
295         case THRESH_TRUNC:
296             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
297             {
298                 for ( j = 0; j <= roi.width - 16; j += 16)
299                     vst1q_u8(dst + j, vminq_u8(vld1q_u8(src + j), v_thresh));
300             }
301             break;
302
303         case THRESH_TOZERO:
304             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
305             {
306                 for ( j = 0; j <= roi.width - 16; j += 16)
307                 {
308                     uint8x16_t v_src = vld1q_u8(src + j), v_mask = vcgtq_u8(v_src, v_thresh);
309                     vst1q_u8(dst + j, vandq_u8(v_mask, v_src));
310                 }
311             }
312             break;
313
314         case THRESH_TOZERO_INV:
315             for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
316             {
317                 for ( j = 0; j <= roi.width - 16; j += 16)
318                 {
319                     uint8x16_t v_src = vld1q_u8(src + j), v_mask = vcleq_u8(v_src, v_thresh);
320                     vst1q_u8(dst + j, vandq_u8(v_mask, v_src));
321                 }
322             }
323             break;
324         }
325     }
326 #endif
327
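    // Scalar tail (and full fallback when no SIMD path ran): encode the
    // thresholding rule once in a 256-entry lookup table, then map the
    // remaining columns of every row through it.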
328     int j_scalar = j;
329     if( j_scalar < roi.width )
330     {
331         const int thresh_pivot = thresh + 1;
332         uchar tab[256];
333         switch( type )
334         {
335         case THRESH_BINARY:
336             memset(tab, 0, thresh_pivot);
337             if (thresh_pivot < 256) {
338                 memset(tab + thresh_pivot, maxval, 256 - thresh_pivot);
339             }
340             break;
341         case THRESH_BINARY_INV:
342             memset(tab, maxval, thresh_pivot);
343             if (thresh_pivot < 256) {
344                 memset(tab + thresh_pivot, 0, 256 - thresh_pivot);
345             }
346             break;
347         case THRESH_TRUNC:
348             for( int i = 0; i <= thresh; i++ )
349                 tab[i] = (uchar)i;
350             if (thresh_pivot < 256) {
351                 memset(tab + thresh_pivot, thresh, 256 - thresh_pivot);
352             }
353             break;
354         case THRESH_TOZERO:
355             memset(tab, 0, thresh_pivot);
356             for( int i = thresh_pivot; i < 256; i++ )
357                 tab[i] = (uchar)i;
358             break;
359         case THRESH_TOZERO_INV:
360             for( int i = 0; i <= thresh; i++ )
361                 tab[i] = (uchar)i;
362             if (thresh_pivot < 256) {
363                 memset(tab + thresh_pivot, 0, 256 - thresh_pivot);
364             }
365             break;
366         }
367
368         src = _src.ptr();
369         dst = _dst.ptr();
370         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
371         {
372             j = j_scalar;
373 #if CV_ENABLE_UNROLLED
374             for( ; j <= roi.width - 4; j += 4 )
375             {
376                 uchar t0 = tab[src[j]];
377                 uchar t1 = tab[src[j+1]];
378
379                 dst[j] = t0;
380                 dst[j+1] = t1;
381
382                 t0 = tab[src[j+2]];
383                 t1 = tab[src[j+3]];
384
385                 dst[j+2] = t0;
386                 dst[j+3] = t1;
387             }
388 #endif
389             for( ; j < roi.width; j++ )
390                 dst[j] = tab[src[j]];
391         }
392     }
393 }
394
395
396 static void
397 thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
398 {
399     int i, j;
400     Size roi = _src.size();
401     roi.width *= _src.channels();
402     const short* src = _src.ptr<short>();
403     short* dst = _dst.ptr<short>();
404     size_t src_step = _src.step/sizeof(src[0]);
405     size_t dst_step = _dst.step/sizeof(dst[0]);
406
407 #if CV_SSE2
408     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
409 #endif
410
411     if( _src.isContinuous() && _dst.isContinuous() )
412     {
413         roi.width *= roi.height;
414         roi.height = 1;
415         src_step = dst_step = roi.width;
416     }
417
418 #ifdef HAVE_TEGRA_OPTIMIZATION
419     if (tegra::useTegra() && tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
420         return;
421 #endif
422
423 #if defined(HAVE_IPP)
424     CV_IPP_CHECK()
425     {
426         IppiSize sz = { roi.width, roi.height };
427         CV_SUPPRESS_DEPRECATED_START
428         switch( type )
429         {
430         case THRESH_TRUNC:
431             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_GT_16s_C1IR, dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
432             {
433                 CV_IMPL_ADD(CV_IMPL_IPP);
434                 return;
435             }
436             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_GT_16s_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
437             {
438                 CV_IMPL_ADD(CV_IMPL_IPP);
439                 return;
440             }
441             setIppErrorStatus();
442             break;
443         case THRESH_TOZERO:
444             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_16s_C1IR, dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
445             {
446                 CV_IMPL_ADD(CV_IMPL_IPP);
447                 return;
448             }
449             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_16s_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
450             {
451                 CV_IMPL_ADD(CV_IMPL_IPP);
452                 return;
453             }
454             setIppErrorStatus();
455             break;
456         case THRESH_TOZERO_INV:
457             if (_src.data == _dst.data && CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_16s_C1IR, dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
458             {
459                 CV_IMPL_ADD(CV_IMPL_IPP);
460                 return;
461             }
462             if (CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_16s_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
463             {
464                 CV_IMPL_ADD(CV_IMPL_IPP);
465                 return;
466             }
467             setIppErrorStatus();
468             break;
469         }
470         CV_SUPPRESS_DEPRECATED_END
471     }
472 #endif
473
474     switch( type )
475     {
476     case THRESH_BINARY:
477         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
478         {
479             j = 0;
480         #if CV_SSE2
481             if( useSIMD )
482             {
483                 __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
484                 for( ; j <= roi.width - 16; j += 16 )
485                 {
486                     __m128i v0, v1;
487                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
488                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
489                     v0 = _mm_cmpgt_epi16( v0, thresh8 );
490                     v1 = _mm_cmpgt_epi16( v1, thresh8 );
491                     v0 = _mm_and_si128( v0, maxval8 );
492                     v1 = _mm_and_si128( v1, maxval8 );
493                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
494                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
495                 }
496             }
497         #elif CV_NEON
498             int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
499
500             for( ; j <= roi.width - 8; j += 8 )
501             {
502                 uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
503                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
504             }
505         #endif
506
507             for( ; j < roi.width; j++ )
508                 dst[j] = src[j] > thresh ? maxval : 0;
509         }
510         break;
511
512     case THRESH_BINARY_INV:
513         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
514         {
515             j = 0;
516         #if CV_SSE2
517             if( useSIMD )
518             {
519                 __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
520                 for( ; j <= roi.width - 16; j += 16 )
521                 {
522                     __m128i v0, v1;
523                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
524                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
525                     v0 = _mm_cmpgt_epi16( v0, thresh8 );
526                     v1 = _mm_cmpgt_epi16( v1, thresh8 );
527                     v0 = _mm_andnot_si128( v0, maxval8 );
528                     v1 = _mm_andnot_si128( v1, maxval8 );
529                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
530                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
531                 }
532             }
533         #elif CV_NEON
534             int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
535
536             for( ; j <= roi.width - 8; j += 8 )
537             {
538                 uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
539                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
540             }
541         #endif
542
543             for( ; j < roi.width; j++ )
544                 dst[j] = src[j] <= thresh ? maxval : 0;
545         }
546         break;
547
548     case THRESH_TRUNC:
549         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
550         {
551             j = 0;
552         #if CV_SSE2
553             if( useSIMD )
554             {
555                 __m128i thresh8 = _mm_set1_epi16(thresh);
556                 for( ; j <= roi.width - 16; j += 16 )
557                 {
558                     __m128i v0, v1;
559                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
560                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
561                     v0 = _mm_min_epi16( v0, thresh8 );
562                     v1 = _mm_min_epi16( v1, thresh8 );
563                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
564                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
565                 }
566             }
567         #elif CV_NEON
568             int16x8_t v_thresh = vdupq_n_s16(thresh);
569
570             for( ; j <= roi.width - 8; j += 8 )
571                 vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
572         #endif
573
574             for( ; j < roi.width; j++ )
575                 dst[j] = std::min(src[j], thresh);
576         }
577         break;
578
579     case THRESH_TOZERO:
580         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
581         {
582             j = 0;
583         #if CV_SSE2
584             if( useSIMD )
585             {
586                 __m128i thresh8 = _mm_set1_epi16(thresh);
587                 for( ; j <= roi.width - 16; j += 16 )
588                 {
589                     __m128i v0, v1;
590                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
591                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
592                     v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
593                     v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
594                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
595                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
596                 }
597             }
598         #elif CV_NEON
599             int16x8_t v_thresh = vdupq_n_s16(thresh);
600
601             for( ; j <= roi.width - 8; j += 8 )
602             {
603                 int16x8_t v_src = vld1q_s16(src + j);
604                 uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
605                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
606             }
607         #endif
608
609             for( ; j < roi.width; j++ )
610             {
611                 short v = src[j];
612                 dst[j] = v > thresh ? v : 0;
613             }
614         }
615         break;
616
617     case THRESH_TOZERO_INV:
618         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
619         {
620             j = 0;
621         #if CV_SSE2
622             if( useSIMD )
623             {
624                 __m128i thresh8 = _mm_set1_epi16(thresh);
625                 for( ; j <= roi.width - 16; j += 16 )
626                 {
627                     __m128i v0, v1;
628                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
629                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
630                     v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
631                     v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
632                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
633                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
634                 }
635             }
636         #elif CV_NEON
637             int16x8_t v_thresh = vdupq_n_s16(thresh);
638
639             for( ; j <= roi.width - 8; j += 8 )
640             {
641                 int16x8_t v_src = vld1q_s16(src + j);
642                 uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
643                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
644             }
645         #endif
646             for( ; j < roi.width; j++ )
647             {
648                 short v = src[j];
649                 dst[j] = v <= thresh ? v : 0;
650             }
651         }
652         break;
653     default:
654         return CV_Error( CV_StsBadArg, "" );
655     }
656 }
657
658
659 static void
660 thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
661 {
662     int i, j;
663     Size roi = _src.size();
664     roi.width *= _src.channels();
665     const float* src = _src.ptr<float>();
666     float* dst = _dst.ptr<float>();
667     size_t src_step = _src.step/sizeof(src[0]);
668     size_t dst_step = _dst.step/sizeof(dst[0]);
669
670 #if CV_SSE
671     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
672 #endif
673
674     if( _src.isContinuous() && _dst.isContinuous() )
675     {
676         roi.width *= roi.height;
677         roi.height = 1;
678     }
679
680 #ifdef HAVE_TEGRA_OPTIMIZATION
681     if (tegra::useTegra() && tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
682         return;
683 #endif
684
685 #if defined(HAVE_IPP)
686     CV_IPP_CHECK()
687     {
688         IppiSize sz = { roi.width, roi.height };
689         switch( type )
690         {
691         case THRESH_TRUNC:
692             if (0 <= CV_INSTRUMENT_FUN_IPP(ippiThreshold_GT_32f_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
693             {
694                 CV_IMPL_ADD(CV_IMPL_IPP);
695                 return;
696             }
697             setIppErrorStatus();
698             break;
699         case THRESH_TOZERO:
700             if (0 <= CV_INSTRUMENT_FUN_IPP(ippiThreshold_LTVal_32f_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh + FLT_EPSILON, 0))
701             {
702                 CV_IMPL_ADD(CV_IMPL_IPP);
703                 return;
704             }
705             setIppErrorStatus();
706             break;
707         case THRESH_TOZERO_INV:
708             if (0 <= CV_INSTRUMENT_FUN_IPP(ippiThreshold_GTVal_32f_C1R, src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
709             {
710                 CV_IMPL_ADD(CV_IMPL_IPP);
711                 return;
712             }
713             setIppErrorStatus();
714             break;
715         }
716     }
717 #endif
718
719     switch( type )
720     {
721         case THRESH_BINARY:
722             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
723             {
724                 j = 0;
725 #if CV_SSE
726                 if( useSIMD )
727                 {
728                     __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
729                     for( ; j <= roi.width - 8; j += 8 )
730                     {
731                         __m128 v0, v1;
732                         v0 = _mm_loadu_ps( src + j );
733                         v1 = _mm_loadu_ps( src + j + 4 );
734                         v0 = _mm_cmpgt_ps( v0, thresh4 );
735                         v1 = _mm_cmpgt_ps( v1, thresh4 );
736                         v0 = _mm_and_ps( v0, maxval4 );
737                         v1 = _mm_and_ps( v1, maxval4 );
738                         _mm_storeu_ps( dst + j, v0 );
739                         _mm_storeu_ps( dst + j + 4, v1 );
740                     }
741                 }
742 #elif CV_NEON
743                 float32x4_t v_thresh = vdupq_n_f32(thresh);
744                 uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
745
746                 for( ; j <= roi.width - 4; j += 4 )
747                 {
748                     float32x4_t v_src = vld1q_f32(src + j);
749                     uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
750                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
751                 }
752 #endif
753
754                 for( ; j < roi.width; j++ )
755                     dst[j] = src[j] > thresh ? maxval : 0;
756             }
757             break;
758
759         case THRESH_BINARY_INV:
760             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
761             {
762                 j = 0;
763 #if CV_SSE
764                 if( useSIMD )
765                 {
766                     __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
767                     for( ; j <= roi.width - 8; j += 8 )
768                     {
769                         __m128 v0, v1;
770                         v0 = _mm_loadu_ps( src + j );
771                         v1 = _mm_loadu_ps( src + j + 4 );
772                         v0 = _mm_cmple_ps( v0, thresh4 );
773                         v1 = _mm_cmple_ps( v1, thresh4 );
774                         v0 = _mm_and_ps( v0, maxval4 );
775                         v1 = _mm_and_ps( v1, maxval4 );
776                         _mm_storeu_ps( dst + j, v0 );
777                         _mm_storeu_ps( dst + j + 4, v1 );
778                     }
779                 }
780 #elif CV_NEON
781                 float32x4_t v_thresh = vdupq_n_f32(thresh);
782                 uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
783
784                 for( ; j <= roi.width - 4; j += 4 )
785                 {
786                     float32x4_t v_src = vld1q_f32(src + j);
787                     uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
788                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
789                 }
790 #endif
791
792                 for( ; j < roi.width; j++ )
793                     dst[j] = src[j] <= thresh ? maxval : 0;
794             }
795             break;
796
797         case THRESH_TRUNC:
798             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
799             {
800                 j = 0;
801 #if CV_SSE
802                 if( useSIMD )
803                 {
804                     __m128 thresh4 = _mm_set1_ps(thresh);
805                     for( ; j <= roi.width - 8; j += 8 )
806                     {
807                         __m128 v0, v1;
808                         v0 = _mm_loadu_ps( src + j );
809                         v1 = _mm_loadu_ps( src + j + 4 );
810                         v0 = _mm_min_ps( v0, thresh4 );
811                         v1 = _mm_min_ps( v1, thresh4 );
812                         _mm_storeu_ps( dst + j, v0 );
813                         _mm_storeu_ps( dst + j + 4, v1 );
814                     }
815                 }
816 #elif CV_NEON
817                 float32x4_t v_thresh = vdupq_n_f32(thresh);
818
819                 for( ; j <= roi.width - 4; j += 4 )
820                     vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
821 #endif
822
823                 for( ; j < roi.width; j++ )
824                     dst[j] = std::min(src[j], thresh);
825             }
826             break;
827
828         case THRESH_TOZERO:
829             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
830             {
831                 j = 0;
832 #if CV_SSE
833                 if( useSIMD )
834                 {
835                     __m128 thresh4 = _mm_set1_ps(thresh);
836                     for( ; j <= roi.width - 8; j += 8 )
837                     {
838                         __m128 v0, v1;
839                         v0 = _mm_loadu_ps( src + j );
840                         v1 = _mm_loadu_ps( src + j + 4 );
841                         v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
842                         v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
843                         _mm_storeu_ps( dst + j, v0 );
844                         _mm_storeu_ps( dst + j + 4, v1 );
845                     }
846                 }
847 #elif CV_NEON
848                 float32x4_t v_thresh = vdupq_n_f32(thresh);
849
850                 for( ; j <= roi.width - 4; j += 4 )
851                 {
852                     float32x4_t v_src = vld1q_f32(src + j);
853                     uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
854                                                  vreinterpretq_u32_f32(v_src));
855                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
856                 }
857 #endif
858
859                 for( ; j < roi.width; j++ )
860                 {
861                     float v = src[j];
862                     dst[j] = v > thresh ? v : 0;
863                 }
864             }
865             break;
866
867         case THRESH_TOZERO_INV:
868             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
869             {
870                 j = 0;
871 #if CV_SSE
872                 if( useSIMD )
873                 {
874                     __m128 thresh4 = _mm_set1_ps(thresh);
875                     for( ; j <= roi.width - 8; j += 8 )
876                     {
877                         __m128 v0, v1;
878                         v0 = _mm_loadu_ps( src + j );
879                         v1 = _mm_loadu_ps( src + j + 4 );
880                         v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
881                         v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
882                         _mm_storeu_ps( dst + j, v0 );
883                         _mm_storeu_ps( dst + j + 4, v1 );
884                     }
885                 }
886 #elif CV_NEON
887                 float32x4_t v_thresh = vdupq_n_f32(thresh);
888
889                 for( ; j <= roi.width - 4; j += 4 )
890                 {
891                     float32x4_t v_src = vld1q_f32(src + j);
892                     uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
893                                                  vreinterpretq_u32_f32(v_src));
894                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
895                 }
896 #endif
897                 for( ; j < roi.width; j++ )
898                 {
899                     float v = src[j];
900                     dst[j] = v <= thresh ? v : 0;
901                 }
902             }
903             break;
904         default:
905             return CV_Error( CV_StsBadArg, "" );
906     }
907 }
908
909 static void
910 thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
911 {
912     int i, j;
913     Size roi = _src.size();
914     roi.width *= _src.channels();
915     const double* src = _src.ptr<double>();
916     double* dst = _dst.ptr<double>();
917     size_t src_step = _src.step / sizeof(src[0]);
918     size_t dst_step = _dst.step / sizeof(dst[0]);
919
920 #if CV_SSE2
921     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
922 #endif
923
924     if (_src.isContinuous() && _dst.isContinuous())
925     {
926         roi.width *= roi.height;
927         roi.height = 1;
928     }
929
930     switch (type)
931     {
932     case THRESH_BINARY:
933         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
934         {
935             j = 0;
936 #if CV_SSE2
937             if( useSIMD )
938             {
939                 __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
940                 for( ; j <= roi.width - 8; j += 8 )
941                 {
942                     __m128d v0, v1, v2, v3;
943                     v0 = _mm_loadu_pd( src + j );
944                     v1 = _mm_loadu_pd( src + j + 2 );
945                     v2 = _mm_loadu_pd( src + j + 4 );
946                     v3 = _mm_loadu_pd( src + j + 6 );
947                     v0 = _mm_cmpgt_pd( v0, thresh2 );
948                     v1 = _mm_cmpgt_pd( v1, thresh2 );
949                     v2 = _mm_cmpgt_pd( v2, thresh2 );
950                     v3 = _mm_cmpgt_pd( v3, thresh2 );
951                     v0 = _mm_and_pd( v0, maxval2 );
952                     v1 = _mm_and_pd( v1, maxval2 );
953                     v2 = _mm_and_pd( v2, maxval2 );
954                     v3 = _mm_and_pd( v3, maxval2 );
955                     _mm_storeu_pd( dst + j, v0 );
956                     _mm_storeu_pd( dst + j + 2, v1 );
957                     _mm_storeu_pd( dst + j + 4, v2 );
958                     _mm_storeu_pd( dst + j + 6, v3 );
959                 }
960             }
961 #elif CV_NEON && defined(__aarch64__)
962             float64x2_t v_thresh = vdupq_n_f64(thresh);
963             uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
964
965             for( ; j <= roi.width - 4; j += 4 )
966             {
967                 float64x2_t v_src0 = vld1q_f64(src + j);
968                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
969                 uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval);
970                 uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval);
971                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
972                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
973             }
974 #endif
975
976             for (; j < roi.width; j++)
977                 dst[j] = src[j] > thresh ? maxval : 0;
978         }
979         break;
980
981     case THRESH_BINARY_INV:
982         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
983         {
984             j = 0;
985
986 #if CV_SSE2
987             if( useSIMD )
988             {
989                 __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
990                 for( ; j <= roi.width - 8; j += 8 )
991                 {
992                     __m128d v0, v1, v2, v3;
993                     v0 = _mm_loadu_pd( src + j );
994                     v1 = _mm_loadu_pd( src + j + 2 );
995                     v2 = _mm_loadu_pd( src + j + 4 );
996                     v3 = _mm_loadu_pd( src + j + 6 );
997                     v0 = _mm_cmple_pd( v0, thresh2 );
998                     v1 = _mm_cmple_pd( v1, thresh2 );
999                     v2 = _mm_cmple_pd( v2, thresh2 );
1000                     v3 = _mm_cmple_pd( v3, thresh2 );
1001                     v0 = _mm_and_pd( v0, maxval2 );
1002                     v1 = _mm_and_pd( v1, maxval2 );
1003                     v2 = _mm_and_pd( v2, maxval2 );
1004                     v3 = _mm_and_pd( v3, maxval2 );
1005                     _mm_storeu_pd( dst + j, v0 );
1006                     _mm_storeu_pd( dst + j + 2, v1 );
1007                     _mm_storeu_pd( dst + j + 4, v2 );
1008                     _mm_storeu_pd( dst + j + 6, v3 );
1009                 }
1010             }
1011 #elif CV_NEON && defined(__aarch64__)
1012             float64x2_t v_thresh = vdupq_n_f64(thresh);
1013             uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
1014
1015             for( ; j <= roi.width - 4; j += 4 )
1016             {
1017                 float64x2_t v_src0 = vld1q_f64(src + j);
1018                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1019                 uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval);
1020                 uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval);
1021                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1022                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1023             }
1024 #endif
1025             for (; j < roi.width; j++)
1026                 dst[j] = src[j] <= thresh ? maxval : 0;
1027         }
1028         break;
1029
1030     case THRESH_TRUNC:
1031         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1032         {
1033             j = 0;
1034
1035 #if CV_SSE2
1036             if( useSIMD )
1037             {
1038                 __m128d thresh2 = _mm_set1_pd(thresh);
1039                 for( ; j <= roi.width - 8; j += 8 )
1040                 {
1041                     __m128d v0, v1, v2, v3;
1042                     v0 = _mm_loadu_pd( src + j );
1043                     v1 = _mm_loadu_pd( src + j + 2 );
1044                     v2 = _mm_loadu_pd( src + j + 4 );
1045                     v3 = _mm_loadu_pd( src + j + 6 );
1046                     v0 = _mm_min_pd( v0, thresh2 );
1047                     v1 = _mm_min_pd( v1, thresh2 );
1048                     v2 = _mm_min_pd( v2, thresh2 );
1049                     v3 = _mm_min_pd( v3, thresh2 );
1050                     _mm_storeu_pd( dst + j, v0 );
1051                     _mm_storeu_pd( dst + j + 2, v1 );
1052                     _mm_storeu_pd( dst + j + 4, v2 );
1053                     _mm_storeu_pd( dst + j + 6, v3 );
1054                 }
1055             }
1056 #elif CV_NEON && defined(__aarch64__)
1057             float64x2_t v_thresh = vdupq_n_f64(thresh);
1058
1059             for( ; j <= roi.width - 4; j += 4 )
1060             {
1061                 float64x2_t v_src0 = vld1q_f64(src + j);
1062                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1063                 float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh);
1064                 float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh);
1065                 vst1q_f64(dst + j, v_dst0);
1066                 vst1q_f64(dst + j + 2, v_dst1);
1067             }
1068 #endif
1069             for (; j < roi.width; j++)
1070                 dst[j] = std::min(src[j], thresh);
1071         }
1072         break;
1073
1074     case THRESH_TOZERO:
1075         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1076         {
1077             j = 0;
1078
1079 #if CV_SSE2
1080             if( useSIMD )
1081             {
1082                 __m128d thresh2 = _mm_set1_pd(thresh);
1083                 for( ; j <= roi.width - 8; j += 8 )
1084                 {
1085                     __m128d v0, v1, v2, v3;
1086                     v0 = _mm_loadu_pd( src + j );
1087                     v1 = _mm_loadu_pd( src + j + 2 );
1088                     v2 = _mm_loadu_pd( src + j + 4 );
1089                     v3 = _mm_loadu_pd( src + j + 6 );
1090                     v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2));
1091                     v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2));
1092                     v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2));
1093                     v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2));
1094                     _mm_storeu_pd( dst + j, v0 );
1095                     _mm_storeu_pd( dst + j + 2, v1 );
1096                     _mm_storeu_pd( dst + j + 4, v2 );
1097                     _mm_storeu_pd( dst + j + 6, v3 );
1098                 }
1099             }
1100 #elif CV_NEON && defined(__aarch64__)
1101             float64x2_t v_thresh = vdupq_n_f64(thresh);
1102
1103             for( ; j <= roi.width - 4; j += 4 )
1104             {
1105                 float64x2_t v_src0 = vld1q_f64(src + j);
1106                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1107                 uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh),
1108                                               vreinterpretq_u64_f64(v_src0));
1109                 uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh),
1110                                               vreinterpretq_u64_f64(v_src1));
1111                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1112                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1113             }
1114 #endif
1115             for (; j < roi.width; j++)
1116             {
1117                 double v = src[j];
1118                 dst[j] = v > thresh ? v : 0;
1119             }
1120         }
1121         break;
1122
1123     case THRESH_TOZERO_INV:
1124         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1125         {
1126             j = 0;
1127
1128 #if CV_SSE2
1129             if( useSIMD )
1130             {
1131                 __m128d thresh2 = _mm_set1_pd(thresh);
1132                 for( ; j <= roi.width - 8; j += 8 )
1133                 {
1134                     __m128d v0, v1, v2, v3;
1135                     v0 = _mm_loadu_pd( src + j );
1136                     v1 = _mm_loadu_pd( src + j + 2 );
1137                     v2 = _mm_loadu_pd( src + j + 4 );
1138                     v3 = _mm_loadu_pd( src + j + 6 );
1139                     v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2));
1140                     v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2));
1141                     v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2));
1142                     v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2));
1143                     _mm_storeu_pd( dst + j, v0 );
1144                     _mm_storeu_pd( dst + j + 2, v1 );
1145                     _mm_storeu_pd( dst + j + 4, v2 );
1146                     _mm_storeu_pd( dst + j + 6, v3 );
1147                 }
1148             }
1149 #elif CV_NEON && defined(__aarch64__)
1150             float64x2_t v_thresh = vdupq_n_f64(thresh);
1151
1152             for( ; j <= roi.width - 4; j += 4 )
1153             {
1154                 float64x2_t v_src0 = vld1q_f64(src + j);
1155                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1156                 uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh),
1157                                               vreinterpretq_u64_f64(v_src0));
1158                 uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh),
1159                                               vreinterpretq_u64_f64(v_src1));
1160                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1161                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1162             }
1163 #endif
1164             for (; j < roi.width; j++)
1165             {
1166                 double v = src[j];
1167                 dst[j] = v <= thresh ? v : 0;
1168             }
1169         }
1170         break;
1171     default:
1172         return CV_Error(CV_StsBadArg, "");
1173     }
1174 }
1175
1176 #ifdef HAVE_IPP
1177 static bool ipp_getThreshVal_Otsu_8u( const unsigned char* _src, int step, Size size, unsigned char &thresh)
1178 {
1179     CV_INSTRUMENT_REGION_IPP()
1180
1181 #if IPP_VERSION_X100 >= 810
1182     int ippStatus = -1;
1183     IppiSize srcSize = { size.width, size.height };
1184     CV_SUPPRESS_DEPRECATED_START
1185     ippStatus = CV_INSTRUMENT_FUN_IPP(ippiComputeThreshold_Otsu_8u_C1R, _src, step, srcSize, &thresh);
1186     CV_SUPPRESS_DEPRECATED_END
1187
1188     if(ippStatus >= 0)
1189         return true;
1190 #else
1191     CV_UNUSED(_src); CV_UNUSED(step); CV_UNUSED(size); CV_UNUSED(thresh);
1192 #endif
1193     return false;
1194 }
1195 #endif
1196
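// Otsu's method: pick the threshold t that maximizes the between-class variance
// sigma_b^2(t) = q1*q2*(mu1 - mu2)^2, where q1/q2 are the probabilities and
// mu1/mu2 the means of the two histogram classes split at t.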
1197 static double
1198 getThreshVal_Otsu_8u( const Mat& _src )
1199 {
1200     Size size = _src.size();
1201     int step = (int) _src.step;
1202     if( _src.isContinuous() )
1203     {
1204         size.width *= size.height;
1205         size.height = 1;
1206         step = size.width;
1207     }
1208
1209 #ifdef HAVE_IPP
1210     unsigned char thresh;
1211     CV_IPP_RUN(IPP_VERSION_X100 >= 810, ipp_getThreshVal_Otsu_8u(_src.ptr(), step, size, thresh), thresh);
1212 #endif
1213
1214     const int N = 256;
1215     int i, j, h[N] = {0};
1216     for( i = 0; i < size.height; i++ )
1217     {
1218         const uchar* src = _src.ptr() + step*i;
1219         j = 0;
1220         #if CV_ENABLE_UNROLLED
1221         for( ; j <= size.width - 4; j += 4 )
1222         {
1223             int v0 = src[j], v1 = src[j+1];
1224             h[v0]++; h[v1]++;
1225             v0 = src[j+2]; v1 = src[j+3];
1226             h[v0]++; h[v1]++;
1227         }
1228         #endif
1229         for( ; j < size.width; j++ )
1230             h[src[j]]++;
1231     }
1232
1233     double mu = 0, scale = 1./(size.width*size.height);
1234     for( i = 0; i < N; i++ )
1235         mu += i*(double)h[i];
1236
1237     mu *= scale;
1238     double mu1 = 0, q1 = 0;
1239     double max_sigma = 0, max_val = 0;
1240
1241     for( i = 0; i < N; i++ )
1242     {
1243         double p_i, q2, mu2, sigma;
1244
1245         p_i = h[i]*scale;
1246         mu1 *= q1;
1247         q1 += p_i;
1248         q2 = 1. - q1;
1249
1250         if( std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON )
1251             continue;
1252
1253         mu1 = (mu1 + i*p_i)/q1;
1254         mu2 = (mu - q1*mu1)/q2;
1255         sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2);
1256         if( sigma > max_sigma )
1257         {
1258             max_sigma = sigma;
1259             max_val = i;
1260         }
1261     }
1262
1263     return max_val;
1264 }
1265
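// Triangle method: locate the histogram peak and the far end of its non-empty
// range, then pick the bin with the largest distance to the line joining the
// two; the histogram is mirrored first when the peak sits closer to the low
// end, so the longer tail is always the side being scanned.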
1266 static double
1267 getThreshVal_Triangle_8u( const Mat& _src )
1268 {
1269     Size size = _src.size();
1270     int step = (int) _src.step;
1271     if( _src.isContinuous() )
1272     {
1273         size.width *= size.height;
1274         size.height = 1;
1275         step = size.width;
1276     }
1277
1278     const int N = 256;
1279     int i, j, h[N] = {0};
1280     for( i = 0; i < size.height; i++ )
1281     {
1282         const uchar* src = _src.ptr() + step*i;
1283         j = 0;
1284         #if CV_ENABLE_UNROLLED
1285         for( ; j <= size.width - 4; j += 4 )
1286         {
1287             int v0 = src[j], v1 = src[j+1];
1288             h[v0]++; h[v1]++;
1289             v0 = src[j+2]; v1 = src[j+3];
1290             h[v0]++; h[v1]++;
1291         }
1292         #endif
1293         for( ; j < size.width; j++ )
1294             h[src[j]]++;
1295     }
1296
1297     int left_bound = 0, right_bound = 0, max_ind = 0, max = 0;
1298     int temp;
1299     bool isflipped = false;
1300
1301     for( i = 0; i < N; i++ )
1302     {
1303         if( h[i] > 0 )
1304         {
1305             left_bound = i;
1306             break;
1307         }
1308     }
1309     if( left_bound > 0 )
1310         left_bound--;
1311
1312     for( i = N-1; i > 0; i-- )
1313     {
1314         if( h[i] > 0 )
1315         {
1316             right_bound = i;
1317             break;
1318         }
1319     }
1320     if( right_bound < N-1 )
1321         right_bound++;
1322
1323     for( i = 0; i < N; i++ )
1324     {
1325         if( h[i] > max)
1326         {
1327             max = h[i];
1328             max_ind = i;
1329         }
1330     }
1331
1332     if( max_ind-left_bound < right_bound-max_ind)
1333     {
1334         isflipped = true;
1335         i = 0, j = N-1;
1336         while( i < j )
1337         {
1338             temp = h[i]; h[i] = h[j]; h[j] = temp;
1339             i++; j--;
1340         }
1341         left_bound = N-1-right_bound;
1342         max_ind = N-1-max_ind;
1343     }
1344
1345     double thresh = left_bound;
1346     double a, b, dist = 0, tempdist;
1347
1348     /*
1349      * We do not need an exact distance here: the distance is only maximized, so constant
1350      * factors can be omitted. This speeds up the computation a bit.
1351      */
1352     a = max; b = left_bound-max_ind;
1353     for( i = left_bound+1; i <= max_ind; i++ )
1354     {
1355         tempdist = a*i + b*h[i];
1356         if( tempdist > dist)
1357         {
1358             dist = tempdist;
1359             thresh = i;
1360         }
1361     }
1362     thresh--;
1363
1364     if( isflipped )
1365         thresh = N-1-thresh;
1366
1367     return thresh;
1368 }
1369
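// ParallelLoopBody that thresholds one horizontal stripe of rows per range,
// dispatching to the depth-specific thresh_* kernel above; it is intended to
// be driven by cv::parallel_for_ so stripes can be processed concurrently.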
1370 class ThresholdRunner : public ParallelLoopBody
1371 {
1372 public:
1373     ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType)
1374     {
1375         src = _src;
1376         dst = _dst;
1377
1378         thresh = _thresh;
1379         maxval = _maxval;
1380         thresholdType = _thresholdType;
1381     }
1382
1383     void operator () ( const Range& range ) const
1384     {
1385         int row0 = range.start;
1386         int row1 = range.end;
1387
1388         Mat srcStripe = src.rowRange(row0, row1);
1389         Mat dstStripe = dst.rowRange(row0, row1);
1390
1391         if (srcStripe.depth() == CV_8U)
1392         {
1393             thresh_8u( srcStripe, dstStripe, (uchar)thresh, (uchar)maxval, thresholdType );
1394         }
1395         else if( srcStripe.depth() == CV_16S )
1396         {
1397             thresh_16s( srcStripe, dstStripe, (short)thresh, (short)maxval, thresholdType );
1398         }
1399         else if( srcStripe.depth() == CV_32F )
1400         {
1401             thresh_32f( srcStripe, dstStripe, (float)thresh, (float)maxval, thresholdType );
1402         }
1403         else if( srcStripe.depth() == CV_64F )
1404         {
1405             thresh_64f(srcStripe, dstStripe, thresh, maxval, thresholdType);
1406         }
1407     }
1408
1409 private:
1410     Mat src;
1411     Mat dst;
1412
1413     double thresh;
1414     double maxval;
1415     int thresholdType;
1416 };
1417
1418 #ifdef HAVE_OPENCL
1419
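// OpenCL path: builds the imgproc "threshold" kernel with the threshold type,
// element type and stride baked in as -D defines. Returns false (so the caller
// falls back to the CPU path) for unsupported threshold types, for CV_64F data
// on devices without double support, or if the kernel fails to compile.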
1420 static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
1421 {
1422     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
1423         kercn = ocl::predictOptimalVectorWidth(_src, _dst), ktype = CV_MAKE_TYPE(depth, kercn);
1424     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
1425
1426     if ( !(thresh_type == THRESH_BINARY || thresh_type == THRESH_BINARY_INV || thresh_type == THRESH_TRUNC ||
1427            thresh_type == THRESH_TOZERO || thresh_type == THRESH_TOZERO_INV) ||
1428          (!doubleSupport && depth == CV_64F))
1429         return false;
1430
1431     const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
1432                                           "THRESH_TOZERO", "THRESH_TOZERO_INV" };
1433     ocl::Device dev = ocl::Device::getDefault();
1434     int stride_size = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
1435
1436     ocl::Kernel k("threshold", ocl::imgproc::threshold_oclsrc,
1437                   format("-D %s -D T=%s -D T1=%s -D STRIDE_SIZE=%d%s", thresholdMap[thresh_type],
1438                          ocl::typeToStr(ktype), ocl::typeToStr(depth), stride_size,
1439                          doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
1440     if (k.empty())
1441         return false;
1442
1443     UMat src = _src.getUMat();
1444     _dst.create(src.size(), type);
1445     UMat dst = _dst.getUMat();
1446
1447     if (depth <= CV_32S)
1448         thresh = cvFloor(thresh);
1449
1450     const double min_vals[] = { 0, CHAR_MIN, 0, SHRT_MIN, INT_MIN, -FLT_MAX, -DBL_MAX, 0 };
1451     double min_val = min_vals[depth];
1452
1453     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn),
1454            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(thresh))),
1455            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(maxval))),
1456            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(min_val))));
1457
1458     size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, (size_t)dst.rows };
1459     globalsize[1] = (globalsize[1] + stride_size - 1) / stride_size;
1460     return k.run(2, globalsize, NULL, false);
1461 }
1462
1463 #endif
1464
1465 }
1466
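// A minimal usage sketch (illustrative only, not part of the library; assumes the imgcodecs module
// is available for cv::imread and that "input.png" is a readable single-channel image):
//
//     cv::Mat gray = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
//     cv::Mat bw;
//     // Let Otsu's method pick the threshold; the chosen value is returned.
//     double t = cv::threshold(gray, bw, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);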
1467 double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
1468 {
1469     CV_INSTRUMENT_REGION()
1470
1471     CV_OCL_RUN_(_src.dims() <= 2 && _dst.isUMat(),
1472                 ocl_threshold(_src, _dst, thresh, maxval, type), thresh)
1473
1474     Mat src = _src.getMat();
1475     int automatic_thresh = (type & ~CV_THRESH_MASK);
1476     type &= THRESH_MASK;
1477
1478     CV_Assert( automatic_thresh != (CV_THRESH_OTSU | CV_THRESH_TRIANGLE) );
1479     if( automatic_thresh == CV_THRESH_OTSU )
1480     {
1481         CV_Assert( src.type() == CV_8UC1 );
1482         thresh = getThreshVal_Otsu_8u( src );
1483     }
1484     else if( automatic_thresh == CV_THRESH_TRIANGLE )
1485     {
1486         CV_Assert( src.type() == CV_8UC1 );
1487         thresh = getThreshVal_Triangle_8u( src );
1488     }
1489
1490     _dst.create( src.size(), src.type() );
1491     Mat dst = _dst.getMat();
1492
1493     if( src.depth() == CV_8U )
1494     {
1495         int ithresh = cvFloor(thresh);
1496         thresh = ithresh;
1497         int imaxval = cvRound(maxval);
1498         if( type == THRESH_TRUNC )
1499             imaxval = ithresh;
1500         imaxval = saturate_cast<uchar>(imaxval);
1501
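        // If the integer threshold falls outside the valid 8-bit range, every pixel compares the
        // same way against it, so the result is either a constant image or a plain copy of the
        // source and the per-pixel pass can be skipped (the CV_16S branch below mirrors this logic).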
1502         if( ithresh < 0 || ithresh >= 255 )
1503         {
1504             if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
1505                 ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < 0) ||
1506                 (type == THRESH_TOZERO && ithresh >= 255) )
1507             {
1508                 int v = type == THRESH_BINARY ? (ithresh >= 255 ? 0 : imaxval) :
1509                         type == THRESH_BINARY_INV ? (ithresh >= 255 ? imaxval : 0) :
1510                         /*type == THRESH_TRUNC ? imaxval :*/ 0;
1511                 dst.setTo(v);
1512             }
1513             else
1514                 src.copyTo(dst);
1515             return thresh;
1516         }
1517         thresh = ithresh;
1518         maxval = imaxval;
1519     }
1520     else if( src.depth() == CV_16S )
1521     {
1522         int ithresh = cvFloor(thresh);
1523         thresh = ithresh;
1524         int imaxval = cvRound(maxval);
1525         if( type == THRESH_TRUNC )
1526             imaxval = ithresh;
1527         imaxval = saturate_cast<short>(imaxval);
1528
1529         if( ithresh < SHRT_MIN || ithresh >= SHRT_MAX )
1530         {
1531             if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
1532                ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < SHRT_MIN) ||
1533                (type == THRESH_TOZERO && ithresh >= SHRT_MAX) )
1534             {
1535                 int v = type == THRESH_BINARY ? (ithresh >= SHRT_MAX ? 0 : imaxval) :
1536                         type == THRESH_BINARY_INV ? (ithresh >= SHRT_MAX ? imaxval : 0) :
1537                         /*type == THRESH_TRUNC ? imaxval :*/ 0;
1538                 dst.setTo(v);
1539             }
1540             else
1541                 src.copyTo(dst);
1542             return thresh;
1543         }
1544         thresh = ithresh;
1545         maxval = imaxval;
1546     }
1547     else if( src.depth() == CV_32F )
1548         ;
1549     else if( src.depth() == CV_64F )
1550         ;
1551     else
1552         CV_Error( CV_StsUnsupportedFormat, "" );
1553
1554     parallel_for_(Range(0, dst.rows),
1555                   ThresholdRunner(src, dst, thresh, maxval, type),
1556                   dst.total()/(double)(1<<16));
1557     return thresh;
1558 }
1559
1560
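// A minimal usage sketch (illustrative only; the file name, block size and delta are arbitrary):
//
//     cv::Mat gray = cv::imread("page.png", cv::IMREAD_GRAYSCALE);
//     cv::Mat bin;
//     // Gaussian-weighted 11x11 neighbourhood mean, offset by 2, binary output.
//     cv::adaptiveThreshold(gray, bin, 255, cv::ADAPTIVE_THRESH_GAUSSIAN_C,
//                           cv::THRESH_BINARY, 11, 2);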
1561 void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
1562                             int method, int type, int blockSize, double delta )
1563 {
1564     CV_INSTRUMENT_REGION()
1565
1566     Mat src = _src.getMat();
1567     CV_Assert( src.type() == CV_8UC1 );
1568     CV_Assert( blockSize % 2 == 1 && blockSize > 1 );
1569     Size size = src.size();
1570
1571     _dst.create( size, src.type() );
1572     Mat dst = _dst.getMat();
1573
1574     if( maxValue < 0 )
1575     {
1576         dst = Scalar(0);
1577         return;
1578     }
1579
1580     Mat mean;
1581
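    // When the operation is not in place, dst can double as storage for the local mean; otherwise
    // the filtering calls below allocate a separate buffer for it.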
1582     if( src.data != dst.data )
1583         mean = dst;
1584
1585     if (method == ADAPTIVE_THRESH_MEAN_C)
1586         boxFilter( src, mean, src.type(), Size(blockSize, blockSize),
1587                    Point(-1,-1), true, BORDER_REPLICATE );
1588     else if (method == ADAPTIVE_THRESH_GAUSSIAN_C)
1589     {
1590         Mat srcfloat, meanfloat;
1591         src.convertTo(srcfloat, CV_32F);
1592         meanfloat = srcfloat;
1593         GaussianBlur(srcfloat, meanfloat, Size(blockSize, blockSize), 0, 0, BORDER_REPLICATE);
1594         meanfloat.convertTo(mean, src.type());
1595     }
1596     else
1597         CV_Error( CV_StsBadFlag, "Unknown/unsupported adaptive threshold method" );
1598
1599     int i, j;
1600     uchar imaxval = saturate_cast<uchar>(maxValue);
1601     int idelta = type == THRESH_BINARY ? cvCeil(delta) : cvFloor(delta);
1602     uchar tab[768];
1603
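    // tab[] is indexed by (src - mean + 255), which stays within [0, 510] for 8-bit data, so the
    // per-pixel comparison of src against (mean - delta) reduces to a single table lookup below.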
1604     if( type == CV_THRESH_BINARY )
1605         for( i = 0; i < 768; i++ )
1606             tab[i] = (uchar)(i - 255 > -idelta ? imaxval : 0);
1607     else if( type == CV_THRESH_BINARY_INV )
1608         for( i = 0; i < 768; i++ )
1609             tab[i] = (uchar)(i - 255 <= -idelta ? imaxval : 0);
1610     else
1611         CV_Error( CV_StsBadFlag, "Unknown/unsupported threshold type" );
1612
1613     if( src.isContinuous() && mean.isContinuous() && dst.isContinuous() )
1614     {
1615         size.width *= size.height;
1616         size.height = 1;
1617     }
1618
1619     for( i = 0; i < size.height; i++ )
1620     {
1621         const uchar* sdata = src.ptr(i);
1622         const uchar* mdata = mean.ptr(i);
1623         uchar* ddata = dst.ptr(i);
1624
1625         for( j = 0; j < size.width; j++ )
1626             ddata[j] = tab[sdata[j] - mdata[j] + 255];
1627     }
1628 }
1629
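// Legacy C API wrappers: both convert their CvArr arguments to cv::Mat and forward to the C++
// implementations above.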
1630 CV_IMPL double
1631 cvThreshold( const void* srcarr, void* dstarr, double thresh, double maxval, int type )
1632 {
1633     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
1634
1635     CV_Assert( src.size == dst.size && src.channels() == dst.channels() &&
1636         (src.depth() == dst.depth() || dst.depth() == CV_8U));
1637
1638     thresh = cv::threshold( src, dst, thresh, maxval, type );
1639     if( dst0.data != dst.data )
1640         dst.convertTo( dst0, dst0.depth() );
1641     return thresh;
1642 }
1643
1644
1645 CV_IMPL void
1646 cvAdaptiveThreshold( const void *srcIm, void *dstIm, double maxValue,
1647                      int method, int type, int blockSize, double delta )
1648 {
1649     cv::Mat src = cv::cvarrToMat(srcIm), dst = cv::cvarrToMat(dstIm);
1650     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1651     cv::adaptiveThreshold( src, dst, maxValue, method, type, blockSize, delta );
1652 }
1653
1654 /* End of file. */