modules/imgproc/src/thresh.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistributions of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistributions in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "precomp.hpp"
44 #include "opencl_kernels_imgproc.hpp"
45
46 #if CV_NEON && defined(__aarch64__)
47 #include <arm_neon.h>
48 namespace cv {
49 // Workaround for missing definitions of vreinterpretq_u64_f64/vreinterpretq_f64_u64
50 template <typename T> static inline
51 uint64x2_t vreinterpretq_u64_f64(T a)
52 {
53     return (uint64x2_t) a;
54 }
55 template <typename T> static inline
56 float64x2_t vreinterpretq_f64_u64(T a)
57 {
58     return (float64x2_t) a;
59 }
60 } // namespace cv
61 #endif
62
63 namespace cv
64 {
65
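// 8-bit thresholding kernel. A 256-entry lookup table (tab) encodes the
// selected threshold type; the SSE2/NEON branches below handle the bulk of
// each row and the table-driven scalar loop finishes the remaining tail.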
66 static void
67 thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
68 {
69     int i, j, j_scalar = 0;
70     uchar tab[256];
71     Size roi = _src.size();
72     roi.width *= _src.channels();
73     size_t src_step = _src.step;
74     size_t dst_step = _dst.step;
75
76     if( _src.isContinuous() && _dst.isContinuous() )
77     {
78         roi.width *= roi.height;
79         roi.height = 1;
80         src_step = dst_step = roi.width;
81     }
82
83 #ifdef HAVE_TEGRA_OPTIMIZATION
84     if (tegra::useTegra() && tegra::thresh_8u(_src, _dst, roi.width, roi.height, thresh, maxval, type))
85         return;
86 #endif
87
88 #if defined(HAVE_IPP)
89     CV_IPP_CHECK()
90     {
91         IppiSize sz = { roi.width, roi.height };
92         CV_SUPPRESS_DEPRECATED_START
93         switch( type )
94         {
95         case THRESH_TRUNC:
96 #ifndef HAVE_IPP_ICV_ONLY
97             if (_src.data == _dst.data && ippiThreshold_GT_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh) >= 0)
98             {
99                 CV_IMPL_ADD(CV_IMPL_IPP);
100                 return;
101             }
102 #endif
103             if (ippiThreshold_GT_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh) >= 0)
104             {
105                 CV_IMPL_ADD(CV_IMPL_IPP);
106                 return;
107             }
108             setIppErrorStatus();
109             break;
110         case THRESH_TOZERO:
111 #ifndef HAVE_IPP_ICV_ONLY
112             if (_src.data == _dst.data && ippiThreshold_LTVal_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh+1, 0) >= 0)
113             {
114                 CV_IMPL_ADD(CV_IMPL_IPP);
115                 return;
116             }
117 #endif
118             if (ippiThreshold_LTVal_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh+1, 0) >= 0)
119             {
120                 CV_IMPL_ADD(CV_IMPL_IPP);
121                 return;
122             }
123             setIppErrorStatus();
124             break;
125         case THRESH_TOZERO_INV:
126 #ifndef HAVE_IPP_ICV_ONLY
127             if (_src.data == _dst.data && ippiThreshold_GTVal_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
128             {
129                 CV_IMPL_ADD(CV_IMPL_IPP);
130                 return;
131             }
132 #endif
133             if (ippiThreshold_GTVal_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
134             {
135                 CV_IMPL_ADD(CV_IMPL_IPP);
136                 return;
137             }
138             setIppErrorStatus();
139             break;
140         }
141         CV_SUPPRESS_DEPRECATED_END
142     }
143 #endif
144
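// Build the lookup table: tab[v] is the thresholded output for every possible
// 8-bit input value v, so the scalar path is a single table lookup per pixel.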
145     switch( type )
146     {
147     case THRESH_BINARY:
148         for( i = 0; i <= thresh; i++ )
149             tab[i] = 0;
150         for( ; i < 256; i++ )
151             tab[i] = maxval;
152         break;
153     case THRESH_BINARY_INV:
154         for( i = 0; i <= thresh; i++ )
155             tab[i] = maxval;
156         for( ; i < 256; i++ )
157             tab[i] = 0;
158         break;
159     case THRESH_TRUNC:
160         for( i = 0; i <= thresh; i++ )
161             tab[i] = (uchar)i;
162         for( ; i < 256; i++ )
163             tab[i] = thresh;
164         break;
165     case THRESH_TOZERO:
166         for( i = 0; i <= thresh; i++ )
167             tab[i] = 0;
168         for( ; i < 256; i++ )
169             tab[i] = (uchar)i;
170         break;
171     case THRESH_TOZERO_INV:
172         for( i = 0; i <= thresh; i++ )
173             tab[i] = (uchar)i;
174         for( ; i < 256; i++ )
175             tab[i] = 0;
176         break;
177     default:
178         CV_Error( CV_StsBadArg, "Unknown threshold type" );
179     }
180
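// SSE2 has no unsigned 8-bit compare, so both the pixels and the threshold
// are XORed with 0x80 to map them into signed range before _mm_cmpgt_epi8;
// the resulting all-ones masks are then combined with maxval or the source.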
181 #if CV_SSE2
182     if( checkHardwareSupport(CV_CPU_SSE2) )
183     {
184         __m128i _x80 = _mm_set1_epi8('\x80');
185         __m128i thresh_u = _mm_set1_epi8(thresh);
186         __m128i thresh_s = _mm_set1_epi8(thresh ^ 0x80);
187         __m128i maxval_ = _mm_set1_epi8(maxval);
188         j_scalar = roi.width & -8;
189
190         for( i = 0; i < roi.height; i++ )
191         {
192             const uchar* src = _src.ptr() + src_step*i;
193             uchar* dst = _dst.ptr() + dst_step*i;
194
195             switch( type )
196             {
197             case THRESH_BINARY:
198                 for( j = 0; j <= roi.width - 32; j += 32 )
199                 {
200                     __m128i v0, v1;
201                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
202                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
203                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
204                     v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
205                     v0 = _mm_and_si128( v0, maxval_ );
206                     v1 = _mm_and_si128( v1, maxval_ );
207                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
208                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
209                 }
210
211                 for( ; j <= roi.width - 8; j += 8 )
212                 {
213                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
214                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
215                     v0 = _mm_and_si128( v0, maxval_ );
216                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
217                 }
218                 break;
219
220             case THRESH_BINARY_INV:
221                 for( j = 0; j <= roi.width - 32; j += 32 )
222                 {
223                     __m128i v0, v1;
224                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
225                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
226                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
227                     v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
228                     v0 = _mm_andnot_si128( v0, maxval_ );
229                     v1 = _mm_andnot_si128( v1, maxval_ );
230                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
231                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
232                 }
233
234                 for( ; j <= roi.width - 8; j += 8 )
235                 {
236                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
237                     v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
238                     v0 = _mm_andnot_si128( v0, maxval_ );
239                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
240                 }
241                 break;
242
243             case THRESH_TRUNC:
244                 for( j = 0; j <= roi.width - 32; j += 32 )
245                 {
246                     __m128i v0, v1;
247                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
248                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
249                     v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
250                     v1 = _mm_subs_epu8( v1, _mm_subs_epu8( v1, thresh_u ));
251                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
252                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
253                 }
254
255                 for( ; j <= roi.width - 8; j += 8 )
256                 {
257                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
258                     v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
259                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
260                 }
261                 break;
262
263             case THRESH_TOZERO:
264                 for( j = 0; j <= roi.width - 32; j += 32 )
265                 {
266                     __m128i v0, v1;
267                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
268                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
269                     v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
270                     v1 = _mm_and_si128( v1, _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ));
271                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
272                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
273                 }
274
275                 for( ; j <= roi.width - 8; j += 8 )
276                 {
277                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
278                     v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
279                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
280                 }
281                 break;
282
283             case THRESH_TOZERO_INV:
284                 for( j = 0; j <= roi.width - 32; j += 32 )
285                 {
286                     __m128i v0, v1;
287                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
288                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
289                     v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
290                     v1 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ), v1 );
291                     _mm_storeu_si128( (__m128i*)(dst + j), v0 );
292                     _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
293                 }
294
295                 for( ; j <= roi.width - 8; j += 8 )
296                 {
297                     __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
298                     v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
299                     _mm_storel_epi64( (__m128i*)(dst + j), v0 );
300                 }
301                 break;
302             }
303         }
304     }
305 #elif CV_NEON
306     uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval);
307
308     switch( type )
309     {
310     case THRESH_BINARY:
311         for( i = 0; i < roi.height; i++ )
312         {
313             const uchar* src = _src.ptr() + src_step*i;
314             uchar* dst = _dst.ptr() + dst_step*i;
315
316             for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
317                 vst1q_u8(dst + j_scalar, vandq_u8(vcgtq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval));
318         }
319         break;
320
321     case THRESH_BINARY_INV:
322         for( i = 0; i < roi.height; i++ )
323         {
324             const uchar* src = _src.ptr() + src_step*i;
325             uchar* dst = _dst.ptr() + dst_step*i;
326
327             for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
328                 vst1q_u8(dst + j_scalar, vandq_u8(vcleq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval));
329         }
330         break;
331
332     case THRESH_TRUNC:
333         for( i = 0; i < roi.height; i++ )
334         {
335             const uchar* src = _src.ptr() + src_step*i;
336             uchar* dst = _dst.ptr() + dst_step*i;
337
338             for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
339                 vst1q_u8(dst + j_scalar, vminq_u8(vld1q_u8(src + j_scalar), v_thresh));
340         }
341         break;
342
343     case THRESH_TOZERO:
344         for( i = 0; i < roi.height; i++ )
345         {
346             const uchar* src = _src.ptr() + src_step*i;
347             uchar* dst = _dst.ptr() + dst_step*i;
348
349             for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
350             {
351                 uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcgtq_u8(v_src, v_thresh);
352                 vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src));
353             }
354         }
355         break;
356
357     case THRESH_TOZERO_INV:
358         for( i = 0; i < roi.height; i++ )
359         {
360             const uchar* src = _src.ptr() + src_step*i;
361             uchar* dst = _dst.ptr() + dst_step*i;
362
363             for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
364             {
365                 uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcleq_u8(v_src, v_thresh);
366                 vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src));
367             }
368         }
369         break;
370     default:
371         return CV_Error( CV_StsBadArg, "Unknown threshold type" );
372     }
373 #endif
374
375     if( j_scalar < roi.width )
376     {
377         for( i = 0; i < roi.height; i++ )
378         {
379             const uchar* src = _src.ptr() + src_step*i;
380             uchar* dst = _dst.ptr() + dst_step*i;
381             j = j_scalar;
382 #if CV_ENABLE_UNROLLED
383             for( ; j <= roi.width - 4; j += 4 )
384             {
385                 uchar t0 = tab[src[j]];
386                 uchar t1 = tab[src[j+1]];
387
388                 dst[j] = t0;
389                 dst[j+1] = t1;
390
391                 t0 = tab[src[j+2]];
392                 t1 = tab[src[j+3]];
393
394                 dst[j+2] = t0;
395                 dst[j+3] = t1;
396             }
397 #endif
398             for( ; j < roi.width; j++ )
399                 dst[j] = tab[src[j]];
400         }
401     }
402 }
403
404
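// 16-bit signed thresholding kernel. Each row is processed 16 values per
// iteration with SSE2 (two 128-bit registers) or 8 with NEON; the compare
// mask is ANDed with maxval or with the source, and a scalar loop handles
// the remaining tail.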
405 static void
406 thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
407 {
408     int i, j;
409     Size roi = _src.size();
410     roi.width *= _src.channels();
411     const short* src = _src.ptr<short>();
412     short* dst = _dst.ptr<short>();
413     size_t src_step = _src.step/sizeof(src[0]);
414     size_t dst_step = _dst.step/sizeof(dst[0]);
415
416 #if CV_SSE2
417     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
418 #endif
419
420     if( _src.isContinuous() && _dst.isContinuous() )
421     {
422         roi.width *= roi.height;
423         roi.height = 1;
424         src_step = dst_step = roi.width;
425     }
426
427 #ifdef HAVE_TEGRA_OPTIMIZATION
428     if (tegra::useTegra() && tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
429         return;
430 #endif
431
432 #if defined(HAVE_IPP)
433     CV_IPP_CHECK()
434     {
435         IppiSize sz = { roi.width, roi.height };
436         CV_SUPPRESS_DEPRECATED_START
437         switch( type )
438         {
439         case THRESH_TRUNC:
440 #ifndef HAVE_IPP_ICV_ONLY
441             if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
442             {
443                 CV_IMPL_ADD(CV_IMPL_IPP);
444                 return;
445             }
446 #endif
447             if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
448             {
449                 CV_IMPL_ADD(CV_IMPL_IPP);
450                 return;
451             }
452             setIppErrorStatus();
453             break;
454         case THRESH_TOZERO:
455 #ifndef HAVE_IPP_ICV_ONLY
456             if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
457             {
458                 CV_IMPL_ADD(CV_IMPL_IPP);
459                 return;
460             }
461 #endif
462             if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0)
463             {
464                 CV_IMPL_ADD(CV_IMPL_IPP);
465                 return;
466             }
467             setIppErrorStatus();
468             break;
469         case THRESH_TOZERO_INV:
470 #ifndef HAVE_IPP_ICV_ONLY
471             if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
472             {
473                 CV_IMPL_ADD(CV_IMPL_IPP);
474                 return;
475             }
476 #endif
477             if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
478             {
479                 CV_IMPL_ADD(CV_IMPL_IPP);
480                 return;
481             }
482             setIppErrorStatus();
483             break;
484         }
485         CV_SUPPRESS_DEPRECATED_END
486     }
487 #endif
488
489     switch( type )
490     {
491     case THRESH_BINARY:
492         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
493         {
494             j = 0;
495         #if CV_SSE2
496             if( useSIMD )
497             {
498                 __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
499                 for( ; j <= roi.width - 16; j += 16 )
500                 {
501                     __m128i v0, v1;
502                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
503                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
504                     v0 = _mm_cmpgt_epi16( v0, thresh8 );
505                     v1 = _mm_cmpgt_epi16( v1, thresh8 );
506                     v0 = _mm_and_si128( v0, maxval8 );
507                     v1 = _mm_and_si128( v1, maxval8 );
508                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
509                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
510                 }
511             }
512         #elif CV_NEON
513             int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
514
515             for( ; j <= roi.width - 8; j += 8 )
516             {
517                 uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
518                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
519             }
520         #endif
521
522             for( ; j < roi.width; j++ )
523                 dst[j] = src[j] > thresh ? maxval : 0;
524         }
525         break;
526
527     case THRESH_BINARY_INV:
528         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
529         {
530             j = 0;
531         #if CV_SSE2
532             if( useSIMD )
533             {
534                 __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
535                 for( ; j <= roi.width - 16; j += 16 )
536                 {
537                     __m128i v0, v1;
538                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
539                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
540                     v0 = _mm_cmpgt_epi16( v0, thresh8 );
541                     v1 = _mm_cmpgt_epi16( v1, thresh8 );
542                     v0 = _mm_andnot_si128( v0, maxval8 );
543                     v1 = _mm_andnot_si128( v1, maxval8 );
544                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
545                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
546                 }
547             }
548         #elif CV_NEON
549             int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
550
551             for( ; j <= roi.width - 8; j += 8 )
552             {
553                 uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
554                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
555             }
556         #endif
557
558             for( ; j < roi.width; j++ )
559                 dst[j] = src[j] <= thresh ? maxval : 0;
560         }
561         break;
562
563     case THRESH_TRUNC:
564         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
565         {
566             j = 0;
567         #if CV_SSE2
568             if( useSIMD )
569             {
570                 __m128i thresh8 = _mm_set1_epi16(thresh);
571                 for( ; j <= roi.width - 16; j += 16 )
572                 {
573                     __m128i v0, v1;
574                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
575                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
576                     v0 = _mm_min_epi16( v0, thresh8 );
577                     v1 = _mm_min_epi16( v1, thresh8 );
578                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
579                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
580                 }
581             }
582         #elif CV_NEON
583             int16x8_t v_thresh = vdupq_n_s16(thresh);
584
585             for( ; j <= roi.width - 8; j += 8 )
586                 vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
587         #endif
588
589             for( ; j < roi.width; j++ )
590                 dst[j] = std::min(src[j], thresh);
591         }
592         break;
593
594     case THRESH_TOZERO:
595         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
596         {
597             j = 0;
598         #if CV_SSE2
599             if( useSIMD )
600             {
601                 __m128i thresh8 = _mm_set1_epi16(thresh);
602                 for( ; j <= roi.width - 16; j += 16 )
603                 {
604                     __m128i v0, v1;
605                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
606                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
607                     v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
608                     v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
609                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
610                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
611                 }
612             }
613         #elif CV_NEON
614             int16x8_t v_thresh = vdupq_n_s16(thresh);
615
616             for( ; j <= roi.width - 8; j += 8 )
617             {
618                 int16x8_t v_src = vld1q_s16(src + j);
619                 uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
620                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
621             }
622         #endif
623
624             for( ; j < roi.width; j++ )
625             {
626                 short v = src[j];
627                 dst[j] = v > thresh ? v : 0;
628             }
629         }
630         break;
631
632     case THRESH_TOZERO_INV:
633         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
634         {
635             j = 0;
636         #if CV_SSE2
637             if( useSIMD )
638             {
639                 __m128i thresh8 = _mm_set1_epi16(thresh);
640                 for( ; j <= roi.width - 16; j += 16 )
641                 {
642                     __m128i v0, v1;
643                     v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
644                     v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
645                     v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
646                     v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
647                     _mm_storeu_si128((__m128i*)(dst + j), v0 );
648                     _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
649                 }
650             }
651         #elif CV_NEON
652             int16x8_t v_thresh = vdupq_n_s16(thresh);
653
654             for( ; j <= roi.width - 8; j += 8 )
655             {
656                 int16x8_t v_src = vld1q_s16(src + j);
657                 uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
658                 vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
659             }
660         #endif
661             for( ; j < roi.width; j++ )
662             {
663                 short v = src[j];
664                 dst[j] = v <= thresh ? v : 0;
665             }
666         }
667         break;
668     default:
669         return CV_Error( CV_StsBadArg, "Unknown threshold type" );
670     }
671 }
672
673
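// 32-bit float thresholding kernel. _mm_cmpgt_ps / vcgtq_f32 produce an
// all-ones mask per lane, so ANDing the mask with maxval (binary types) or
// with the source (tozero types) yields the result without branches.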
674 static void
675 thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
676 {
677     int i, j;
678     Size roi = _src.size();
679     roi.width *= _src.channels();
680     const float* src = _src.ptr<float>();
681     float* dst = _dst.ptr<float>();
682     size_t src_step = _src.step/sizeof(src[0]);
683     size_t dst_step = _dst.step/sizeof(dst[0]);
684
685 #if CV_SSE
686     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
687 #endif
688
689     if( _src.isContinuous() && _dst.isContinuous() )
690     {
691         roi.width *= roi.height;
692         roi.height = 1;
693     }
694
695 #ifdef HAVE_TEGRA_OPTIMIZATION
696     if (tegra::useTegra() && tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
697         return;
698 #endif
699
700 #if defined(HAVE_IPP)
701     CV_IPP_CHECK()
702     {
703         IppiSize sz = { roi.width, roi.height };
704         switch( type )
705         {
706         case THRESH_TRUNC:
707             if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
708             {
709                 CV_IMPL_ADD(CV_IMPL_IPP);
710                 return;
711             }
712             setIppErrorStatus();
713             break;
714         case THRESH_TOZERO:
715             if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
716             {
717                 CV_IMPL_ADD(CV_IMPL_IPP);
718                 return;
719             }
720             setIppErrorStatus();
721             break;
722         case THRESH_TOZERO_INV:
723             if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
724             {
725                 CV_IMPL_ADD(CV_IMPL_IPP);
726                 return;
727             }
728             setIppErrorStatus();
729             break;
730         }
731     }
732 #endif
733
734     switch( type )
735     {
736         case THRESH_BINARY:
737             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
738             {
739                 j = 0;
740 #if CV_SSE
741                 if( useSIMD )
742                 {
743                     __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
744                     for( ; j <= roi.width - 8; j += 8 )
745                     {
746                         __m128 v0, v1;
747                         v0 = _mm_loadu_ps( src + j );
748                         v1 = _mm_loadu_ps( src + j + 4 );
749                         v0 = _mm_cmpgt_ps( v0, thresh4 );
750                         v1 = _mm_cmpgt_ps( v1, thresh4 );
751                         v0 = _mm_and_ps( v0, maxval4 );
752                         v1 = _mm_and_ps( v1, maxval4 );
753                         _mm_storeu_ps( dst + j, v0 );
754                         _mm_storeu_ps( dst + j + 4, v1 );
755                     }
756                 }
757 #elif CV_NEON
758                 float32x4_t v_thresh = vdupq_n_f32(thresh);
759                 uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
760
761                 for( ; j <= roi.width - 4; j += 4 )
762                 {
763                     float32x4_t v_src = vld1q_f32(src + j);
764                     uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
765                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
766                 }
767 #endif
768
769                 for( ; j < roi.width; j++ )
770                     dst[j] = src[j] > thresh ? maxval : 0;
771             }
772             break;
773
774         case THRESH_BINARY_INV:
775             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
776             {
777                 j = 0;
778 #if CV_SSE
779                 if( useSIMD )
780                 {
781                     __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
782                     for( ; j <= roi.width - 8; j += 8 )
783                     {
784                         __m128 v0, v1;
785                         v0 = _mm_loadu_ps( src + j );
786                         v1 = _mm_loadu_ps( src + j + 4 );
787                         v0 = _mm_cmple_ps( v0, thresh4 );
788                         v1 = _mm_cmple_ps( v1, thresh4 );
789                         v0 = _mm_and_ps( v0, maxval4 );
790                         v1 = _mm_and_ps( v1, maxval4 );
791                         _mm_storeu_ps( dst + j, v0 );
792                         _mm_storeu_ps( dst + j + 4, v1 );
793                     }
794                 }
795 #elif CV_NEON
796                 float32x4_t v_thresh = vdupq_n_f32(thresh);
797                 uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
798
799                 for( ; j <= roi.width - 4; j += 4 )
800                 {
801                     float32x4_t v_src = vld1q_f32(src + j);
802                     uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
803                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
804                 }
805 #endif
806
807                 for( ; j < roi.width; j++ )
808                     dst[j] = src[j] <= thresh ? maxval : 0;
809             }
810             break;
811
812         case THRESH_TRUNC:
813             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
814             {
815                 j = 0;
816 #if CV_SSE
817                 if( useSIMD )
818                 {
819                     __m128 thresh4 = _mm_set1_ps(thresh);
820                     for( ; j <= roi.width - 8; j += 8 )
821                     {
822                         __m128 v0, v1;
823                         v0 = _mm_loadu_ps( src + j );
824                         v1 = _mm_loadu_ps( src + j + 4 );
825                         v0 = _mm_min_ps( v0, thresh4 );
826                         v1 = _mm_min_ps( v1, thresh4 );
827                         _mm_storeu_ps( dst + j, v0 );
828                         _mm_storeu_ps( dst + j + 4, v1 );
829                     }
830                 }
831 #elif CV_NEON
832                 float32x4_t v_thresh = vdupq_n_f32(thresh);
833
834                 for( ; j <= roi.width - 4; j += 4 )
835                     vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
836 #endif
837
838                 for( ; j < roi.width; j++ )
839                     dst[j] = std::min(src[j], thresh);
840             }
841             break;
842
843         case THRESH_TOZERO:
844             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
845             {
846                 j = 0;
847 #if CV_SSE
848                 if( useSIMD )
849                 {
850                     __m128 thresh4 = _mm_set1_ps(thresh);
851                     for( ; j <= roi.width - 8; j += 8 )
852                     {
853                         __m128 v0, v1;
854                         v0 = _mm_loadu_ps( src + j );
855                         v1 = _mm_loadu_ps( src + j + 4 );
856                         v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
857                         v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
858                         _mm_storeu_ps( dst + j, v0 );
859                         _mm_storeu_ps( dst + j + 4, v1 );
860                     }
861                 }
862 #elif CV_NEON
863                 float32x4_t v_thresh = vdupq_n_f32(thresh);
864
865                 for( ; j <= roi.width - 4; j += 4 )
866                 {
867                     float32x4_t v_src = vld1q_f32(src + j);
868                     uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
869                                                  vreinterpretq_u32_f32(v_src));
870                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
871                 }
872 #endif
873
874                 for( ; j < roi.width; j++ )
875                 {
876                     float v = src[j];
877                     dst[j] = v > thresh ? v : 0;
878                 }
879             }
880             break;
881
882         case THRESH_TOZERO_INV:
883             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
884             {
885                 j = 0;
886 #if CV_SSE
887                 if( useSIMD )
888                 {
889                     __m128 thresh4 = _mm_set1_ps(thresh);
890                     for( ; j <= roi.width - 8; j += 8 )
891                     {
892                         __m128 v0, v1;
893                         v0 = _mm_loadu_ps( src + j );
894                         v1 = _mm_loadu_ps( src + j + 4 );
895                         v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
896                         v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
897                         _mm_storeu_ps( dst + j, v0 );
898                         _mm_storeu_ps( dst + j + 4, v1 );
899                     }
900                 }
901 #elif CV_NEON
902                 float32x4_t v_thresh = vdupq_n_f32(thresh);
903
904                 for( ; j <= roi.width - 4; j += 4 )
905                 {
906                     float32x4_t v_src = vld1q_f32(src + j);
907                     uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
908                                                  vreinterpretq_u32_f32(v_src));
909                     vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
910                 }
911 #endif
912                 for( ; j < roi.width; j++ )
913                 {
914                     float v = src[j];
915                     dst[j] = v <= thresh ? v : 0;
916                 }
917             }
918             break;
919         default:
920             return CV_Error( CV_StsBadArg, "Unknown threshold type" );
921     }
922 }
923
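// 64-bit double thresholding kernel. Vectorized only where double-precision
// SIMD compares are available (SSE2, or NEON on AArch64); otherwise the
// scalar loops do all the work.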
924 static void
925 thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
926 {
927     int i, j;
928     Size roi = _src.size();
929     roi.width *= _src.channels();
930     const double* src = _src.ptr<double>();
931     double* dst = _dst.ptr<double>();
932     size_t src_step = _src.step / sizeof(src[0]);
933     size_t dst_step = _dst.step / sizeof(dst[0]);
934
935 #if CV_SSE2
936     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
937 #endif
938
939     if (_src.isContinuous() && _dst.isContinuous())
940     {
941         roi.width *= roi.height;
942         roi.height = 1;
943     }
944
945     switch (type)
946     {
947     case THRESH_BINARY:
948         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
949         {
950             j = 0;
951 #if CV_SSE2
952             if( useSIMD )
953             {
954                 __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
955                 for( ; j <= roi.width - 8; j += 8 )
956                 {
957                     __m128d v0, v1, v2, v3;
958                     v0 = _mm_loadu_pd( src + j );
959                     v1 = _mm_loadu_pd( src + j + 2 );
960                     v2 = _mm_loadu_pd( src + j + 4 );
961                     v3 = _mm_loadu_pd( src + j + 6 );
962                     v0 = _mm_cmpgt_pd( v0, thresh2 );
963                     v1 = _mm_cmpgt_pd( v1, thresh2 );
964                     v2 = _mm_cmpgt_pd( v2, thresh2 );
965                     v3 = _mm_cmpgt_pd( v3, thresh2 );
966                     v0 = _mm_and_pd( v0, maxval2 );
967                     v1 = _mm_and_pd( v1, maxval2 );
968                     v2 = _mm_and_pd( v2, maxval2 );
969                     v3 = _mm_and_pd( v3, maxval2 );
970                     _mm_storeu_pd( dst + j, v0 );
971                     _mm_storeu_pd( dst + j + 2, v1 );
972                     _mm_storeu_pd( dst + j + 4, v2 );
973                     _mm_storeu_pd( dst + j + 6, v3 );
974                 }
975             }
976 #elif CV_NEON && defined(__aarch64__)
977             float64x2_t v_thresh = vdupq_n_f64(thresh);
978             uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
979
980             for( ; j <= roi.width - 4; j += 4 )
981             {
982                 float64x2_t v_src0 = vld1q_f64(src + j);
983                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
984                 uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval);
985                 uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval);
986                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
987                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
988             }
989 #endif
990
991             for (; j < roi.width; j++)
992                 dst[j] = src[j] > thresh ? maxval : 0;
993         }
994         break;
995
996     case THRESH_BINARY_INV:
997         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
998         {
999             j = 0;
1000
1001 #if CV_SSE2
1002             if( useSIMD )
1003             {
1004                 __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
1005                 for( ; j <= roi.width - 8; j += 8 )
1006                 {
1007                     __m128d v0, v1, v2, v3;
1008                     v0 = _mm_loadu_pd( src + j );
1009                     v1 = _mm_loadu_pd( src + j + 2 );
1010                     v2 = _mm_loadu_pd( src + j + 4 );
1011                     v3 = _mm_loadu_pd( src + j + 6 );
1012                     v0 = _mm_cmple_pd( v0, thresh2 );
1013                     v1 = _mm_cmple_pd( v1, thresh2 );
1014                     v2 = _mm_cmple_pd( v2, thresh2 );
1015                     v3 = _mm_cmple_pd( v3, thresh2 );
1016                     v0 = _mm_and_pd( v0, maxval2 );
1017                     v1 = _mm_and_pd( v1, maxval2 );
1018                     v2 = _mm_and_pd( v2, maxval2 );
1019                     v3 = _mm_and_pd( v3, maxval2 );
1020                     _mm_storeu_pd( dst + j, v0 );
1021                     _mm_storeu_pd( dst + j + 2, v1 );
1022                     _mm_storeu_pd( dst + j + 4, v2 );
1023                     _mm_storeu_pd( dst + j + 6, v3 );
1024                 }
1025             }
1026 #elif CV_NEON && defined(__aarch64__)
1027             float64x2_t v_thresh = vdupq_n_f64(thresh);
1028             uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
1029
1030             for( ; j <= roi.width - 4; j += 4 )
1031             {
1032                 float64x2_t v_src0 = vld1q_f64(src + j);
1033                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1034                 uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval);
1035                 uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval);
1036                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1037                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1038             }
1039 #endif
1040             for (; j < roi.width; j++)
1041                 dst[j] = src[j] <= thresh ? maxval : 0;
1042         }
1043         break;
1044
1045     case THRESH_TRUNC:
1046         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1047         {
1048             j = 0;
1049
1050 #if CV_SSE2
1051             if( useSIMD )
1052             {
1053                 __m128d thresh2 = _mm_set1_pd(thresh);
1054                 for( ; j <= roi.width - 8; j += 8 )
1055                 {
1056                     __m128d v0, v1, v2, v3;
1057                     v0 = _mm_loadu_pd( src + j );
1058                     v1 = _mm_loadu_pd( src + j + 2 );
1059                     v2 = _mm_loadu_pd( src + j + 4 );
1060                     v3 = _mm_loadu_pd( src + j + 6 );
1061                     v0 = _mm_min_pd( v0, thresh2 );
1062                     v1 = _mm_min_pd( v1, thresh2 );
1063                     v2 = _mm_min_pd( v2, thresh2 );
1064                     v3 = _mm_min_pd( v3, thresh2 );
1065                     _mm_storeu_pd( dst + j, v0 );
1066                     _mm_storeu_pd( dst + j + 2, v1 );
1067                     _mm_storeu_pd( dst + j + 4, v2 );
1068                     _mm_storeu_pd( dst + j + 6, v3 );
1069                 }
1070             }
1071 #elif CV_NEON && defined(__aarch64__)
1072             float64x2_t v_thresh = vdupq_n_f64(thresh);
1073
1074             for( ; j <= roi.width - 4; j += 4 )
1075             {
1076                 float64x2_t v_src0 = vld1q_f64(src + j);
1077                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1078                 float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh);
1079                 float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh);
1080                 vst1q_f64(dst + j, v_dst0);
1081                 vst1q_f64(dst + j + 2, v_dst1);
1082             }
1083 #endif
1084             for (; j < roi.width; j++)
1085                 dst[j] = std::min(src[j], thresh);
1086         }
1087         break;
1088
1089     case THRESH_TOZERO:
1090         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1091         {
1092             j = 0;
1093
1094 #if CV_SSE2
1095             if( useSIMD )
1096             {
1097                 __m128d thresh2 = _mm_set1_pd(thresh);
1098                 for( ; j <= roi.width - 8; j += 8 )
1099                 {
1100                     __m128d v0, v1, v2, v3;
1101                     v0 = _mm_loadu_pd( src + j );
1102                     v1 = _mm_loadu_pd( src + j + 2 );
1103                     v2 = _mm_loadu_pd( src + j + 4 );
1104                     v3 = _mm_loadu_pd( src + j + 6 );
1105                     v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2));
1106                     v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2));
1107                     v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2));
1108                     v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2));
1109                     _mm_storeu_pd( dst + j, v0 );
1110                     _mm_storeu_pd( dst + j + 2, v1 );
1111                     _mm_storeu_pd( dst + j + 4, v2 );
1112                     _mm_storeu_pd( dst + j + 6, v3 );
1113                 }
1114             }
1115 #elif CV_NEON && defined(__aarch64__)
1116             float64x2_t v_thresh = vdupq_n_f64(thresh);
1117
1118             for( ; j <= roi.width - 4; j += 4 )
1119             {
1120                 float64x2_t v_src0 = vld1q_f64(src + j);
1121                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1122                 uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh),
1123                                               vreinterpretq_u64_f64(v_src0));
1124                 uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh),
1125                                               vreinterpretq_u64_f64(v_src1));
1126                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1127                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1128             }
1129 #endif
1130             for (; j < roi.width; j++)
1131             {
1132                 double v = src[j];
1133                 dst[j] = v > thresh ? v : 0;
1134             }
1135         }
1136         break;
1137
1138     case THRESH_TOZERO_INV:
1139         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
1140         {
1141             j = 0;
1142
1143 #if CV_SSE2
1144             if( useSIMD )
1145             {
1146                 __m128d thresh2 = _mm_set1_pd(thresh);
1147                 for( ; j <= roi.width - 8; j += 8 )
1148                 {
1149                     __m128d v0, v1, v2, v3;
1150                     v0 = _mm_loadu_pd( src + j );
1151                     v1 = _mm_loadu_pd( src + j + 2 );
1152                     v2 = _mm_loadu_pd( src + j + 4 );
1153                     v3 = _mm_loadu_pd( src + j + 6 );
1154                     v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2));
1155                     v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2));
1156                     v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2));
1157                     v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2));
1158                     _mm_storeu_pd( dst + j, v0 );
1159                     _mm_storeu_pd( dst + j + 2, v1 );
1160                     _mm_storeu_pd( dst + j + 4, v2 );
1161                     _mm_storeu_pd( dst + j + 6, v3 );
1162                 }
1163             }
1164 #elif CV_NEON && defined(__aarch64__)
1165             float64x2_t v_thresh = vdupq_n_f64(thresh);
1166
1167             for( ; j <= roi.width - 4; j += 4 )
1168             {
1169                 float64x2_t v_src0 = vld1q_f64(src + j);
1170                 float64x2_t v_src1 = vld1q_f64(src + j + 2);
1171                 uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh),
1172                                               vreinterpretq_u64_f64(v_src0));
1173                 uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh),
1174                                               vreinterpretq_u64_f64(v_src1));
1175                 vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
1176                 vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
1177             }
1178 #endif
1179             for (; j < roi.width; j++)
1180             {
1181                 double v = src[j];
1182                 dst[j] = v <= thresh ? v : 0;
1183             }
1184         }
1185         break;
1186     default:
1187         return CV_Error(CV_StsBadArg, "Unknown threshold type");
1188     }
1189 }
1190
1191 #ifdef HAVE_IPP
1192 static bool ipp_getThreshVal_Otsu_8u( const unsigned char* _src, int step, Size size, unsigned char &thresh)
1193 {
1194 #if IPP_VERSION_X100 >= 810 && !HAVE_ICV
1195     int ippStatus = -1;
1196     IppiSize srcSize = { size.width, size.height };
1197     CV_SUPPRESS_DEPRECATED_START
1198     ippStatus = ippiComputeThreshold_Otsu_8u_C1R(_src, step, srcSize, &thresh);
1199     CV_SUPPRESS_DEPRECATED_END
1200
1201     if(ippStatus >= 0)
1202         return true;
1203 #else
1204     CV_UNUSED(_src); CV_UNUSED(step); CV_UNUSED(size); CV_UNUSED(thresh);
1205 #endif
1206     return false;
1207 }
1208 #endif
1209
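// Otsu's method: build a 256-bin histogram of the image, then scan all
// candidate thresholds t and keep the one maximizing the between-class
// variance
//     sigma_b^2(t) = q1(t) * q2(t) * (mu1(t) - mu2(t))^2
// where q1, q2 are the class probabilities and mu1, mu2 the class means.
// The loop below updates q1 and mu1 incrementally per bin.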
1210 static double
1211 getThreshVal_Otsu_8u( const Mat& _src )
1212 {
1213     Size size = _src.size();
1214     int step = (int) _src.step;
1215     if( _src.isContinuous() )
1216     {
1217         size.width *= size.height;
1218         size.height = 1;
1219         step = size.width;
1220     }
1221
1222 #ifdef HAVE_IPP
1223     unsigned char thresh;
1224     CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !HAVE_ICV, ipp_getThreshVal_Otsu_8u(_src.ptr(), step, size, thresh), thresh);
1225 #endif
1226
1227     const int N = 256;
1228     int i, j, h[N] = {0};
1229     for( i = 0; i < size.height; i++ )
1230     {
1231         const uchar* src = _src.ptr() + step*i;
1232         j = 0;
1233         #if CV_ENABLE_UNROLLED
1234         for( ; j <= size.width - 4; j += 4 )
1235         {
1236             int v0 = src[j], v1 = src[j+1];
1237             h[v0]++; h[v1]++;
1238             v0 = src[j+2]; v1 = src[j+3];
1239             h[v0]++; h[v1]++;
1240         }
1241         #endif
1242         for( ; j < size.width; j++ )
1243             h[src[j]]++;
1244     }
1245
1246     double mu = 0, scale = 1./(size.width*size.height);
1247     for( i = 0; i < N; i++ )
1248         mu += i*(double)h[i];
1249
1250     mu *= scale;
1251     double mu1 = 0, q1 = 0;
1252     double max_sigma = 0, max_val = 0;
1253
1254     for( i = 0; i < N; i++ )
1255     {
1256         double p_i, q2, mu2, sigma;
1257
1258         p_i = h[i]*scale;
1259         mu1 *= q1;
1260         q1 += p_i;
1261         q2 = 1. - q1;
1262
1263         if( std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON )
1264             continue;
1265
1266         mu1 = (mu1 + i*p_i)/q1;
1267         mu2 = (mu - q1*mu1)/q2;
1268         sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2);
1269         if( sigma > max_sigma )
1270         {
1271             max_sigma = sigma;
1272             max_val = i;
1273         }
1274     }
1275
1276     return max_val;
1277 }
1278
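// Triangle method: locate the histogram peak and the outermost non-empty
// bins, flip the histogram if needed so the longer tail lies below the peak,
// and pick the bin whose point (i, h[i]) is farthest from the straight line
// joining the peak to the end of that tail.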
1279 static double
1280 getThreshVal_Triangle_8u( const Mat& _src )
1281 {
1282     Size size = _src.size();
1283     int step = (int) _src.step;
1284     if( _src.isContinuous() )
1285     {
1286         size.width *= size.height;
1287         size.height = 1;
1288         step = size.width;
1289     }
1290
1291     const int N = 256;
1292     int i, j, h[N] = {0};
1293     for( i = 0; i < size.height; i++ )
1294     {
1295         const uchar* src = _src.ptr() + step*i;
1296         j = 0;
1297         #if CV_ENABLE_UNROLLED
1298         for( ; j <= size.width - 4; j += 4 )
1299         {
1300             int v0 = src[j], v1 = src[j+1];
1301             h[v0]++; h[v1]++;
1302             v0 = src[j+2]; v1 = src[j+3];
1303             h[v0]++; h[v1]++;
1304         }
1305         #endif
1306         for( ; j < size.width; j++ )
1307             h[src[j]]++;
1308     }
1309
1310     int left_bound = 0, right_bound = 0, max_ind = 0, max = 0;
1311     int temp;
1312     bool isflipped = false;
1313
1314     for( i = 0; i < N; i++ )
1315     {
1316         if( h[i] > 0 )
1317         {
1318             left_bound = i;
1319             break;
1320         }
1321     }
1322     if( left_bound > 0 )
1323         left_bound--;
1324
1325     for( i = N-1; i > 0; i-- )
1326     {
1327         if( h[i] > 0 )
1328         {
1329             right_bound = i;
1330             break;
1331         }
1332     }
1333     if( right_bound < N-1 )
1334         right_bound++;
1335
1336     for( i = 0; i < N; i++ )
1337     {
1338         if( h[i] > max)
1339         {
1340             max = h[i];
1341             max_ind = i;
1342         }
1343     }
1344
1345     if( max_ind-left_bound < right_bound-max_ind)
1346     {
1347         isflipped = true;
1348         i = 0, j = N-1;
1349         while( i < j )
1350         {
1351             temp = h[i]; h[i] = h[j]; h[j] = temp;
1352             i++; j--;
1353         }
1354         left_bound = N-1-right_bound;
1355         max_ind = N-1-max_ind;
1356     }
1357
1358     double thresh = left_bound;
1359     double a, b, dist = 0, tempdist;
1360
1361     /*
1362      * We do not need the exact point-to-line distance here; only its maximum matters, so constant
1363      * factors can be omitted. This speeds up the computation a bit.
1364      */
1365     a = max; b = left_bound-max_ind;
1366     for( i = left_bound+1; i <= max_ind; i++ )
1367     {
1368         tempdist = a*i + b*h[i];
1369         if( tempdist > dist)
1370         {
1371             dist = tempdist;
1372             thresh = i;
1373         }
1374     }
1375     thresh--;
1376
1377     if( isflipped )
1378         thresh = N-1-thresh;
1379
1380     return thresh;
1381 }
1382
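// ParallelLoopBody that applies the depth-specific thresh_* kernel to a
// horizontal stripe of rows, so thresholding can be split across threads
// with cv::parallel_for_.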
1383 class ThresholdRunner : public ParallelLoopBody
1384 {
1385 public:
1386     ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType)
1387     {
1388         src = _src;
1389         dst = _dst;
1390
1391         thresh = _thresh;
1392         maxval = _maxval;
1393         thresholdType = _thresholdType;
1394     }
1395
1396     void operator () ( const Range& range ) const
1397     {
1398         int row0 = range.start;
1399         int row1 = range.end;
1400
1401         Mat srcStripe = src.rowRange(row0, row1);
1402         Mat dstStripe = dst.rowRange(row0, row1);
1403
1404         if (srcStripe.depth() == CV_8U)
1405         {
1406             thresh_8u( srcStripe, dstStripe, (uchar)thresh, (uchar)maxval, thresholdType );
1407         }
1408         else if( srcStripe.depth() == CV_16S )
1409         {
1410             thresh_16s( srcStripe, dstStripe, (short)thresh, (short)maxval, thresholdType );
1411         }
1412         else if( srcStripe.depth() == CV_32F )
1413         {
1414             thresh_32f( srcStripe, dstStripe, (float)thresh, (float)maxval, thresholdType );
1415         }
1416         else if( srcStripe.depth() == CV_64F )
1417         {
1418             thresh_64f(srcStripe, dstStripe, thresh, maxval, thresholdType);
1419         }
1420     }
1421
1422 private:
1423     Mat src;
1424     Mat dst;
1425
1426     double thresh;
1427     double maxval;
1428     int thresholdType;
1429 };
1430
1431 #ifdef HAVE_OPENCL
1432
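// OpenCL path: compiles the imgproc "threshold" kernel with the threshold
// type, element type and stride size baked in as build options, and passes
// thresh, maxval and the depth's minimum value as 1x1 constant Mats.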
1433 static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
1434 {
1435     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
1436         kercn = ocl::predictOptimalVectorWidth(_src, _dst), ktype = CV_MAKE_TYPE(depth, kercn);
1437     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
1438
1439     if ( !(thresh_type == THRESH_BINARY || thresh_type == THRESH_BINARY_INV || thresh_type == THRESH_TRUNC ||
1440            thresh_type == THRESH_TOZERO || thresh_type == THRESH_TOZERO_INV) ||
1441          (!doubleSupport && depth == CV_64F))
1442         return false;
1443
1444     const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
1445                                           "THRESH_TOZERO", "THRESH_TOZERO_INV" };
1446     ocl::Device dev = ocl::Device::getDefault();
1447     int stride_size = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
1448
1449     ocl::Kernel k("threshold", ocl::imgproc::threshold_oclsrc,
1450                   format("-D %s -D T=%s -D T1=%s -D STRIDE_SIZE=%d%s", thresholdMap[thresh_type],
1451                          ocl::typeToStr(ktype), ocl::typeToStr(depth), stride_size,
1452                          doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
1453     if (k.empty())
1454         return false;
1455
1456     UMat src = _src.getUMat();
1457     _dst.create(src.size(), type);
1458     UMat dst = _dst.getUMat();
1459
1460     if (depth <= CV_32S)
1461         thresh = cvFloor(thresh);
1462
1463     const double min_vals[] = { 0, CHAR_MIN, 0, SHRT_MIN, INT_MIN, -FLT_MAX, -DBL_MAX, 0 };
1464     double min_val = min_vals[depth];
1465
1466     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn),
1467            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(thresh))),
1468            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(maxval))),
1469            ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(min_val))));
1470
1471     size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, (size_t)dst.rows };
1472     globalsize[1] = (globalsize[1] + stride_size - 1) / stride_size;
1473     return k.run(2, globalsize, NULL, false);
1474 }
1475
1476 #endif
1477
1478 }
1479
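/* cv::threshold: apply a fixed-level threshold to every element of _src.
   If CV_THRESH_OTSU or CV_THRESH_TRIANGLE is OR-ed into `type` (combining both is rejected),
   the input must be CV_8UC1 and the level is computed from the image histogram by
   getThreshVal_Otsu_8u / getThreshVal_Triangle_8u; the level actually used is returned.

   Illustrative usage (a sketch, not part of this file; "input.png" is a placeholder):

       Mat gray = imread("input.png", IMREAD_GRAYSCALE);
       Mat bw;
       double t = threshold(gray, bw, 0, 255, THRESH_BINARY | THRESH_OTSU);
*/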
1480 double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
1481 {
1482     CV_OCL_RUN_(_src.dims() <= 2 && _dst.isUMat(),
1483                 ocl_threshold(_src, _dst, thresh, maxval, type), thresh)
1484
1485     Mat src = _src.getMat();
1486     int automatic_thresh = (type & ~CV_THRESH_MASK);
1487     type &= THRESH_MASK;
1488
1489     CV_Assert( automatic_thresh != (CV_THRESH_OTSU | CV_THRESH_TRIANGLE) );
1490     if( automatic_thresh == CV_THRESH_OTSU )
1491     {
1492         CV_Assert( src.type() == CV_8UC1 );
1493         thresh = getThreshVal_Otsu_8u( src );
1494     }
1495     else if( automatic_thresh == CV_THRESH_TRIANGLE )
1496     {
1497         CV_Assert( src.type() == CV_8UC1 );
1498         thresh = getThreshVal_Triangle_8u( src );
1499     }
1500
1501     _dst.create( src.size(), src.type() );
1502     Mat dst = _dst.getMat();
1503
1504     if( src.depth() == CV_8U )
1505     {
1506         int ithresh = cvFloor(thresh);
1507         thresh = ithresh;
1508         int imaxval = cvRound(maxval);
1509         if( type == THRESH_TRUNC )
1510             imaxval = ithresh;
1511         imaxval = saturate_cast<uchar>(imaxval);
1512
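        // If the floored threshold lies outside the 8-bit range, every pixel compares the
        // same way, so the output is either a constant image or an unchanged copy of the
        // source; handle those cases here and skip the per-pixel pass entirely.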
1513         if( ithresh < 0 || ithresh >= 255 )
1514         {
1515             if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
1516                 ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < 0) ||
1517                 (type == THRESH_TOZERO && ithresh >= 255) )
1518             {
1519                 int v = type == THRESH_BINARY ? (ithresh >= 255 ? 0 : imaxval) :
1520                         type == THRESH_BINARY_INV ? (ithresh >= 255 ? imaxval : 0) :
1521                         /*type == THRESH_TRUNC ? imaxval :*/ 0;
1522                 dst.setTo(v);
1523             }
1524             else
1525                 src.copyTo(dst);
1526             return thresh;
1527         }
1528         thresh = ithresh;
1529         maxval = imaxval;
1530     }
1531     else if( src.depth() == CV_16S )
1532     {
1533         int ithresh = cvFloor(thresh);
1534         thresh = ithresh;
1535         int imaxval = cvRound(maxval);
1536         if( type == THRESH_TRUNC )
1537             imaxval = ithresh;
1538         imaxval = saturate_cast<short>(imaxval);
1539
1540         if( ithresh < SHRT_MIN || ithresh >= SHRT_MAX )
1541         {
1542             if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
1543                ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < SHRT_MIN) ||
1544                (type == THRESH_TOZERO && ithresh >= SHRT_MAX) )
1545             {
1546                 int v = type == THRESH_BINARY ? (ithresh >= SHRT_MAX ? 0 : imaxval) :
1547                         type == THRESH_BINARY_INV ? (ithresh >= SHRT_MAX ? imaxval : 0) :
1548                         /*type == THRESH_TRUNC ? imaxval :*/ 0;
1549                 dst.setTo(v);
1550             }
1551             else
1552                 src.copyTo(dst);
1553             return thresh;
1554         }
1555         thresh = ithresh;
1556         maxval = imaxval;
1557     }
1558     else if( src.depth() == CV_32F )
1559         ;
1560     else if( src.depth() == CV_64F )
1561         ;
1562     else
1563         CV_Error( CV_StsUnsupportedFormat, "" );
1564
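    // Split the image into row stripes; the nstripes hint of total()/2^16 aims at
    // roughly 64K elements of work per stripe.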
1565     parallel_for_(Range(0, dst.rows),
1566                   ThresholdRunner(src, dst, thresh, maxval, type),
1567                   dst.total()/(double)(1<<16));
1568     return thresh;
1569 }
1570
1571
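/* cv::adaptiveThreshold: threshold each pixel of a CV_8UC1 image against a local level
   T(x,y) = mean of the blockSize x blockSize neighbourhood minus delta, where the mean is
   either a box filter (ADAPTIVE_THRESH_MEAN_C) or a Gaussian-weighted mean
   (ADAPTIVE_THRESH_GAUSSIAN_C). With THRESH_BINARY the output is maxValue where
   src > T(x,y) and 0 elsewhere; THRESH_BINARY_INV inverts that. blockSize must be odd
   and greater than 1.

   Illustrative usage (a sketch, not part of this file; `gray` is assumed to be CV_8UC1):

       Mat bw;
       adaptiveThreshold(gray, bw, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 11, 2);
*/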
1572 void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
1573                             int method, int type, int blockSize, double delta )
1574 {
1575     Mat src = _src.getMat();
1576     CV_Assert( src.type() == CV_8UC1 );
1577     CV_Assert( blockSize % 2 == 1 && blockSize > 1 );
1578     Size size = src.size();
1579
1580     _dst.create( size, src.type() );
1581     Mat dst = _dst.getMat();
1582
1583     if( maxValue < 0 )
1584     {
1585         dst = Scalar(0);
1586         return;
1587     }
1588
1589     Mat mean;
1590
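    // When the call is not in-place, dst doubles as the buffer that receives the local mean.
    // ADAPTIVE_THRESH_MEAN_C uses a normalized box filter; ADAPTIVE_THRESH_GAUSSIAN_C blurs
    // a CV_32F copy (presumably to avoid intermediate 8-bit rounding) and converts back.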
1591     if( src.data != dst.data )
1592         mean = dst;
1593
1594     if (method == ADAPTIVE_THRESH_MEAN_C)
1595         boxFilter( src, mean, src.type(), Size(blockSize, blockSize),
1596                    Point(-1,-1), true, BORDER_REPLICATE );
1597     else if (method == ADAPTIVE_THRESH_GAUSSIAN_C)
1598     {
1599         Mat srcfloat, meanfloat;
1600         src.convertTo(srcfloat, CV_32F);
1601         meanfloat = srcfloat;
1602         GaussianBlur(srcfloat, meanfloat, Size(blockSize, blockSize), 0, 0, BORDER_REPLICATE);
1603         meanfloat.convertTo(mean, src.type());
1604     }
1605     else
1606         CV_Error( CV_StsBadFlag, "Unknown/unsupported adaptive threshold method" );
1607
1608     int i, j;
1609     uchar imaxval = saturate_cast<uchar>(maxValue);
1610     int idelta = type == THRESH_BINARY ? cvCeil(delta) : cvFloor(delta);
1611     uchar tab[768];
1612
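    // The test depends only on the difference src - mean, which lies in [-255, 255]; adding
    // 255 maps it to an index into `tab`, so the pixel loop below does a single table
    // lookup per pixel.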
1613     if( type == CV_THRESH_BINARY )
1614         for( i = 0; i < 768; i++ )
1615             tab[i] = (uchar)(i - 255 > -idelta ? imaxval : 0);
1616     else if( type == CV_THRESH_BINARY_INV )
1617         for( i = 0; i < 768; i++ )
1618             tab[i] = (uchar)(i - 255 <= -idelta ? imaxval : 0);
1619     else
1620         CV_Error( CV_StsBadFlag, "Unknown/unsupported threshold type" );
1621
1622     if( src.isContinuous() && mean.isContinuous() && dst.isContinuous() )
1623     {
1624         size.width *= size.height;
1625         size.height = 1;
1626     }
1627
1628     for( i = 0; i < size.height; i++ )
1629     {
1630         const uchar* sdata = src.ptr(i);
1631         const uchar* mdata = mean.ptr(i);
1632         uchar* ddata = dst.ptr(i);
1633
1634         for( j = 0; j < size.width; j++ )
1635             ddata[j] = tab[sdata[j] - mdata[j] + 255];
1636     }
1637 }
1638
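/* Legacy C API wrapper. The destination may be CV_8U even when the source is deeper; in
   that case cv::threshold writes into a temporary of the source depth and the result is
   converted back into the original destination. */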
1639 CV_IMPL double
1640 cvThreshold( const void* srcarr, void* dstarr, double thresh, double maxval, int type )
1641 {
1642     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
1643
1644     CV_Assert( src.size == dst.size && src.channels() == dst.channels() &&
1645         (src.depth() == dst.depth() || dst.depth() == CV_8U));
1646
1647     thresh = cv::threshold( src, dst, thresh, maxval, type );
1648     if( dst0.data != dst.data )
1649         dst.convertTo( dst0, dst0.depth() );
1650     return thresh;
1651 }
1652
1653
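/* Legacy C API wrapper around cv::adaptiveThreshold; source and destination must already
   have the same size and type. */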
1654 CV_IMPL void
1655 cvAdaptiveThreshold( const void *srcIm, void *dstIm, double maxValue,
1656                      int method, int type, int blockSize, double delta )
1657 {
1658     cv::Mat src = cv::cvarrToMat(srcIm), dst = cv::cvarrToMat(dstIm);
1659     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1660     cv::adaptiveThreshold( src, dst, maxValue, method, type, blockSize, delta );
1661 }
1662
1663 /* End of file. */