/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{

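/* Scalar reference semantics implemented by the workers below, for a
   threshold T and user-supplied value M (matching the cv::threshold docs):

     THRESH_BINARY:      dst(x) = src(x) > T ? M      : 0
     THRESH_BINARY_INV:  dst(x) = src(x) > T ? 0      : M
     THRESH_TRUNC:       dst(x) = src(x) > T ? T      : src(x)
     THRESH_TOZERO:      dst(x) = src(x) > T ? src(x) : 0
     THRESH_TOZERO_INV:  dst(x) = src(x) > T ? 0      : src(x)
*/
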
static void
thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
{
    int i, j, j_scalar = 0;
    uchar tab[256];
    Size roi = _src.size();
    roi.width *= _src.channels();
    size_t src_step = _src.step;
    size_t dst_step = _dst.step;

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::thresh_8u(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        CV_SUPPRESS_DEPRECATED_START
        switch( type )
        {
        case THRESH_TRUNC:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GT_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GT_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_LTVal_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_LTVal_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GTVal_8u_C1IR(_dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GTVal_8u_C1R(_src.ptr(), (int)src_step, _dst.ptr(), (int)dst_step, sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
        CV_SUPPRESS_DEPRECATED_END
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i <= thresh; i++ )
            tab[i] = 0;
        for( ; i < 256; i++ )
            tab[i] = maxval;
        break;
    case THRESH_BINARY_INV:
        for( i = 0; i <= thresh; i++ )
            tab[i] = maxval;
        for( ; i < 256; i++ )
            tab[i] = 0;
        break;
    case THRESH_TRUNC:
        for( i = 0; i <= thresh; i++ )
            tab[i] = (uchar)i;
        for( ; i < 256; i++ )
            tab[i] = thresh;
        break;
    case THRESH_TOZERO:
        for( i = 0; i <= thresh; i++ )
            tab[i] = 0;
        for( ; i < 256; i++ )
            tab[i] = (uchar)i;
        break;
    case THRESH_TOZERO_INV:
        for( i = 0; i <= thresh; i++ )
            tab[i] = (uchar)i;
        for( ; i < 256; i++ )
            tab[i] = 0;
        break;
    default:
        CV_Error( CV_StsBadArg, "Unknown threshold type" );
    }
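
    // The table turns the per-pixel branch into one indexed load. Worked
    // example (illustrative): with type == THRESH_BINARY, thresh == 100 and
    // maxval == 255 the table ends up as
    //   tab[0..100] == 0,   tab[101..255] == 255,
    // so the scalar tail at the end of this function computes
    //   dst[j] = tab[src[j]]  ==  (src[j] > 100 ? 255 : 0).
    //
    // The SSE2 block below needs an unsigned byte compare, which SSE2 lacks;
    // it flips the sign bit (v ^ 0x80), mapping unsigned [0,255]
    // monotonically onto signed [-128,127], and then uses the signed compare
    // _mm_cmpgt_epi8. E.g. 200 > 100 in unsigned arithmetic becomes
    // 72 > -28 after the flip, which still holds.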

#if CV_SSE2
    if( checkHardwareSupport(CV_CPU_SSE2) )
    {
        __m128i _x80 = _mm_set1_epi8('\x80');
        __m128i thresh_u = _mm_set1_epi8(thresh);
        __m128i thresh_s = _mm_set1_epi8(thresh ^ 0x80);
        __m128i maxval_ = _mm_set1_epi8(maxval);
        j_scalar = roi.width & -8;

        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            switch( type )
            {
            case THRESH_BINARY:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
                    v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
                    v0 = _mm_and_si128( v0, maxval_ );
                    v1 = _mm_and_si128( v1, maxval_ );
                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
                }

                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
                    v0 = _mm_and_si128( v0, maxval_ );
                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
                }
                break;

            case THRESH_BINARY_INV:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
                    v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
                    v0 = _mm_andnot_si128( v0, maxval_ );
                    v1 = _mm_andnot_si128( v1, maxval_ );
                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
                }

                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
                    v0 = _mm_andnot_si128( v0, maxval_ );
                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
                }
                break;

            case THRESH_TRUNC:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
                    v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
                    v1 = _mm_subs_epu8( v1, _mm_subs_epu8( v1, thresh_u ));
                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
                }

                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
                    v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
                }
                break;

            case THRESH_TOZERO:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
                    v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
                    v1 = _mm_and_si128( v1, _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ));
                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
                }

                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
                    v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
                }
                break;

            case THRESH_TOZERO_INV:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
                    v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
                    v1 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ), v1 );
                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
                }

                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
                    v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
                }
                break;
            }
        }
    }
#elif CV_NEON
    uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval);

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
                vst1q_u8(dst + j_scalar, vandq_u8(vcgtq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval));
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
                vst1q_u8(dst + j_scalar, vandq_u8(vcleq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval));
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
                vst1q_u8(dst + j_scalar, vminq_u8(vld1q_u8(src + j_scalar), v_thresh));
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcgtq_u8(v_src, v_thresh);
                vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src));
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;

            for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcleq_u8(v_src, v_thresh);
                vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src));
            }
        }
        break;
    default:
        return CV_Error( CV_StsBadArg, "" );
    }
#endif

    if( j_scalar < roi.width )
    {
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = _src.ptr() + src_step*i;
            uchar* dst = _dst.ptr() + dst_step*i;
            j = j_scalar;
#if CV_ENABLE_UNROLLED
            for( ; j <= roi.width - 4; j += 4 )
            {
                uchar t0 = tab[src[j]];
                uchar t1 = tab[src[j+1]];

                dst[j] = t0;
                dst[j+1] = t1;

                t0 = tab[src[j+2]];
                t1 = tab[src[j+3]];

                dst[j+2] = t0;
                dst[j+3] = t1;
            }
#endif
            for( ; j < roi.width; j++ )
                dst[j] = tab[src[j]];
        }
    }
}


static void
thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const short* src = _src.ptr<short>();
    short* dst = _dst.ptr<short>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        CV_SUPPRESS_DEPRECATED_START
        switch( type )
        {
        case THRESH_TRUNC:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
        CV_SUPPRESS_DEPRECATED_END
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_and_si128( v0, maxval8 );
                    v1 = _mm_and_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_andnot_si128( v0, maxval8 );
                    v1 = _mm_andnot_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_min_epi16( v0, thresh8 );
                    v1 = _mm_min_epi16( v1, thresh8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
                vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif

            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;
    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
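
// Note: thresh_16s follows the same pattern as thresh_8u (a vectorized body
// plus a scalar tail per row), but 16-bit SIMD compares are natively signed
// (_mm_cmpgt_epi16 / vcgtq_s16), so no sign-bias trick is needed, and a
// 65536-entry lookup table would not pay off for this depth.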


static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                    vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
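
// The TOZERO variants above and below rely on SIMD compare results being
// all-ones/all-zero masks: dst = v & mask(v > T) keeps v where the predicate
// holds and yields 0 elsewhere. For floating-point data the bitwise AND is
// applied to the raw IEEE-754 pattern, and an all-zero pattern is exactly 0.0.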

static void
thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const double* src = _src.ptr<double>();
    double* dst = _dst.ptr<double>();
    size_t src_step = _src.step / sizeof(src[0]);
    size_t dst_step = _dst.step / sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if (_src.isContinuous() && _dst.isContinuous())
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    switch (type)
    {
    case THRESH_BINARY:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_cmpgt_pd( v0, thresh2 );
                    v1 = _mm_cmpgt_pd( v1, thresh2 );
                    v2 = _mm_cmpgt_pd( v2, thresh2 );
                    v3 = _mm_cmpgt_pd( v3, thresh2 );
                    v0 = _mm_and_pd( v0, maxval2 );
                    v1 = _mm_and_pd( v1, maxval2 );
                    v2 = _mm_and_pd( v2, maxval2 );
                    v3 = _mm_and_pd( v3, maxval2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);
            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval);
                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval);
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif

            for (; j < roi.width; j++)
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;

#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_cmple_pd( v0, thresh2 );
                    v1 = _mm_cmple_pd( v1, thresh2 );
                    v2 = _mm_cmple_pd( v2, thresh2 );
                    v3 = _mm_cmple_pd( v3, thresh2 );
                    v0 = _mm_and_pd( v0, maxval2 );
                    v1 = _mm_and_pd( v1, maxval2 );
                    v2 = _mm_and_pd( v2, maxval2 );
                    v3 = _mm_and_pd( v3, maxval2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);
            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval);
                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval);
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;

#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_min_pd( v0, thresh2 );
                    v1 = _mm_min_pd( v1, thresh2 );
                    v2 = _mm_min_pd( v2, thresh2 );
                    v3 = _mm_min_pd( v3, thresh2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh);
                float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh);
                vst1q_f64(dst + j, v_dst0);
                vst1q_f64(dst + j + 2, v_dst1);
            }
#endif
            for (; j < roi.width; j++)
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;

#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2));
                    v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2));
                    v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2));
                    v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2));
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh),
                                              vreinterpretq_u64_f64(v_src0));
                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh),
                                              vreinterpretq_u64_f64(v_src1));
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
            {
                double v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;

#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2));
                    v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2));
                    v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2));
                    v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2));
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh),
                                              vreinterpretq_u64_f64(v_src0));
                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh),
                                              vreinterpretq_u64_f64(v_src1));
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
            {
                double v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;
    default:
        return CV_Error(CV_StsBadArg, "");
    }
}

#ifdef HAVE_IPP
static bool ipp_getThreshVal_Otsu_8u( const unsigned char* _src, int step, Size size, unsigned char &thresh)
{
#if IPP_VERSION_X100 >= 810 && !HAVE_ICV
    int ippStatus = -1;
    IppiSize srcSize = { size.width, size.height };
    CV_SUPPRESS_DEPRECATED_START
    ippStatus = ippiComputeThreshold_Otsu_8u_C1R(_src, step, srcSize, &thresh);
    CV_SUPPRESS_DEPRECATED_END

    if(ippStatus >= 0)
        return true;
#else
    CV_UNUSED(_src); CV_UNUSED(step); CV_UNUSED(size); CV_UNUSED(thresh);
#endif
    return false;
}
#endif

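/* Otsu's method: pick the threshold t that maximizes the between-class
   variance of the histogram split at t,

     sigma_b^2(t) = q1(t) * q2(t) * (mu1(t) - mu2(t))^2,

   where q1, q2 are the probabilities and mu1, mu2 the means of the two
   classes. The loop below maintains q1 and mu1 incrementally per bin:
     q1' = q1 + p_i,   mu1' = (q1*mu1 + i*p_i) / q1'. */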
static double
getThreshVal_Otsu_8u( const Mat& _src )
{
    Size size = _src.size();
    int step = (int) _src.step;
    if( _src.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
        step = size.width;
    }

#ifdef HAVE_IPP
    unsigned char thresh;
    CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !HAVE_ICV, ipp_getThreshVal_Otsu_8u(_src.ptr(), step, size, thresh), thresh);
#endif

    const int N = 256;
    int i, j, h[N] = {0};
    for( i = 0; i < size.height; i++ )
    {
        const uchar* src = _src.ptr() + step*i;
        j = 0;
        #if CV_ENABLE_UNROLLED
        for( ; j <= size.width - 4; j += 4 )
        {
            int v0 = src[j], v1 = src[j+1];
            h[v0]++; h[v1]++;
            v0 = src[j+2]; v1 = src[j+3];
            h[v0]++; h[v1]++;
        }
        #endif
        for( ; j < size.width; j++ )
            h[src[j]]++;
    }

    double mu = 0, scale = 1./(size.width*size.height);
    for( i = 0; i < N; i++ )
        mu += i*(double)h[i];

    mu *= scale;
    double mu1 = 0, q1 = 0;
    double max_sigma = 0, max_val = 0;

    for( i = 0; i < N; i++ )
    {
        double p_i, q2, mu2, sigma;

        p_i = h[i]*scale;
        mu1 *= q1;
        q1 += p_i;
        q2 = 1. - q1;

        if( std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON )
            continue;

        mu1 = (mu1 + i*p_i)/q1;
        mu2 = (mu - q1*mu1)/q2;
        sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2);
        if( sigma > max_sigma )
        {
            max_sigma = sigma;
            max_val = i;
        }
    }

    return max_val;
}

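/* Triangle method (commonly attributed to Zack et al.): draw a line from the
   histogram peak (max_ind, max) to the far end of the populated range; the
   threshold is the bin whose histogram point lies farthest from that line.
   Only the argmax matters, so the distance is evaluated up to constant
   offset and scale as a*i + b*h[i], with a = max and b = left_bound - max_ind.
   The histogram is mirrored first whenever the longer tail lies to the right
   of the peak, so the scan always runs over the tail side. */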
static double
getThreshVal_Triangle_8u( const Mat& _src )
{
    Size size = _src.size();
    int step = (int) _src.step;
    if( _src.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
        step = size.width;
    }

    const int N = 256;
    int i, j, h[N] = {0};
    for( i = 0; i < size.height; i++ )
    {
        const uchar* src = _src.ptr() + step*i;
        j = 0;
        #if CV_ENABLE_UNROLLED
        for( ; j <= size.width - 4; j += 4 )
        {
            int v0 = src[j], v1 = src[j+1];
            h[v0]++; h[v1]++;
            v0 = src[j+2]; v1 = src[j+3];
            h[v0]++; h[v1]++;
        }
        #endif
        for( ; j < size.width; j++ )
            h[src[j]]++;
    }

    int left_bound = 0, right_bound = 0, max_ind = 0, max = 0;
    int temp;
    bool isflipped = false;

    for( i = 0; i < N; i++ )
    {
        if( h[i] > 0 )
        {
            left_bound = i;
            break;
        }
    }
    if( left_bound > 0 )
        left_bound--;

    for( i = N-1; i > 0; i-- )
    {
        if( h[i] > 0 )
        {
            right_bound = i;
            break;
        }
    }
    if( right_bound < N-1 )
        right_bound++;

    for( i = 0; i < N; i++ )
    {
        if( h[i] > max)
        {
            max = h[i];
            max_ind = i;
        }
    }

    if( max_ind-left_bound < right_bound-max_ind)
    {
        isflipped = true;
        i = 0, j = N-1;
        while( i < j )
        {
            temp = h[i]; h[i] = h[j]; h[j] = temp;
            i++; j--;
        }
        left_bound = N-1-right_bound;
        max_ind = N-1-max_ind;
    }

    double thresh = left_bound;
    double a, b, dist = 0, tempdist;

    /*
     * We do not need to compute the precise distance here. The distance is
     * only maximized, so constant factors can be omitted; this speeds up the
     * computation a bit.
     */
    a = max; b = left_bound-max_ind;
    for( i = left_bound+1; i <= max_ind; i++ )
    {
        tempdist = a*i + b*h[i];
        if( tempdist > dist)
        {
            dist = tempdist;
            thresh = i;
        }
    }
    thresh--;

    if( isflipped )
        thresh = N-1-thresh;

    return thresh;
}

class ThresholdRunner : public ParallelLoopBody
{
public:
    ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType)
    {
        src = _src;
        dst = _dst;

        thresh = _thresh;
        maxval = _maxval;
        thresholdType = _thresholdType;
    }

    void operator () ( const Range& range ) const
    {
        int row0 = range.start;
        int row1 = range.end;

        Mat srcStripe = src.rowRange(row0, row1);
        Mat dstStripe = dst.rowRange(row0, row1);

        if (srcStripe.depth() == CV_8U)
        {
            thresh_8u( srcStripe, dstStripe, (uchar)thresh, (uchar)maxval, thresholdType );
        }
        else if( srcStripe.depth() == CV_16S )
        {
            thresh_16s( srcStripe, dstStripe, (short)thresh, (short)maxval, thresholdType );
        }
        else if( srcStripe.depth() == CV_32F )
        {
            thresh_32f( srcStripe, dstStripe, (float)thresh, (float)maxval, thresholdType );
        }
        else if( srcStripe.depth() == CV_64F )
        {
            thresh_64f(srcStripe, dstStripe, thresh, maxval, thresholdType);
        }
    }

private:
    Mat src;
    Mat dst;

    double thresh;
    double maxval;
    int thresholdType;
};
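
// ThresholdRunner adapts the depth-specific workers to parallel_for_, which
// splits the image into independent horizontal stripes, e.g. (illustrative):
//   parallel_for_(Range(0, dst.rows),
//                 ThresholdRunner(src, dst, thresh, maxval, type));
// cv::threshold below also passes an nstripes hint derived from dst.total().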

#ifdef HAVE_OPENCL

static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
        kercn = ocl::predictOptimalVectorWidth(_src, _dst), ktype = CV_MAKE_TYPE(depth, kercn);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if ( !(thresh_type == THRESH_BINARY || thresh_type == THRESH_BINARY_INV || thresh_type == THRESH_TRUNC ||
           thresh_type == THRESH_TOZERO || thresh_type == THRESH_TOZERO_INV) ||
         (!doubleSupport && depth == CV_64F))
        return false;

    const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
                                          "THRESH_TOZERO", "THRESH_TOZERO_INV" };
    ocl::Device dev = ocl::Device::getDefault();
    int stride_size = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;

    ocl::Kernel k("threshold", ocl::imgproc::threshold_oclsrc,
                  format("-D %s -D T=%s -D T1=%s -D STRIDE_SIZE=%d%s", thresholdMap[thresh_type],
                         ocl::typeToStr(ktype), ocl::typeToStr(depth), stride_size,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    _dst.create(src.size(), type);
    UMat dst = _dst.getUMat();

    if (depth <= CV_32S)
        thresh = cvFloor(thresh);

    const double min_vals[] = { 0, CHAR_MIN, 0, SHRT_MIN, INT_MIN, -FLT_MAX, -DBL_MAX, 0 };
    double min_val = min_vals[depth];

    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn),
           ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(thresh))),
           ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(maxval))),
           ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(min_val))));

    size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, (size_t)dst.rows };
    globalsize[1] = (globalsize[1] + stride_size - 1) / stride_size;
    return k.run(2, globalsize, NULL, false);
}

#endif
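
// The OpenCL path specializes the kernel at build time: the -D defines pick
// the threshold operation, pixel type and stride, so the device code needs no
// per-pixel switch. For THRESH_BINARY on CV_8UC1 the build options would look
// something like "-D THRESH_BINARY -D T=uchar -D T1=uchar -D STRIDE_SIZE=1"
// (the exact vector type T depends on the predicted optimal vector width).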

}

double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
{
    CV_OCL_RUN_(_src.dims() <= 2 && _dst.isUMat(),
                ocl_threshold(_src, _dst, thresh, maxval, type), thresh)

    Mat src = _src.getMat();
    int automatic_thresh = (type & ~CV_THRESH_MASK);
    type &= THRESH_MASK;

    CV_Assert( automatic_thresh != (CV_THRESH_OTSU | CV_THRESH_TRIANGLE) );
    if( automatic_thresh == CV_THRESH_OTSU )
    {
        CV_Assert( src.type() == CV_8UC1 );
        thresh = getThreshVal_Otsu_8u( src );
    }
    else if( automatic_thresh == CV_THRESH_TRIANGLE )
    {
        CV_Assert( src.type() == CV_8UC1 );
        thresh = getThreshVal_Triangle_8u( src );
    }

    _dst.create( src.size(), src.type() );
    Mat dst = _dst.getMat();

    if( src.depth() == CV_8U )
    {
        int ithresh = cvFloor(thresh);
        thresh = ithresh;
        int imaxval = cvRound(maxval);
        if( type == THRESH_TRUNC )
            imaxval = ithresh;
        imaxval = saturate_cast<uchar>(imaxval);

        if( ithresh < 0 || ithresh >= 255 )
        {
            if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
                ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < 0) ||
                (type == THRESH_TOZERO && ithresh >= 255) )
            {
                int v = type == THRESH_BINARY ? (ithresh >= 255 ? 0 : imaxval) :
                        type == THRESH_BINARY_INV ? (ithresh >= 255 ? imaxval : 0) :
                        /*type == THRESH_TRUNC ? imaxval :*/ 0;
                dst.setTo(v);
            }
            else
                src.copyTo(dst);
            return thresh;
        }
        thresh = ithresh;
        maxval = imaxval;
    }
    else if( src.depth() == CV_16S )
    {
        int ithresh = cvFloor(thresh);
        thresh = ithresh;
        int imaxval = cvRound(maxval);
        if( type == THRESH_TRUNC )
            imaxval = ithresh;
        imaxval = saturate_cast<short>(imaxval);

        if( ithresh < SHRT_MIN || ithresh >= SHRT_MAX )
        {
            if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
               ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < SHRT_MIN) ||
               (type == THRESH_TOZERO && ithresh >= SHRT_MAX) )
            {
                int v = type == THRESH_BINARY ? (ithresh >= SHRT_MAX ? 0 : imaxval) :
                type == THRESH_BINARY_INV ? (ithresh >= SHRT_MAX ? imaxval : 0) :
                /*type == THRESH_TRUNC ? imaxval :*/ 0;
                dst.setTo(v);
            }
            else
                src.copyTo(dst);
            return thresh;
        }
        thresh = ithresh;
        maxval = imaxval;
    }
    else if( src.depth() == CV_32F )
        ;
    else if( src.depth() == CV_64F )
        ;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    parallel_for_(Range(0, dst.rows),
                  ThresholdRunner(src, dst, thresh, maxval, type),
                  dst.total()/(double)(1<<16));
    return thresh;
}
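
// Typical usage (illustrative): with THRESH_OTSU or THRESH_TRIANGLE the
// thresh argument is ignored, the threshold is computed from the 8-bit
// image, and the value actually used is returned:
//   cv::Mat bw;
//   double t = cv::threshold(gray, bw, 0, 255,
//                            cv::THRESH_BINARY | cv::THRESH_OTSU);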


void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
                            int method, int type, int blockSize, double delta )
{
    Mat src = _src.getMat();
    CV_Assert( src.type() == CV_8UC1 );
    CV_Assert( blockSize % 2 == 1 && blockSize > 1 );
    Size size = src.size();

    _dst.create( size, src.type() );
    Mat dst = _dst.getMat();

    if( maxValue < 0 )
    {
        dst = Scalar(0);
        return;
    }

    Mat mean;

    if( src.data != dst.data )
        mean = dst;

    if (method == ADAPTIVE_THRESH_MEAN_C)
        boxFilter( src, mean, src.type(), Size(blockSize, blockSize),
                   Point(-1,-1), true, BORDER_REPLICATE );
    else if (method == ADAPTIVE_THRESH_GAUSSIAN_C)
    {
        Mat srcfloat,meanfloat;
        src.convertTo(srcfloat,CV_32F);
        meanfloat=srcfloat;
        GaussianBlur(srcfloat, meanfloat, Size(blockSize, blockSize), 0, 0, BORDER_REPLICATE);
        meanfloat.convertTo(mean, src.type());
    }
    else
        CV_Error( CV_StsBadFlag, "Unknown/unsupported adaptive threshold method" );

    int i, j;
    uchar imaxval = saturate_cast<uchar>(maxValue);
    int idelta = type == THRESH_BINARY ? cvCeil(delta) : cvFloor(delta);
    uchar tab[768];

    if( type == CV_THRESH_BINARY )
        for( i = 0; i < 768; i++ )
            tab[i] = (uchar)(i - 255 > -idelta ? imaxval : 0);
    else if( type == CV_THRESH_BINARY_INV )
        for( i = 0; i < 768; i++ )
            tab[i] = (uchar)(i - 255 <= -idelta ? imaxval : 0);
    else
        CV_Error( CV_StsBadFlag, "Unknown/unsupported threshold type" );

    if( src.isContinuous() && mean.isContinuous() && dst.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
    }

    for( i = 0; i < size.height; i++ )
    {
        const uchar* sdata = src.ptr(i);
        const uchar* mdata = mean.ptr(i);
        uchar* ddata = dst.ptr(i);

        for( j = 0; j < size.width; j++ )
            ddata[j] = tab[sdata[j] - mdata[j] + 255];
    }
}
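
// The 768-entry table folds dst = (src > mean - delta ? maxval : 0) into one
// lookup: for uchar s and m the index s - m + 255 stays within [0, 510], and
//   tab[s - m + 255] != 0  <=>  s - m > -idelta  <=>  s > m - idelta
// (for THRESH_BINARY; the INV variant negates the predicate). Typical usage
// (illustrative):
//   cv::adaptiveThreshold(gray, bw, 255, cv::ADAPTIVE_THRESH_MEAN_C,
//                         cv::THRESH_BINARY, 11, 2);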

CV_IMPL double
cvThreshold( const void* srcarr, void* dstarr, double thresh, double maxval, int type )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;

    CV_Assert( src.size == dst.size && src.channels() == dst.channels() &&
        (src.depth() == dst.depth() || dst.depth() == CV_8U));

    thresh = cv::threshold( src, dst, thresh, maxval, type );
    if( dst0.data != dst.data )
        dst.convertTo( dst0, dst0.depth() );
    return thresh;
}


CV_IMPL void
cvAdaptiveThreshold( const void *srcIm, void *dstIm, double maxValue,
                     int method, int type, int blockSize, double delta )
{
    cv::Mat src = cv::cvarrToMat(srcIm), dst = cv::cvarrToMat(dstIm);
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    cv::adaptiveThreshold( src, dst, maxValue, method, type, blockSize, delta );
}

/* End of file. */