/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
//  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
//
// */

#include "precomp.hpp"
#include "opencl_kernels.hpp"
struct IPPArithmInitializer
{
    IPPArithmInitializer(void)
    {
    }
};

IPPArithmInitializer ippArithmInitializer;
#define FUNCTOR_TEMPLATE(name)          \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
    VOp vop;
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
                        src2 += step2/sizeof(src2[0]),
                        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

        for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
        {
            typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x                );
            typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
            r0 = vop(r0, VLoadStore128<T>::load(src2 + x                ));
            r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
            VLoadStore128<T>::store(dst + x                , r0);
            VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
        }

        for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
        {
            typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
            r = vop(r, VLoadStore64<T>::load(src2 + x));
            VLoadStore64<T>::store(dst + x, r);
        }

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
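/* Illustrative usage sketch (not part of the library): vBinOp pairs a scalar functor (Op)
   with a SIMD functor (VOp) over a 2D buffer described by raw pointers and row steps in
   bytes. Assuming the Op and V functors defined below, a saturating 8-bit add over a small
   4x8 block could be driven directly like this:

       uchar a[4*8], b[4*8], c[4*8];
       // ... fill a and b ...
       vBinOp<uchar, OpAdd<uchar>, VAdd<uchar> >(a, 8, b, 8, c, 8, Size(8, 4));

   Each row is processed in three passes: 32-byte SIMD chunks, 8-byte SIMD chunks, then a
   scalar tail, so arbitrary row widths are handled without padding. */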
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
    Op32 op32;
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
                        src2 += step2/sizeof(src2[0]),
                        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

        if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
        {
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                VLoadStore128Aligned<T>::store(dst + x    , r0);
                VLoadStore128Aligned<T>::store(dst + x + 4, r1);
            }
        }
        else
        {
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
                r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
                VLoadStore128<T>::store(dst + x    , r0);
                VLoadStore128<T>::store(dst + x + 4, r1);
            }
        }

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
    Op64 op64;
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
                        src2 += step2/sizeof(src2[0]),
                        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

        if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
        {
            for( ; x <= sz.width - 4; x += 4 )
            {
                typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                VLoadStore128Aligned<T>::store(dst + x    , r0);
                VLoadStore128Aligned<T>::store(dst + x + 2, r1);
            }
        }

        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template<>                                                                                    \
    struct name<template_arg>{                                                                    \
        typedef register_type reg_type;                                                           \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template<>                                                                       \
    struct name<template_arg>{                                                       \
        typedef register_type reg_type;                                              \
        static reg_type load(const template_arg * p) { return load_body (p); }       \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }       \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type & b) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type &  ) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);

FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );
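/* Hand-expanded sketch of what the uchar line above produces (for reference only, the macro
   is what actually compiles):

       template<> struct VLoadStore128<uchar>
       {
           typedef __m128i reg_type;
           static reg_type load(const uchar* p)    { return _mm_loadu_si128((const __m128i*)p); }
           static void store(uchar* p, reg_type v) { _mm_storeu_si128((__m128i*)p, v); }
       };

   The *_CAST variant exists because the integer intrinsics take __m128i pointers, while the
   float and double intrinsics already take element pointers and need no cast. */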
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar,  return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar,  return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short,  return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int,    return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float,  return _mm_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar,  return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar,  return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short,  return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int,    return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float,  return _mm_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar,  return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
        __m128i m = _mm_cmpgt_epi8(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin, short,  return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int,
        __m128i m = _mm_cmpgt_epi32(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, float,  return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar,  return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax, short,  return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int,
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, float,  return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
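/* Note on the schar/int closures above (SSE2 has no _mm_min_epi8 / _mm_min_epi32): with
   m = (a > b) ? all-ones : 0, the blend  a ^ ((a ^ b) & m)  selects b where a > b and a
   elsewhere, i.e. min(a, b); VMax uses the mirrored comparison. A one-lane scalar model:

       int simulated_min(int a, int b)
       {
           int m = -(int)(a > b);        // 0 or 0xffffffff
           return a ^ ((a ^ b) & m);     // b if a > b, else a
       }

   The ushort case uses saturating arithmetic instead: a - sat(a - b) == min(a, b) and
   sat(a - b) + b == max(a, b). */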
static int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
        __m128i d = _mm_subs_epi8(a, b);
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_subs_epi8(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
        __m128i M = _mm_max_epi16(a, b);
        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
        return _mm_and_ps(_mm_sub_ps(a, b), *(const __m128*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a, b), *(const __m128d*)v64f_absmask);
    );
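/* Note on the VAbsDiff closures (a summary, no extra functionality): for unsigned types
   |a - b| == sat(a - b) + sat(b - a), since one of the saturating differences is always
   zero; for signed 8/32-bit data the difference is conditionally negated through the
   comparison mask m, using (d ^ m) - m == -d when m is all ones; for float/double the
   IEEE-754 sign bit is simply cleared with v32f_absmask / v64f_absmask. One-lane model
   of the unsigned case:

       unsigned char absdiff_u8(unsigned char a, unsigned char b)
       {
           unsigned char d1 = (unsigned char)(a > b ? a - b : 0);   // sat(a - b)
           unsigned char d2 = (unsigned char)(b > a ? b - a : 0);   // sat(b - a)
           return (unsigned char)(d1 + d2);
       }
*/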
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));

#if CV_SSE2
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }

template<typename T> struct OpAbsDiff
{
    T operator()(T a, T b) const { return (T)std::abs(a - b); }
};

template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }

template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }

template<typename T, typename WT=T> struct OpAbsDiffS
{
    T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
};

template<typename T> struct OpAnd
{
    T operator()( T a, T b ) const { return a & b; }
};

template<typename T> struct OpOr
{
    T operator()( T a, T b ) const { return a | b; }
};

template<typename T> struct OpXor
{
    T operator()( T a, T b ) const { return a ^ b; }
};

template<typename T> struct OpNot
{
    T operator()( T a, T ) const { return ~a; }
};
static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
    if( sz.height == 1 )
        step1 = step2 = step = sz.width*elemSize;
}
static void add8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}
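/* All of the type-specific kernels below follow this add8u pattern: IF_IPP() expands to the
   first expression when OpenCV is built with Intel IPP (ARITHM_USE_IPP) and to the portable
   vBinOp/vBinOp32/vBinOp64 call otherwise. The kernels operate on raw row-major buffers with
   byte steps; a hypothetical direct call (normally they are reached only through the
   BinaryFunc tables further down) would look like:

       uchar a[2*16], b[2*16], sum[2*16];
       // ... fill a and b ...
       add8u(a, 16, b, 16, sum, 16, Size(16, 2), 0);
*/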
static void add8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void add16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void add32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void add64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
}
static void sub8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void sub8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void sub16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
           (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void sub32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz),
           (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void sub64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
}
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
static void max8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    uchar* s1 = (uchar*)src1;
    uchar* s2 = (uchar*)src2;
    uchar* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMaxEvery_8u(s1, s2, d, sz.width);
        s1 += step1;
        s2 += step2;
        d  += step;
    }
#else
    vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
#endif

//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void max8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    ushort* s1 = (ushort*)src1;
    ushort* s2 = (ushort*)src2;
    ushort* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMaxEvery_16u(s1, s2, d, sz.width);
        s1 = (ushort*)((uchar*)s1 + step1);
        s2 = (ushort*)((uchar*)s2 + step2);
        d  = (ushort*)((uchar*)d + step);
    }
#else
    vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
#endif

//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void max16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    float* s1 = (float*)src1;
    float* s2 = (float*)src2;
    float* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMaxEvery_32f(s1, s2, d, sz.width);
        s1 = (float*)((uchar*)s1 + step1);
        s2 = (float*)((uchar*)s2 + step2);
        d  = (float*)((uchar*)d + step);
    }
#else
    vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
#endif
//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz)));
}

static void max64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
}
static void min8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    uchar* s1 = (uchar*)src1;
    uchar* s2 = (uchar*)src2;
    uchar* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMinEvery_8u(s1, s2, d, sz.width);
        s1 += step1;
        s2 += step2;
        d  += step;
    }
#else
    vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
#endif

//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void min8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    ushort* s1 = (ushort*)src1;
    ushort* s2 = (ushort*)src2;
    ushort* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMinEvery_16u(s1, s2, d, sz.width);
        s1 = (ushort*)((uchar*)s1 + step1);
        s2 = (ushort*)((uchar*)s2 + step2);
        d  = (ushort*)((uchar*)d + step);
    }
#else
    vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
#endif

//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void min16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    float* s1 = (float*)src1;
    float* s2 = (float*)src2;
    float* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    for(int i = 0; i < sz.height; i++)
    {
        ippsMinEvery_32f(s1, s2, d, sz.width);
        s1 = (float*)((uchar*)s1 + step1);
        s2 = (float*)((uchar*)s2 + step2);
        d  = (float*)((uchar*)d + step);
    }
#else
    vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
#endif
//  IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
//         ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
//         (vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz)));
}

static void min64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
}
static void absdiff8u( const uchar* src1, size_t step1,
                       const uchar* src2, size_t step2,
                       uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void absdiff8s( const schar* src1, size_t step1,
                       const schar* src2, size_t step2,
                       schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff16u( const ushort* src1, size_t step1,
                        const ushort* src2, size_t step2,
                        ushort* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void absdiff16s( const short* src1, size_t step1,
                        const short* src2, size_t step2,
                        short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff32s( const int* src1, size_t step1,
                        const int* src2, size_t step2,
                        int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff32f( const float* src1, size_t step1,
                        const float* src2, size_t step2,
                        float* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void absdiff64f( const double* src1, size_t step1,
                        const double* src2, size_t step2,
                        double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
}
static void and8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void or8u( const uchar* src1, size_t step1,
                  const uchar* src2, size_t step2,
                  uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void xor8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
           ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
           (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}

static void not8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void *)src2;
           ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, (IppiSize&)sz),
           (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
}
/****************************************************************************************\
*                                   logical operations                                  *
\****************************************************************************************/

void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
{
    int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
    size_t esz = CV_ELEM_SIZE(buftype);
    getConvertFunc(sc.depth(), buftype)(sc.data, 0, 0, 0, scbuf, 0, Size(std::min(cn, scn), 1), 0);
    // unroll the scalar
    if( scn < cn )
    {
        CV_Assert( scn == 1 );
        size_t esz1 = CV_ELEM_SIZE1(buftype);
        for( size_t i = esz1; i < esz; i++ )
            scbuf[i] = scbuf[i - esz1];
    }

    for( size_t i = esz; i < blocksize*esz; i++ )
        scbuf[i] = scbuf[i - esz];
}
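/* Worked example (illustrative): for buftype == CV_8UC3 and a 1-element scalar Mat holding
   300.7, cn == 3 and esz == 3. The convert function writes one saturated value (255) into
   scbuf[0], the first loop replicates it into the remaining channels scbuf[1] and scbuf[2],
   and the second loop repeats that 3-byte pattern until blocksize pixels are filled, so the
   buffer can be fed to the same row kernels that handle the array-op-array case. */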
enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
       OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
       OCL_OP_RDIV_SCALE=15 };

static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
    "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                          InputArray _mask, bool bitwise, int oclop, bool haveScalar )
{
    bool haveMask = !_mask.empty();
    int srctype = _src1.type();
    int srcdepth = CV_MAT_DEPTH(srctype);
    int cn = CV_MAT_CN(srctype);

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
        (!doubleSupport && srcdepth == CV_64F && !bitwise))
        return false;

    char opts[1024];
    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn;

    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
            haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
            kercn);

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2;
    UMat dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
                                       ocl::KernelArg::WriteOnly(dst, cn, kercn);
    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);

    if( haveScalar )
    {
        size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
        double buf[4] = {0,0,0,0};

        if( oclop != OCL_OP_NOT )
        {
            Mat src2sc = _src2.getMat();
            convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
        }

        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        if( !haveMask )
            k.args(src1arg, dstarg, scalararg);
        else
            k.args(src1arg, maskarg, dstarg, scalararg);
    }
    else
    {
        src2 = _src2.getUMat();
        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);

        if( !haveMask )
            k.args(src1arg, src2arg, dstarg);
        else
            k.args(src1arg, src2arg, maskarg, dstarg);
    }

    size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
    return k.run(2, globalsize, 0, false);
}
static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, const BinaryFunc* tab,
                       bool bitwise, int oclop )
{
    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
    int dims1 = psrc1->dims(), dims2 = psrc2->dims();
    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
            dims1 <= 2 && dims2 <= 2;
    bool haveMask = !_mask.empty(), haveScalar = false;

    if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
    {
        _dst.create(sz1, type1);
        CV_OCL_RUN(use_opencl,
                   ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))

        BinaryFunc func = tab[depth1];
        if( bitwise )
        {
            func = *tab;
            cn = (int)CV_ELEM_SIZE(type1);
        }

        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst);
        size_t len = sz.width*(size_t)cn;
        if( len == (size_t)(int)len )
        {
            sz.width = (int)len;
            func(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, 0);
            return;
        }
    }

    if( oclop == OCL_OP_NOT )
        haveScalar = true;
    else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
        !psrc1->sameSize(*psrc2) || type1 != type2 )
    {
        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
            swap(psrc1, psrc2);
            swap(type1, type2);
            swap(depth1, depth2);
            swap(cn, cn2);
            swap(sz1, sz2);
        }
        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The operation is neither 'array op array' (where arrays have the same size and type), "
                      "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
    }
    else
    {
        CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
    }

    size_t esz = CV_ELEM_SIZE(type1);
    size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    BinaryFunc copymask = 0;
    bool reallocate = false;

    if( haveMask )
    {
        int mtype = _mask.type();
        CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
        copymask = getCopyMaskFunc(esz);
        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
    }

    AutoBuffer<uchar> _buf;
    uchar *scbuf = 0, *maskbuf = 0;

    _dst.createSameSize(*psrc1, type1);
    // if this is mask operation and dst has been reallocated,
    // we have to clear the destination
    if( haveMask && reallocate )
        _dst.setTo(0.);

    CV_OCL_RUN(use_opencl,
               ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))

    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
    Mat dst = _dst.getMat(), mask = _mask.getMat();

    BinaryFunc func = tab[depth1];
    if( bitwise )
    {
        func = *tab;
        cn = (int)esz;
    }

    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
        uchar* ptrs[4];
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

        if( blocksize*cn > INT_MAX )
            blocksize = INT_MAX/cn;

        if( haveMask )
        {
            blocksize = std::min(blocksize, blocksize0);
            _buf.allocate(blocksize*esz);
            maskbuf = _buf;
        }

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
                    ptrs[3] += bsz;
                }
                bsz *= (int)esz;
                ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
            }
        }
    }
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
        uchar* ptrs[3];
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);

        _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
        scbuf = _buf;
        maskbuf = alignPtr(scbuf + blocksize*esz, 16);

        convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
                    ptrs[2] += bsz;
                }
                bsz *= (int)esz;
                ptrs[0] += bsz; ptrs[1] += bsz;
            }
        }
    }
}
static BinaryFunc* getMaxTab()
{
    static BinaryFunc maxTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s),
        (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s),
        (BinaryFunc)GET_OPTIMIZED(max32s),
        (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, 0
    };
    return maxTab;
}

static BinaryFunc* getMinTab()
{
    static BinaryFunc minTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s),
        (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s),
        (BinaryFunc)GET_OPTIMIZED(min32s),
        (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, 0
    };
    return minTab;
}
void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
}

void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
}

void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
}

void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
    binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
}

void cv::max( InputArray src1, InputArray src2, OutputArray dst )
{
    binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min( InputArray src1, InputArray src2, OutputArray dst )
{
    binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}

void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}

void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}
/****************************************************************************************\
*                                      add/subtract                                     *
\****************************************************************************************/

static int actualScalarDepth(const double* data, int len)
{
    int i = 0, minval = INT_MAX, maxval = INT_MIN;
    for(; i < len; ++i)
    {
        int ival = cvRound(data[i]);
        if( ival != data[i] )
            break;
        minval = MIN(minval, ival);
        maxval = MAX(maxval, ival);
    }
    return i < len ? CV_64F :
           minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
           minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
           minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
           minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
           CV_32S;
}
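/* Behaviour sketch (follows from the code above): a scalar such as (1, 2, 3, 0) consists of
   integers within [0, 255], so actualScalarDepth() reports CV_8U and the scalar can be packed
   losslessly next to 8-bit image data; (-1, 0, 0, 0) gives CV_8S, (70000, 0, 0, 0) gives
   CV_32S, and any fractional value such as 0.5 falls back to CV_64F. This is what keeps e.g.
   add(img8u, Scalar(1)) on the narrow fixed-point path. */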
static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                          InputArray _mask, int wtype,
                          void* usrdata, int oclop,
                          bool haveScalar )
{
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    bool haveMask = !_mask.empty();

    if( (haveMask || haveScalar) && cn > 4 )
        return false;

    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
    if (!doubleSupport)
        wdepth = std::min(wdepth, CV_32F);

    wtype = CV_MAKETYPE(wdepth, cn);
    int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
    if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
        return false;

    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn;

    char cvtstr[4][32], opts[1024];
    sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
            "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
            (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
            oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
            ocl::typeToStr(CV_MAKETYPE(depth1, 1)),
            ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
            ocl::typeToStr(CV_MAKETYPE(depth2, 1)),
            ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
            ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
            ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
            ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), wdepth,
            ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
            ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
            ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn);

    size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
    const uchar* usrdata_p = (const uchar*)usrdata;
    const double* usrdata_d = (const double*)usrdata;
    float usrdata_f[3];
    int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
        oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
    if( n > 0 && wdepth == CV_32F )
    {
        for( i = 0; i < n; i++ )
            usrdata_f[i] = (float)usrdata_d[i];
        usrdata_p = (const uchar*)usrdata_f;
    }

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2;
    UMat dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
                                       ocl::KernelArg::WriteOnly(dst, cn, kercn);
    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);

    if( haveScalar )
    {
        size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
        double buf[4]={0,0,0,0};
        Mat src2sc = _src2.getMat();

        if( !src2sc.empty() )
            convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        if( !haveMask )
        {
            if(n == 0)
                k.args(src1arg, dstarg, scalararg);
            else if(n == 1)
                k.args(src1arg, dstarg, scalararg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
            else
                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
        }
        else
            k.args(src1arg, maskarg, dstarg, scalararg);
    }
    else
    {
        src2 = _src2.getUMat();
        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);

        if( !haveMask )
        {
            if (n == 0)
                k.args(src1arg, src2arg, dstarg);
            else if (n == 1)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
            else if (n == 3)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
            else
                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
        }
        else
            k.args(src1arg, src2arg, maskarg, dstarg);
    }

    size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
    return k.run(2, globalsize, NULL, false);
}
static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                      InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
                      void* usrdata=0, int oclop=-1 )
{
    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    bool haveMask = !_mask.empty();
    bool reallocate = false;
    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
    int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
    bool use_opencl = _dst.isUMat() && dims1 <= 2 && dims2 <= 2;

    bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
    bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);

    if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
                       (_dst.fixedType() && _dst.type() == type1)) &&
        ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
    {
        _dst.createSameSize(*psrc1, type1);
        CV_OCL_RUN(use_opencl,
                   ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
                                 (!usrdata ? type1 : std::max(depth1, CV_32F)),
                                 usrdata, oclop, false))

        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst, src1.channels());
        tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
        return;
    }

    bool haveScalar = false, swapped12 = false;

    if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
        (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
        (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
    {
        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
            swap(psrc1, psrc2);
            swap(sz1, sz2);
            swap(type1, type2);
            swap(depth1, depth2);
            swap(cn, cn2);
            swapped12 = true;
            if( oclop == OCL_OP_SUB )
                oclop = OCL_OP_RSUB;
            if ( oclop == OCL_OP_DIV_SCALE )
                oclop = OCL_OP_RDIV_SCALE;
        }
        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The operation is neither 'array op array' "
                      "(where arrays have the same size and the same number of channels), "
                      "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
        CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));

        if (!muldiv)
        {
            Mat sc = psrc2->getMat();
            depth2 = actualScalarDepth(sc.ptr<double>(), cn);
            if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
                depth2 = CV_32F;
        }
        else
            depth2 = CV_64F;
    }

    if( dtype < 0 )
    {
        if( _dst.fixedType() )
            dtype = _dst.type();
        else
        {
            if( !haveScalar && type1 != type2 )
                CV_Error(CV_StsBadArg,
                         "When the input arrays in add/subtract/multiply/divide functions have different types, "
                         "the output array type must be explicitly specified");
            dtype = type1;
        }
    }
    dtype = CV_MAT_DEPTH(dtype);

    if( depth1 == depth2 && dtype == depth1 )
        wtype = dtype;
    else if( !muldiv )
    {
        wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
                depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
        wtype = std::max(wtype, dtype);

        // when the result of addition should be converted to an integer type,
        // and just one of the input arrays is floating-point, it makes sense to convert that input
        // to an integer type before the operation, instead of converting the other input to a
        // floating-point type and then converting the operation result back to integers.
        if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
            wtype = CV_32S;
    }
    else
    {
        wtype = std::max(depth1, std::max(depth2, CV_32F));
        wtype = std::max(wtype, dtype);
    }

    dtype = CV_MAKETYPE(dtype, cn);
    wtype = CV_MAKETYPE(wtype, cn);
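    /* Working-type summary (restating the rules above): adding two CV_8U arrays into a CV_8U
       destination keeps wtype == CV_8U and runs the saturating add8u kernel directly; the same
       inputs with dtype == CV_16S go through an integer working type wide enough not to wrap
       (CV_32S here, per the special case above) before the final conversion to the destination
       depth; for multiply/divide (muldiv) the working depth is always at least CV_32F. */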
    if( haveMask )
    {
        int mtype = _mask.type();
        CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
    }

    _dst.createSameSize(*psrc1, dtype);
    if( reallocate )
        _dst.setTo(0.);

    CV_OCL_RUN(use_opencl,
               ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
                             usrdata, oclop, haveScalar))

    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);

    size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
    BinaryFunc copymask = getCopyMaskFunc(dsz);
    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    AutoBuffer<uchar> _buf;
    uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
    size_t bufesz = (cvtsrc1 ? wsz : 0) +
                    (cvtsrc2 || haveScalar ? wsz : 0) +
                    (cvtdst ? wsz : 0) +
                    (haveMask ? dsz : 0);
    BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];

    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
        uchar* ptrs[4];
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

        if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
            blocksize = std::min(blocksize, blocksize0);

        _buf.allocate(bufesz*blocksize + 64);
        buf = _buf;
        if( cvtsrc1 ) buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        if( cvtsrc2 ) buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        wbuf = maskbuf = buf;
        if( cvtdst ) buf = alignPtr(buf + blocksize*wsz, 16);
        if( haveMask ) maskbuf = buf;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                Size bszn(bsz*cn, 1);
                const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                uchar* dptr = ptrs[2];
                if( cvtsrc1 )
                {
                    cvtsrc1( sptr1, 0, 0, 0, buf1, 0, bszn, 0 );
                    sptr1 = buf1;
                }
                if( ptrs[0] == ptrs[1] )
                    sptr2 = sptr1;
                else if( cvtsrc2 )
                {
                    cvtsrc2( sptr2, 0, 0, 0, buf2, 0, bszn, 0 );
                    sptr2 = buf2;
                }

                if( !haveMask && !cvtdst )
                    func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
                else
                {
                    func( sptr1, 0, sptr2, 0, wbuf, 0, bszn, usrdata );
                    if( !haveMask )
                        cvtdst( wbuf, 0, 0, 0, dptr, 0, bszn, 0 );
                    else if( !cvtdst )
                    {
                        copymask( wbuf, 0, ptrs[3], 0, dptr, 0, Size(bsz, 1), &dsz );
                        ptrs[3] += bsz;
                    }
                    else
                    {
                        cvtdst( wbuf, 0, 0, 0, maskbuf, 0, bszn, 0 );
                        copymask( maskbuf, 0, ptrs[3], 0, dptr, 0, Size(bsz, 1), &dsz );
                        ptrs[3] += bsz;
                    }
                }
                ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
            }
        }
    }
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
        uchar* ptrs[3];
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);

        _buf.allocate(bufesz*blocksize + 64);
        buf = _buf;
        if( cvtsrc1 ) buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
        wbuf = maskbuf = buf;
        if( cvtdst ) buf = alignPtr(buf + blocksize*wsz, 16);
        if( haveMask ) maskbuf = buf;

        convertAndUnrollScalar( src2, wtype, buf2, blocksize);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                Size bszn(bsz*cn, 1);
                const uchar *sptr1 = ptrs[0];
                const uchar* sptr2 = buf2;
                uchar* dptr = ptrs[1];
                if( cvtsrc1 )
                {
                    cvtsrc1( sptr1, 0, 0, 0, buf1, 0, bszn, 0 );
                    sptr1 = buf1;
                }
                if( swapped12 )
                    std::swap(sptr1, sptr2);

                if( !haveMask && !cvtdst )
                    func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
                else
                {
                    func( sptr1, 0, sptr2, 0, wbuf, 0, bszn, usrdata );
                    if( !haveMask )
                        cvtdst( wbuf, 0, 0, 0, dptr, 0, bszn, 0 );
                    else if( !cvtdst )
                    {
                        copymask( wbuf, 0, ptrs[2], 0, dptr, 0, Size(bsz, 1), &dsz );
                        ptrs[2] += bsz;
                    }
                    else
                    {
                        cvtdst( wbuf, 0, 0, 0, maskbuf, 0, bszn, 0 );
                        copymask( maskbuf, 0, ptrs[2], 0, dptr, 0, Size(bsz, 1), &dsz );
                        ptrs[2] += bsz;
                    }
                }
                ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
            }
        }
    }
}
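/* Caller-level sketch of how the dispatcher above is reached (for context only):

       cv::Mat a(480, 640, CV_8UC1), b(480, 640, CV_8UC1), sum16s, diff8u;
       cv::add(a, b, sum16s, cv::noArray(), CV_16S);   // widened destination, no clipping
       cv::subtract(a, cv::Scalar(10), diff8u);        // array op scalar, saturating CV_8U

   Both calls funnel into arithm_op(), which picks the table entry for the working depth and
   wraps it with the scalar unrolling, mask copying and type conversions seen above. */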
static BinaryFunc* getAddTab()
{
    static BinaryFunc addTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s),
        (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s),
        (BinaryFunc)GET_OPTIMIZED(add32s),
        (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, 0
    };
    return addTab;
}

static BinaryFunc* getSubTab()
{
    static BinaryFunc subTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s),
        (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s),
        (BinaryFunc)GET_OPTIMIZED(sub32s),
        (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f, 0
    };
    return subTab;
}

static BinaryFunc* getAbsDiffTab()
{
    static BinaryFunc absDiffTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
        (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f, 0
    };
    return absDiffTab;
}
void cv::add( InputArray src1, InputArray src2, OutputArray dst,
              InputArray mask, int dtype )
{
    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
}

void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
                   InputArray mask, int dtype )
{
#ifdef HAVE_TEGRA_OPTIMIZATION
    if (mask.empty() && src1.depth() == CV_8U && src2.depth() == CV_8U)
    {
        if (dtype == -1 && dst.fixedType())
            dtype = dst.depth();

        if (!dst.fixedType() || dtype == dst.depth())
        {
            if (dtype == CV_16S)
            {
                Mat _dst = dst.getMat();
                if(tegra::subtract_8u8u16s(src1.getMat(), src2.getMat(), _dst))
                    return;
            }
            else if (dtype == CV_32F)
            {
                Mat _dst = dst.getMat();
                if(tegra::subtract_8u8u32f(src1.getMat(), src2.getMat(), _dst))
                    return;
            }
            else if (dtype == CV_8S)
            {
                Mat _dst = dst.getMat();
                if(tegra::subtract_8u8u8s(src1.getMat(), src2.getMat(), _dst))
                    return;
            }
        }
    }
#endif
    arithm_op(src1, src2, dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
}

void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
{
    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
}
/****************************************************************************************\
*                                    multiply/divide                                    *
\****************************************************************************************/
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, WT scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    if( scale == (WT)1. )
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = 0;
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0, t1;
                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = 0;
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}
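/* One-element scalar model of mul_ (assuming T = uchar, WT = float, the combination used by
   mul8u below):

       uchar mul_one(uchar a, uchar b, float scale)
       {
           return saturate_cast<uchar>(scale * (float)a * (float)b);
       }

   The scale == 1 specialization above skips the scale multiplication entirely, which is the
   path taken by cv::multiply() with its default scale factor. */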
template<typename T> static void
div_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, double scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int i = 0;
        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
            {
                double a = (double)src2[i] * src2[i+1];
                double b = (double)src2[i+2] * src2[i+3];
                double d = scale/(a * b);
                b *= d;
                a *= d;

                T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
                T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
                T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
                T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
            else
            {
                T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
                T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
                T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
                T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }
        #endif
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
    }
}
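/* Note on the unrolled branch of div_ above: when all four divisors are non-zero it replaces
   four divisions by one. With a = src2[i]*src2[i+1], b = src2[i+2]*src2[i+3] and
   d = scale/(a*b), both a and b are rescaled by d, so e.g.
       dst[i] = src1[i] * (b*d) * src2[i+1]
              = src1[i] * scale * src2[i+1] / a
              = src1[i] * scale / src2[i],
   matching the per-element formula of the scalar tail up to floating-point rounding. Elements
   whose divisor is zero are written as 0 by convention, same as in the else branch. */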
1893 template<typename T> static void
1894 recip_( const T*, size_t, const T* src2, size_t step2,
1895 T* dst, size_t step, Size size, double scale )
1897 step2 /= sizeof(src2[0]);
1898 step /= sizeof(dst[0]);
1900 for( ; size.height--; src2 += step2, dst += step )
1903 #if CV_ENABLE_UNROLLED
1904 for( ; i <= size.width - 4; i += 4 )
1906 if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
1908 double a = (double)src2[i] * src2[i+1];
1909 double b = (double)src2[i+2] * src2[i+3];
1910 double d = scale/(a * b);
b *= d;
a *= d;
1914 T z0 = saturate_cast<T>(src2[i+1] * b);
1915 T z1 = saturate_cast<T>(src2[i] * b);
1916 T z2 = saturate_cast<T>(src2[i+3] * a);
1917 T z3 = saturate_cast<T>(src2[i+2] * a);
1919 dst[i] = z0; dst[i+1] = z1;
1920 dst[i+2] = z2; dst[i+3] = z3;
1924 T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
1925 T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
1926 T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
1927 T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;
1929 dst[i] = z0; dst[i+1] = z1;
1930 dst[i+2] = z2; dst[i+3] = z3;
1934 for( ; i < size.width; i++ )
1935 dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
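// Note (added commentary): recip_ applies the same one-divide-per-four-elements trick
// to dst[i] = scale / src2[i]; the first two parameters are ignored, which is how the
// reciprocal table (getRecipTab) can implement cv::divide(scale, src2, dst) with no
// numerator array.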
1940 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
1941 uchar* dst, size_t step, Size sz, void* scale)
1943 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
1946 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
1947 schar* dst, size_t step, Size sz, void* scale)
1949 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
1952 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
1953 ushort* dst, size_t step, Size sz, void* scale)
1955 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
1958 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
1959 short* dst, size_t step, Size sz, void* scale)
1961 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
1964 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
1965 int* dst, size_t step, Size sz, void* scale)
1967 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
1970 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
1971 float* dst, size_t step, Size sz, void* scale)
1973 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
1976 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
1977 double* dst, size_t step, Size sz, void* scale)
1979 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
1982 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
1983 uchar* dst, size_t step, Size sz, void* scale)
if( src1 )
1986 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
else
1988 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
1991 static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
1992 schar* dst, size_t step, Size sz, void* scale)
1994 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
1997 static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
1998 ushort* dst, size_t step, Size sz, void* scale)
2000 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2003 static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
2004 short* dst, size_t step, Size sz, void* scale)
2006 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2009 static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
2010 int* dst, size_t step, Size sz, void* scale)
2012 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2015 static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
2016 float* dst, size_t step, Size sz, void* scale)
2018 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2021 static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
2022 double* dst, size_t step, Size sz, void* scale)
2024 div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2027 static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2028 uchar* dst, size_t step, Size sz, void* scale)
2030 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2033 static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
2034 schar* dst, size_t step, Size sz, void* scale)
2036 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2039 static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2040 ushort* dst, size_t step, Size sz, void* scale)
2042 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2045 static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
2046 short* dst, size_t step, Size sz, void* scale)
2048 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2051 static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
2052 int* dst, size_t step, Size sz, void* scale)
2054 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2057 static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
2058 float* dst, size_t step, Size sz, void* scale)
2060 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
2063 static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
2064 double* dst, size_t step, Size sz, void* scale)
2066 recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
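// Note (added commentary): the mulXX/divXX/recipXX wrappers above only adapt the typed
// kernels to the generic BinaryFunc signature; the scale factor is passed through the
// void* parameter as a double and narrowed to float for the 8/16-bit and 32f multiply
// kernels, while the divide and reciprocal kernels always work in double precision.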
2070 static BinaryFunc* getMulTab()
2072 static BinaryFunc mulTab[] =
2074 (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
2075 (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f,
2076 (BinaryFunc)mul64f, 0
2082 static BinaryFunc* getDivTab()
2084 static BinaryFunc divTab[] =
2086 (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u,
2087 (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f,
2088 (BinaryFunc)div64f, 0
2094 static BinaryFunc* getRecipTab()
2096 static BinaryFunc recipTab[] =
2098 (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u,
2099 (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f,
2100 (BinaryFunc)recip64f, 0
2108 void cv::multiply(InputArray src1, InputArray src2,
2109 OutputArray dst, double scale, int dtype)
2111 arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
2112 true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
2115 void cv::divide(InputArray src1, InputArray src2,
2116 OutputArray dst, double scale, int dtype)
2118 arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
2121 void cv::divide(double scale, InputArray src2,
2122 OutputArray dst, int dtype)
2124 arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
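// Illustrative usage (added commentary, not part of the original source), assuming
// cv::Mat inputs of matching size and type:
//
//   cv::Mat a, b, prod, quot, rec;
//   cv::multiply(a, b, prod, 1.0/255.0);   // prod = a.mul(b)/255, saturated per element
//   cv::divide(a, b, quot);                // quot = a/b; elements where b == 0 become 0
//   cv::divide(1.0, b, rec);               // per-element reciprocal via getRecipTab()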
2127 /****************************************************************************************\
*                                       addWeighted                                         *
2129 \****************************************************************************************/
2134 template<typename T, typename WT> static void
2135 addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
2136 T* dst, size_t step, Size size, void* _scalars )
2138 const double* scalars = (const double*)_scalars;
2139 WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
2140 step1 /= sizeof(src1[0]);
2141 step2 /= sizeof(src2[0]);
2142 step /= sizeof(dst[0]);
2144 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2147 #if CV_ENABLE_UNROLLED
2148 for( ; x <= size.width - 4; x += 4 )
2150 T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
2151 T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
2152 dst[x] = t0; dst[x+1] = t1;
2154 t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
2155 t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
2156 dst[x+2] = t0; dst[x+3] = t1;
2159 for( ; x < size.width; x++ )
2160 dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
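// Note (added commentary): addWeighted_ evaluates
//   dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma)
// with the three weights promoted to WT (float or double) before the final saturation;
// the unrolled loop only changes scheduling, not the result.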
2166 addWeighted8u( const uchar* src1, size_t step1,
2167 const uchar* src2, size_t step2,
2168 uchar* dst, size_t step, Size size,
2171 const double* scalars = (const double*)_scalars;
2172 float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
2174 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2181 __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
2182 __m128i z = _mm_setzero_si128();
2184 for( ; x <= size.width - 8; x += 8 )
2186 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
2187 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
2189 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
2190 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
2191 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
2192 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
2194 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
2195 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
2196 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
2198 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
2199 u = _mm_packus_epi16(u, u);
2201 _mm_storel_epi64((__m128i*)(dst + x), u);
2205 #if CV_ENABLE_UNROLLED
2206 for( ; x <= size.width - 4; x += 4 )
2209 t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
2210 t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
2212 dst[x] = saturate_cast<uchar>(t0);
2213 dst[x+1] = saturate_cast<uchar>(t1);
2215 t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
2216 t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
2218 dst[x+2] = saturate_cast<uchar>(t0);
2219 dst[x+3] = saturate_cast<uchar>(t1);
2223 for( ; x < size.width; x++ )
2225 float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
2226 dst[x] = saturate_cast<uchar>(t0);
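// Note (added commentary): the SSE2 path of addWeighted8u widens 8 source bytes to two
// groups of four floats (unpack with zeros, then convert to ps), forms
// u*alpha + v*beta + gamma in single precision, and packs the result back with signed
// then unsigned saturation (_mm_packs_epi32 followed by _mm_packus_epi16); the scalar
// tail loops handle the remaining pixels of each row.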
2231 static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
2232 schar* dst, size_t step, Size sz, void* scalars )
2234 addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
2237 static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2238 ushort* dst, size_t step, Size sz, void* scalars )
2240 addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
2243 static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
2244 short* dst, size_t step, Size sz, void* scalars )
2246 addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
2249 static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
2250 int* dst, size_t step, Size sz, void* scalars )
2252 addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
2255 static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
2256 float* dst, size_t step, Size sz, void* scalars )
2258 addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars);
2261 static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
2262 double* dst, size_t step, Size sz, void* scalars )
2264 addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars);
2267 static BinaryFunc* getAddWeightedTab()
2269 static BinaryFunc addWeightedTab[] =
2271 (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u),
2272 (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f,
2273 (BinaryFunc)addWeighted64f, 0
2276 return addWeightedTab;
2281 void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
2282 double beta, double gamma, OutputArray dst, int dtype )
2284 double scalars[] = {alpha, beta, gamma};
2285 arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
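// Illustrative usage (added commentary, not part of the original source): a simple
// 70/30 blend of two images of the same size and type:
//
//   cv::Mat img1, img2, blended;
//   cv::addWeighted(img1, 0.7, img2, 0.3, 0.0, blended);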
2289 /****************************************************************************************\
*                                         compare                                           *
2291 \****************************************************************************************/
2296 template<typename T> static void
2297 cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
2298 uchar* dst, size_t step, Size size, int code)
2300 step1 /= sizeof(src1[0]);
2301 step2 /= sizeof(src2[0]);
2302 if( code == CMP_GE || code == CMP_LT )
2304 std::swap(src1, src2);
2305 std::swap(step1, step2);
2306 code = code == CMP_GE ? CMP_LE : CMP_GT;
2309 if( code == CMP_GT || code == CMP_LE )
2311 int m = code == CMP_GT ? 0 : 255;
2312 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2315 #if CV_ENABLE_UNROLLED
2316 for( ; x <= size.width - 4; x += 4 )
2319 t0 = -(src1[x] > src2[x]) ^ m;
2320 t1 = -(src1[x+1] > src2[x+1]) ^ m;
2321 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
2322 t0 = -(src1[x+2] > src2[x+2]) ^ m;
2323 t1 = -(src1[x+3] > src2[x+3]) ^ m;
2324 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
2327 for( ; x < size.width; x++ )
2328 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2331 else if( code == CMP_EQ || code == CMP_NE )
2333 int m = code == CMP_EQ ? 0 : 255;
2334 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2337 #if CV_ENABLE_UNROLLED
2338 for( ; x <= size.width - 4; x += 4 )
2341 t0 = -(src1[x] == src2[x]) ^ m;
2342 t1 = -(src1[x+1] == src2[x+1]) ^ m;
2343 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
2344 t0 = -(src1[x+2] == src2[x+2]) ^ m;
2345 t1 = -(src1[x+3] == src2[x+3]) ^ m;
2346 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
2349 for( ; x < size.width; x++ )
2350 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
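// Note (added commentary): cmp_ normalises the six comparison codes down to two kernels.
// CMP_GE/CMP_LT are rewritten as CMP_LE/CMP_GT by swapping the operands, and the mask m
// (0 or 255) flips a ">" result into "<=" (or "==" into "!=") without a second compare:
// -(a > b) is 0xFF when true and 0x00 when false, so (uchar)(-(a > b) ^ m) yields the
// canonical 255/0 output for either polarity.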
2356 inline static IppCmpOp convert_cmp(int _cmpop)
2358 return _cmpop == CMP_EQ ? ippCmpEq :
2359 _cmpop == CMP_GT ? ippCmpGreater :
2360 _cmpop == CMP_GE ? ippCmpGreaterEq :
2361 _cmpop == CMP_LT ? ippCmpLess :
2362 _cmpop == CMP_LE ? ippCmpLessEq :
(IppCmpOp)-1;
2367 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2368 uchar* dst, size_t step, Size size, void* _cmpop)
2371 IppCmpOp op = convert_cmp(*(int *)_cmpop);
2374 fixSteps(size, sizeof(dst[0]), step1, step2, step);
2375 if( ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)size, op) >= 0 )
2379 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2380 int code = *(int*)_cmpop;
2381 step1 /= sizeof(src1[0]);
2382 step2 /= sizeof(src2[0]);
2383 if( code == CMP_GE || code == CMP_LT )
2385 std::swap(src1, src2);
2386 std::swap(step1, step2);
2387 code = code == CMP_GE ? CMP_LE : CMP_GT;
2390 if( code == CMP_GT || code == CMP_LE )
2392 int m = code == CMP_GT ? 0 : 255;
2393 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2398 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
2399 __m128i c128 = _mm_set1_epi8 (-128);
2400 for( ; x <= size.width - 16; x += 16 )
2402 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2403 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2404 // no simd for 8u comparison, that's why we need the trick
2405 r00 = _mm_sub_epi8(r00,c128);
2406 r10 = _mm_sub_epi8(r10,c128);
2408 r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
2409 _mm_storeu_si128((__m128i*)(dst + x),r00);
2415 for( ; x < size.width; x++ ){
2416 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2420 else if( code == CMP_EQ || code == CMP_NE )
2422 int m = code == CMP_EQ ? 0 : 255;
2423 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2428 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
2429 for( ; x <= size.width - 16; x += 16 )
2431 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2432 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2433 r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
2434 _mm_storeu_si128((__m128i*)(dst + x), r00);
2438 for( ; x < size.width; x++ )
2439 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
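// Note (added commentary): SSE2 has no unsigned byte comparison, so cmp8u biases both
// operands by 128 (_mm_sub_epi8 with 0x80) before _mm_cmpgt_epi8; the signed comparison
// of the biased values is equivalent to an unsigned comparison of the originals, and the
// same XOR-with-mask trick as in cmp_ selects between the GT and LE results.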
2444 static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
2445 uchar* dst, size_t step, Size size, void* _cmpop)
2447 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2450 static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2451 uchar* dst, size_t step, Size size, void* _cmpop)
2454 IppCmpOp op = convert_cmp(*(int *)_cmpop);
2457 fixSteps(size, sizeof(dst[0]), step1, step2, step);
2458 if( ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)size, op) >= 0 )
2462 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2465 static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
2466 uchar* dst, size_t step, Size size, void* _cmpop)
2469 IppCmpOp op = convert_cmp(*(int *)_cmpop);
2472 fixSteps(size, sizeof(dst[0]), step1, step2, step);
2473 if( ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)size, op) >= 0 )
2477 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2479 int code = *(int*)_cmpop;
2480 step1 /= sizeof(src1[0]);
2481 step2 /= sizeof(src2[0]);
2482 if( code == CMP_GE || code == CMP_LT )
2484 std::swap(src1, src2);
2485 std::swap(step1, step2);
2486 code = code == CMP_GE ? CMP_LE : CMP_GT;
2489 if( code == CMP_GT || code == CMP_LE )
2491 int m = code == CMP_GT ? 0 : 255;
2492 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2497 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
2498 for( ; x <= size.width - 16; x += 16 )
2500 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2501 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2502 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
2503 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
2504 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
2505 r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
2506 r11 = _mm_packs_epi16(r00, r01);
2507 _mm_storeu_si128((__m128i*)(dst + x), r11);
2509 if( x <= size.width-8)
2511 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2512 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2513 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
2514 r10 = _mm_packs_epi16(r00, r00);
2515 _mm_storel_epi64((__m128i*)(dst + x), r10);
2522 for( ; x < size.width; x++ ){
2523 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2527 else if( code == CMP_EQ || code == CMP_NE )
2529 int m = code == CMP_EQ ? 0 : 255;
2530 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2535 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
2536 for( ; x <= size.width - 16; x += 16 )
2538 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2539 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2540 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
2541 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
2542 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
2543 r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
2544 r11 = _mm_packs_epi16(r00, r01);
2545 _mm_storeu_si128((__m128i*)(dst + x), r11);
2547 if( x <= size.width - 8)
2549 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
2550 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
2551 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
2552 r10 = _mm_packs_epi16(r00, r00);
2553 _mm_storel_epi64((__m128i*)(dst + x), r10);
2559 for( ; x < size.width; x++ )
2560 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
2565 static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
2566 uchar* dst, size_t step, Size size, void* _cmpop)
2568 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2571 static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
2572 uchar* dst, size_t step, Size size, void* _cmpop)
2575 IppCmpOp op = convert_cmp(*(int *)_cmpop);
2578 fixSteps(size, sizeof(dst[0]), step1, step2, step);
2579 if( ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)size, op) >= 0 )
2583 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2586 static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
2587 uchar* dst, size_t step, Size size, void* _cmpop)
2589 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2592 static BinaryFunc getCmpFunc(int depth)
2594 static BinaryFunc cmpTab[] =
2596 (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s),
2597 (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s),
2598 (BinaryFunc)GET_OPTIMIZED(cmp32s),
2599 (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f,
2603 return cmpTab[depth];
2606 static double getMinVal(int depth)
2608 static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
2612 static double getMaxVal(int depth)
2614 static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
2620 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
2622 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
2623 int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
2624 int type2 = _src2.type();
2628 if ( (!doubleSupport && (depth1 == CV_64F || _src2.depth() == CV_64F)) ||
2629 !_src1.sameSize(_src2) || type1 != type2)
2634 if (cn > 1 || depth1 <= CV_32S) // FIXIT: if (cn > 4): Need to clear CPU-based compare behavior
2638 if (!doubleSupport && depth1 == CV_64F)
2641 int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
2642 int scalarcn = kercn == 3 ? 4 : kercn;
2644 const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
2647 String buildOptions = format(
2648 "-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
2649 " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s%s -D srcT1_C1=%s"
2650 " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
2651 (haveScalar ? "UNARY_OP" : "BINARY_OP"),
2652 ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
2653 ocl::typeToStr(CV_8UC(kercn)), kercn,
2654 ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
2655 operationMap[op], doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2656 ocl::typeToStr(depth1), ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
2657 ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
2658 doubleSupport ? " -D DOUBLE_SUPPORT" : ""
2661 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, buildOptions);
2665 UMat src1 = _src1.getUMat();
2666 Size size = src1.size();
2667 _dst.create(size, CV_8UC(cn));
2668 UMat dst = _dst.getUMat();
2672 size_t esz = CV_ELEM_SIZE1(type1)*scalarcn;
2673 double buf[4]={0,0,0,0};
2674 Mat src2sc = _src2.getMat();
2676 if (!src2sc.empty())
2677 convertAndUnrollScalar(src2sc, type1, (uchar*)buf, 1);
2679 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
2681 k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
2682 ocl::KernelArg::WriteOnly(dst, cn, kercn),
2687 CV_DbgAssert(type1 == type2);
2688 UMat src2 = _src2.getUMat();
2689 CV_DbgAssert(size == src2.size());
2691 _dst.create(size, CV_8UC(cn));
2693 k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
2694 ocl::KernelArg::ReadOnlyNoSize(src2),
2695 ocl::KernelArg::WriteOnly(dst, cn, kercn));
2698 size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
2699 return k.run(2, globalsize, NULL, false);
2706 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
2708 CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
2709 op == CMP_NE || op == CMP_GE || op == CMP_GT );
2711 bool haveScalar = false;
2713 if ((_src1.isMatx() + _src2.isMatx()) == 1
2714 || !_src1.sameSize(_src2)
2715 || _src1.type() != _src2.type())
2717 if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind()))
2719 op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
2720 op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
2721 // src1 is a scalar; swap it with src2
2722 compare(_src2, _src1, _dst, op);
2725 else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) )
2726 CV_Error( CV_StsUnmatchedSizes,
2727 "The operation is neither 'array op array' (where arrays have the same size and the same type), "
2728 "nor 'array op scalar', nor 'scalar op array'" );
2732 CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
2733 ocl_compare(_src1, _src2, _dst, op, haveScalar))
2735 int kind1 = _src1.kind(), kind2 = _src2.kind();
2736 Mat src1 = _src1.getMat(), src2 = _src2.getMat();
2738 if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
2740 int cn = src1.channels();
2741 _dst.create(src1.size(), CV_8UC(cn));
2742 Mat dst = _dst.getMat();
2743 Size sz = getContinuousSize(src1, src2, dst, src1.channels());
2744 getCmpFunc(src1.depth())(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, &op);
2748 int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
2750 _dst.create(src1.dims, src1.size, CV_8UC(cn));
2751 src1 = src1.reshape(1); src2 = src2.reshape(1);
2752 Mat dst = _dst.getMat().reshape(1);
2754 size_t esz = src1.elemSize();
2755 size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
2756 BinaryFunc func = getCmpFunc(depth1);
2760 const Mat* arrays[] = { &src1, &src2, &dst, 0 };
2763 NAryMatIterator it(arrays, ptrs);
2764 size_t total = it.size;
2766 for( size_t i = 0; i < it.nplanes; i++, ++it )
2767 func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
2771 const Mat* arrays[] = { &src1, &dst, 0 };
2774 NAryMatIterator it(arrays, ptrs);
2775 size_t total = it.size, blocksize = std::min(total, blocksize0);
2777 AutoBuffer<uchar> _buf(blocksize*esz);
2780 if( depth1 > CV_32S )
2781 convertAndUnrollScalar( src2, depth1, buf, blocksize );
2785 getConvertFunc(depth2, CV_64F)(src2.data, 0, 0, 0, (uchar*)&fval, 0, Size(1,1), 0);
2786 if( fval < getMinVal(depth1) )
2788 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
2792 if( fval > getMaxVal(depth1) )
2794 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
2798 int ival = cvRound(fval);
2801 if( op == CMP_LT || op == CMP_GE )
2802 ival = cvCeil(fval);
2803 else if( op == CMP_LE || op == CMP_GT )
2804 ival = cvFloor(fval);
2807 dst = Scalar::all(op == CMP_NE ? 255 : 0);
2811 convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
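// Note (added commentary): for integer sources the scalar operand is first converted to
// double; a value outside the representable range of depth1 makes the comparison
// constant, so the destination is filled wholesale above. Otherwise a non-integer scalar
// is rounded toward the side that preserves the comparison (cvCeil for CMP_LT/CMP_GE,
// cvFloor for CMP_LE/CMP_GT; EQ/NE against a non-integer value is again constant) and
// then unrolled into buf for the block-wise kernel calls below.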
2814 for( size_t i = 0; i < it.nplanes; i++, ++it )
2816 for( size_t j = 0; j < total; j += blocksize )
2818 int bsz = (int)MIN(total - j, blocksize);
2819 func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op);
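// Illustrative usage (added commentary, not part of the original source):
//
//   cv::Mat a, b, mask;
//   cv::compare(a, 10, mask, cv::CMP_GT);  // mask: 8-bit, 255 where a > 10, 0 elsewhere
//   cv::compare(a, b, mask, cv::CMP_EQ);   // element-wise equality against another array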
2827 /****************************************************************************************\
*                                         inRange                                           *
2829 \****************************************************************************************/
2834 template<typename T> static void
2835 inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
2836 const T* src3, size_t step3, uchar* dst, size_t step,
2839 step1 /= sizeof(src1[0]);
2840 step2 /= sizeof(src2[0]);
2841 step3 /= sizeof(src3[0]);
2843 for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
2846 #if CV_ENABLE_UNROLLED
2847 for( ; x <= size.width - 4; x += 4 )
2850 t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
2851 t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
2852 dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
2853 t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
2854 t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
2855 dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
2858 for( ; x < size.width; x++ )
2859 dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
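// Note (added commentary): inRange_ writes 255 where src2[x] <= src1[x] <= src3[x]
// (lower bound, value, upper bound) and 0 elsewhere; the (uchar)-t cast maps the boolean
// result onto the usual 0/255 mask convention.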
2864 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2865 const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
2867 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2870 static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
2871 const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
2873 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2876 static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2877 const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
2879 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2882 static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
2883 const short* src3, size_t step3, uchar* dst, size_t step, Size size)
2885 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2888 static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
2889 const int* src3, size_t step3, uchar* dst, size_t step, Size size)
2891 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2894 static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
2895 const float* src3, size_t step3, uchar* dst, size_t step, Size size)
2897 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2900 static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
2901 const double* src3, size_t step3, uchar* dst, size_t step, Size size)
2903 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
2906 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
2908 int k = cn % 4 ? cn % 4 : 4;
size_t i, j;
if( k == 1 )
2911 for( i = j = 0; i < len; i++, j += cn )
dst[i] = src[j];
else if( k == 2 )
2914 for( i = j = 0; i < len; i++, j += cn )
2915 dst[i] = src[j] & src[j+1];
else if( k == 3 )
2917 for( i = j = 0; i < len; i++, j += cn )
2918 dst[i] = src[j] & src[j+1] & src[j+2];
else
2920 for( i = j = 0; i < len; i++, j += cn )
2921 dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
2923 for( ; k < cn; k += 4 )
2925 for( i = 0, j = k; i < len; i++, j += cn )
2926 dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
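// Note (added commentary): for multi-channel input the per-channel masks produced by the
// kernel are interleaved in a temporary buffer; inRangeReduce ANDs the cn consecutive
// bytes of every pixel together (the first k = cn%4, or 4, channels via the if/else
// chain, then the remaining channels in groups of four), so the final mask handed back
// to cv::inRange is a single-channel 8-bit image.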
2930 typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2931 const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
2933 static InRangeFunc getInRangeFunc(int depth)
2935 static InRangeFunc inRangeTab[] =
2937 (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
2938 (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
2939 (InRangeFunc)inRange64f, 0
2942 return inRangeTab[depth];
2947 static bool ocl_inRange( InputArray _src, InputArray _lowerb,
2948 InputArray _upperb, OutputArray _dst )
2950 int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
2951 Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
2952 int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
2953 int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
2954 int cn = CV_MAT_CN(stype);
2955 bool lbScalar = false, ubScalar = false;
2957 if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
2958 ssize != lsize || stype != ltype )
2960 if( !checkScalar(_lowerb, stype, lkind, skind) )
2961 CV_Error( CV_StsUnmatchedSizes,
2962 "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
2966 if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
2967 ssize != usize || stype != utype )
2969 if( !checkScalar(_upperb, stype, ukind, skind) )
2970 CV_Error( CV_StsUnmatchedSizes,
2971 "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
2975 if (lbScalar != ubScalar)
2978 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
2979 haveScalar = lbScalar && ubScalar;
2981 if ( (!doubleSupport && sdepth == CV_64F) ||
2982 (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
2985 ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc,
2986 format("%s-D cn=%d -D T=%s%s", haveScalar ? "-D HAVE_SCALAR " : "",
2987 cn, ocl::typeToStr(sdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
2991 _dst.create(ssize, CV_8UC1);
2992 UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
2993 Mat lscalar, uscalar;
2995 if (lbScalar && ubScalar)
2997 lscalar = _lowerb.getMat();
2998 uscalar = _upperb.getMat();
3000 size_t esz = src.elemSize();
3001 size_t blocksize = 36;
3003 AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
3004 uchar *buf = alignPtr(_buf + blocksize*cn, 16);
3006 if( ldepth != sdepth && sdepth < CV_32S )
3008 int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
3009 int* iubuf = ilbuf + cn;
3011 BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
3012 sccvtfunc(lscalar.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0);
3013 sccvtfunc(uscalar.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0);
3014 int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));
3016 for( int k = 0; k < cn; k++ )
3018 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
3019 ilbuf[k] = minval+1, iubuf[k] = minval;
3021 lscalar = Mat(cn, 1, CV_32S, ilbuf);
3022 uscalar = Mat(cn, 1, CV_32S, iubuf);
3025 lscalar.convertTo(lscalar, stype);
3026 uscalar.convertTo(uscalar, stype);
3030 lscalaru = _lowerb.getUMat();
3031 uscalaru = _upperb.getUMat();
3034 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
3035 dstarg = ocl::KernelArg::WriteOnly(dst);
3039 lscalar.copyTo(lscalaru);
3040 uscalar.copyTo(uscalaru);
3042 ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
3043 ocl::KernelArg::PtrReadOnly(uscalaru));
3046 ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
3047 ocl::KernelArg::ReadOnlyNoSize(uscalaru));
3049 size_t globalsize[2] = { ssize.width, ssize.height };
3050 return ker.run(2, globalsize, NULL, false);
3057 void cv::inRange(InputArray _src, InputArray _lowerb,
3058 InputArray _upperb, OutputArray _dst)
3060 CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
3061 _upperb.dims() <= 2 && _dst.isUMat(),
3062 ocl_inRange(_src, _lowerb, _upperb, _dst))
3064 int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
3065 Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
3067 bool lbScalar = false, ubScalar = false;
3069 if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
3070 src.size != lb.size || src.type() != lb.type() )
3072 if( !checkScalar(lb, src.type(), lkind, skind) )
3073 CV_Error( CV_StsUnmatchedSizes,
3074 "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
3078 if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
3079 src.size != ub.size || src.type() != ub.type() )
3081 if( !checkScalar(ub, src.type(), ukind, skind) )
3082 CV_Error( CV_StsUnmatchedSizes,
3083 "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
3087 CV_Assert(lbScalar == ubScalar);
3089 int cn = src.channels(), depth = src.depth();
3091 size_t esz = src.elemSize();
3092 size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
3094 _dst.create(src.dims, src.size, CV_8UC1);
3095 Mat dst = _dst.getMat();
3096 InRangeFunc func = getInRangeFunc(depth);
3098 const Mat* arrays_sc[] = { &src, &dst, 0 };
3099 const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
3102 NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
3103 size_t total = it.size, blocksize = std::min(total, blocksize0);
3105 AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
3106 uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
3107 buf = alignPtr(buf + blocksize*cn, 16);
3109 if( lbScalar && ubScalar )
3112 ubuf = buf = alignPtr(buf + blocksize*esz, 16);
3114 CV_Assert( lb.type() == ub.type() );
3115 int scdepth = lb.depth();
3117 if( scdepth != depth && depth < CV_32S )
3119 int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
3120 int* iubuf = ilbuf + cn;
3122 BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
3123 sccvtfunc(lb.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0);
3124 sccvtfunc(ub.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0);
3125 int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
3127 for( int k = 0; k < cn; k++ )
3129 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
3130 ilbuf[k] = minval+1, iubuf[k] = minval;
3132 lb = Mat(cn, 1, CV_32S, ilbuf);
3133 ub = Mat(cn, 1, CV_32S, iubuf);
3136 convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
3137 convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
3140 for( size_t i = 0; i < it.nplanes; i++, ++it )
3142 for( size_t j = 0; j < total; j += blocksize )
3144 int bsz = (int)MIN(total - j, blocksize);
3145 size_t delta = bsz*esz;
3146 uchar *lptr = lbuf, *uptr = ubuf;
3154 int idx = !lbScalar ? 3 : 2;
3158 func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
3160 inRangeReduce(mbuf, ptrs[1], bsz, cn);
3167 /****************************************************************************************\
3168 * Earlier API: cvAdd etc. *
3169 \****************************************************************************************/
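// Note (added commentary): the wrappers below adapt the legacy C API (CvArr*, CvScalar)
// onto the cv:: functions defined above; cv::cvarrToMat creates Mat headers over the
// existing data, so no pixel data is copied on the way in or out.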
3172 cvNot( const CvArr* srcarr, CvArr* dstarr )
3174 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3175 CV_Assert( src.size == dst.size && src.type() == dst.type() );
3176 cv::bitwise_not( src, dst );
3181 cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
3183 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3184 dst = cv::cvarrToMat(dstarr), mask;
3185 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3187 mask = cv::cvarrToMat(maskarr);
3188 cv::bitwise_and( src1, src2, dst, mask );
3193 cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
3195 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3196 dst = cv::cvarrToMat(dstarr), mask;
3197 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3199 mask = cv::cvarrToMat(maskarr);
3200 cv::bitwise_or( src1, src2, dst, mask );
3205 cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
3207 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3208 dst = cv::cvarrToMat(dstarr), mask;
3209 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3211 mask = cv::cvarrToMat(maskarr);
3212 cv::bitwise_xor( src1, src2, dst, mask );
3217 cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
3219 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
3220 CV_Assert( src.size == dst.size && src.type() == dst.type() );
3222 mask = cv::cvarrToMat(maskarr);
3223 cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
3228 cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
3230 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
3231 CV_Assert( src.size == dst.size && src.type() == dst.type() );
3233 mask = cv::cvarrToMat(maskarr);
3234 cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
3239 cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
3241 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
3242 CV_Assert( src.size == dst.size && src.type() == dst.type() );
3244 mask = cv::cvarrToMat(maskarr);
3245 cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
3249 CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
3251 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3252 dst = cv::cvarrToMat(dstarr), mask;
3253 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3255 mask = cv::cvarrToMat(maskarr);
3256 cv::add( src1, src2, dst, mask, dst.type() );
3260 CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
3262 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3263 dst = cv::cvarrToMat(dstarr), mask;
3264 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3266 mask = cv::cvarrToMat(maskarr);
3267 cv::subtract( src1, src2, dst, mask, dst.type() );
3271 CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
3273 cv::Mat src1 = cv::cvarrToMat(srcarr1),
3274 dst = cv::cvarrToMat(dstarr), mask;
3275 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3277 mask = cv::cvarrToMat(maskarr);
3278 cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
3282 CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
3284 cv::Mat src1 = cv::cvarrToMat(srcarr1),
3285 dst = cv::cvarrToMat(dstarr), mask;
3286 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3288 mask = cv::cvarrToMat(maskarr);
3289 cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
3293 CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
3294 CvArr* dstarr, double scale )
3296 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3297 dst = cv::cvarrToMat(dstarr);
3298 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3299 cv::multiply( src1, src2, dst, scale, dst.type() );
3303 CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
3304 CvArr* dstarr, double scale )
3306 cv::Mat src2 = cv::cvarrToMat(srcarr2),
3307 dst = cv::cvarrToMat(dstarr), mask;
3308 CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );
3311 cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
3313 cv::divide( scale, src2, dst, dst.type() );
3318 cvAddWeighted( const CvArr* srcarr1, double alpha,
3319 const CvArr* srcarr2, double beta,
3320 double gamma, CvArr* dstarr )
3322 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
3323 dst = cv::cvarrToMat(dstarr);
3324 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
3325 cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
3330 cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
3332 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3333 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3335 cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
3340 cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
3342 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3343 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3345 cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
3350 cvInRange( const void* srcarr1, const void* srcarr2,
3351 const void* srcarr3, void* dstarr )
3353 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3354 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
3356 cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
3361 cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
3363 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3364 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
3366 cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
3371 cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
3373 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3374 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
3376 cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
3381 cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
3383 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3384 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
3386 cv::compare( src1, value, dst, cmp_op );
3391 cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
3393 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3394 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3396 cv::min( src1, cv::cvarrToMat(srcarr2), dst );
3401 cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
3403 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3404 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3406 cv::max( src1, cv::cvarrToMat(srcarr2), dst );
3411 cvMinS( const void* srcarr1, double value, void* dstarr )
3413 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3414 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3416 cv::min( src1, value, dst );
3421 cvMaxS( const void* srcarr1, double value, void* dstarr )
3423 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
3424 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
3426 cv::max( src1, value, dst );