modules/core/src/arithm.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // Redistribution and use in source and binary forms, with or without modification,
  19 // are permitted provided that the following conditions are met:
  20 //
  21 //   * Redistribution's of source code must retain the above copyright notice,
  22 //     this list of conditions and the following disclaimer.
  23 //
  24 //   * Redistribution's in binary form must reproduce the above copyright notice,
  25 //     this list of conditions and the following disclaimer in the documentation
  26 //     and/or other materials provided with the distribution.
  27 //
  28 //   * The name of the copyright holders may not be used to endorse or promote products
  29 //     derived from this software without specific prior written permission.
  30 //
  31 // This software is provided by the copyright holders and contributors "as is" and
  32 // any express or implied warranties, including, but not limited to, the implied
  33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  34 // In no event shall the Intel Corporation or contributors be liable for any direct,
  35 // indirect, incidental, special, exemplary, or consequential damages
  36 // (including, but not limited to, procurement of substitute goods or services;
  37 // loss of use, data, or profits; or business interruption) however caused
  38 // and on any theory of liability, whether in contract, strict liability,
  39 // or tort (including negligence or otherwise) arising in any way out of
  40 // the use of this software, even if advised of the possibility of such damage.
  41 //
  42 //M*/
  43
  44 /* ////////////////////////////////////////////////////////////////////
  45 //
  46 //  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
  47 //
  48 // */
  49
  50 #include "precomp.hpp"
  51 #include "opencl_kernels_core.hpp"
  52
  53 namespace cv
  54 {
  55
  56 /****************************************************************************************\
  57 *                                   logical operations                                   *
  58 \****************************************************************************************/
  59
  60 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
  61 {
  62     int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
  63     size_t esz = CV_ELEM_SIZE(buftype);
  64     getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
  65     // unroll the scalar
  66     if( scn < cn )
  67     {
  68         CV_Assert( scn == 1 );
  69         size_t esz1 = CV_ELEM_SIZE1(buftype);
  70         for( size_t i = esz1; i < esz; i++ )
  71             scbuf[i] = scbuf[i - esz1];
  72     }
  73     for( size_t i = esz; i < blocksize*esz; i++ )
  74         scbuf[i] = scbuf[i - esz];
  75 }
  76
  77
  78 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
  79        OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
  80        OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
  81        OCL_OP_RDIV_SCALE=15 };
  82
  83 #ifdef HAVE_OPENCL
  84
  85 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
  86     "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
  87     "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
  88
  89 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
  90                           InputArray _mask, bool bitwise, int oclop, bool haveScalar )
  91 {
  92     bool haveMask = !_mask.empty();
  93     int srctype = _src1.type();
  94     int srcdepth = CV_MAT_DEPTH(srctype);
  95     int cn = CV_MAT_CN(srctype);
  96
  97     const ocl::Device d = ocl::Device::getDefault();
  98     bool doubleSupport = d.doubleFPConfig() > 0;
  99     if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
 100             (!doubleSupport && srcdepth == CV_64F && !bitwise))
 101         return false;
 102
 103     char opts[1024];
 104     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
 105     int scalarcn = kercn == 3 ? 4 : kercn;
 106     int rowsPerWI = d.isIntel() ? 4 : 1;
 107
 108     sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
 109             haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
 110             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
 111                 ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
 112             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
 113                 ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
 114             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
 115                 ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
 116             kercn, rowsPerWI);
 117
 118     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
 119     if (k.empty())
 120         return false;
 121
 122     UMat src1 = _src1.getUMat(), src2;
 123     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
 124
 125     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
 126     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
 127                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
 128     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
 129
 130     if( haveScalar )
 131     {
 132         size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
 133         double buf[4] = {0,0,0,0};
 134
 135         if( oclop != OCL_OP_NOT )
 136         {
 137             Mat src2sc = _src2.getMat();
 138             convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
 139         }
 140
 141         ocl::KernelArg scalararg = ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, buf, esz);
 142
 143         if( !haveMask )
 144             k.args(src1arg, dstarg, scalararg);
 145         else
 146             k.args(src1arg, maskarg, dstarg, scalararg);
 147     }
 148     else
 149     {
 150         src2 = _src2.getUMat();
 151         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
 152
 153         if( !haveMask )
 154             k.args(src1arg, src2arg, dstarg);
 155         else
 156             k.args(src1arg, src2arg, maskarg, dstarg);
 157     }
 158
 159     size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI };
 160     return k.run(2, globalsize, 0, false);
 161 }
 162
 163 #endif
 164
 165 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
 166                        InputArray _mask, const BinaryFuncC* tab,
 167                        bool bitwise, int oclop )
 168 {
 169     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
 170     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
 171     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
 172     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
 173     int dims1 = psrc1->dims(), dims2 = psrc2->dims();
 174     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
 175     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
 176 #ifdef HAVE_OPENCL
 177     bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
 178             dims1 <= 2 && dims2 <= 2;
 179 #endif
 180     bool haveMask = !_mask.empty(), haveScalar = false;
 181     BinaryFuncC func;
 182
 183     if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
 184     {
 185         _dst.create(sz1, type1);
 186         CV_OCL_RUN(use_opencl,
 187                    ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))
 188
 189         if( bitwise )
 190         {
 191             func = *tab;
 192             cn = (int)CV_ELEM_SIZE(type1);
 193         }
 194         else
 195             func = tab[depth1];
 196
 197         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
 198         Size sz = getContinuousSize(src1, src2, dst);
 199         size_t len = sz.width*(size_t)cn;
 200         if( len == (size_t)(int)len )
 201         {
 202             sz.width = (int)len;
 203             func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, 0);
 204             return;
 205         }
 206     }
 207
 208     if( oclop == OCL_OP_NOT )
 209         haveScalar = true;
 210     else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
 211         !psrc1->sameSize(*psrc2) || type1 != type2 )
 212     {
 213         if( checkScalar(*psrc1, type2, kind1, kind2) )
 214         {
 215             // src1 is a scalar; swap it with src2
 216             swap(psrc1, psrc2);
 217             swap(type1, type2);
 218             swap(depth1, depth2);
 219             swap(cn, cn2);
 220             swap(sz1, sz2);
 221         }
 222         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
 223             CV_Error( CV_StsUnmatchedSizes,
 224                       "The operation is neither 'array op array' (where arrays have the same size and type), "
 225                       "nor 'array op scalar', nor 'scalar op array'" );
 226         haveScalar = true;
 227     }
 228     else
 229     {
 230         CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
 231     }
 232
 233     size_t esz = CV_ELEM_SIZE(type1);
 234     size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
 235     BinaryFunc copymask = 0;
 236     bool reallocate = false;
 237
 238     if( haveMask )
 239     {
 240         int mtype = _mask.type();
 241         CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
 242         copymask = getCopyMaskFunc(esz);
 243         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
 244     }
 245
 246     AutoBuffer<uchar> _buf;
 247     uchar *scbuf = 0, *maskbuf = 0;
 248
 249     _dst.createSameSize(*psrc1, type1);
 250     // if this is mask operation and dst has been reallocated,
 251     // we have to clear the destination
 252     if( haveMask && reallocate )
 253         _dst.setTo(0.);
 254
 255     CV_OCL_RUN(use_opencl,
 256                ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))
 257
 258
 259     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
 260     Mat dst = _dst.getMat(), mask = _mask.getMat();
 261
 262     if( bitwise )
 263     {
 264         func = *tab;
 265         cn = (int)esz;
 266     }
 267     else
 268         func = tab[depth1];
 269
 270     if( !haveScalar )
 271     {
 272         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
 273         uchar* ptrs[4];
 274
 275         NAryMatIterator it(arrays, ptrs);
 276         size_t total = it.size, blocksize = total;
 277
 278         if( blocksize*cn > INT_MAX )
 279             blocksize = INT_MAX/cn;
 280
 281         if( haveMask )
 282         {
 283             blocksize = std::min(blocksize, blocksize0);
 284             _buf.allocate(blocksize*esz);
 285             maskbuf = _buf.data();
 286         }
 287
 288         for( size_t i = 0; i < it.nplanes; i++, ++it )
 289         {
 290             for( size_t j = 0; j < total; j += blocksize )
 291             {
 292                 int bsz = (int)MIN(total - j, blocksize);
 293
 294                 func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, bsz*cn, 1, 0 );
 295                 if( haveMask )
 296                 {
 297                     copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
 298                     ptrs[3] += bsz;
 299                 }
 300
 301                 bsz *= (int)esz;
 302                 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
 303             }
 304         }
 305     }
 306     else
 307     {
 308         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
 309         uchar* ptrs[3];
 310
 311         NAryMatIterator it(arrays, ptrs);
 312         size_t total = it.size, blocksize = std::min(total, blocksize0);
 313
 314         _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
 315         scbuf = _buf.data();
 316         maskbuf = alignPtr(scbuf + blocksize*esz, 16);
 317
 318         convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
 319
 320         for( size_t i = 0; i < it.nplanes; i++, ++it )
 321         {
 322             for( size_t j = 0; j < total; j += blocksize )
 323             {
 324                 int bsz = (int)MIN(total - j, blocksize);
 325
 326                 func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, bsz*cn, 1, 0 );
 327                 if( haveMask )
 328                 {
 329                     copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
 330                     ptrs[2] += bsz;
 331                 }
 332
 333                 bsz *= (int)esz;
 334                 ptrs[0] += bsz; ptrs[1] += bsz;
 335             }
 336         }
 337     }
 338 }
 339
 340 static BinaryFuncC* getMaxTab()
 341 {
 342     static BinaryFuncC maxTab[] =
 343     {
 344         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s),
 345         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s),
 346         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32s),
 347         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32f), (BinaryFuncC)cv::hal::max64f,
 348         0
 349     };
 350
 351     return maxTab;
 352 }
 353
 354 static BinaryFuncC* getMinTab()
 355 {
 356     static BinaryFuncC minTab[] =
 357     {
 358         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s),
 359         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s),
 360         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32s),
 361         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32f), (BinaryFuncC)cv::hal::min64f,
 362         0
 363     };
 364
 365     return minTab;
 366 }
 367
 368 }
 369
 370 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
 371 {
 372     CV_INSTRUMENT_REGION()
 373
 374     BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::and8u);
 375     binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
 376 }
 377
 378 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
 379 {
 380     CV_INSTRUMENT_REGION()
 381
 382     BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::or8u);
 383     binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
 384 }
 385
 386 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
 387 {
 388     CV_INSTRUMENT_REGION()
 389
 390     BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::xor8u);
 391     binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
 392 }
 393
 394 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
 395 {
 396     CV_INSTRUMENT_REGION()
 397
 398     BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::not8u);
 399     binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
 400 }
 401
 402 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
 403 {
 404     CV_INSTRUMENT_REGION()
 405
 406     binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 407 }
 408
 409 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
 410 {
 411     CV_INSTRUMENT_REGION()
 412
 413     binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 414 }
 415
 416 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
 417 {
 418     CV_INSTRUMENT_REGION()
 419
 420     OutputArray _dst(dst);
 421     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 422 }
 423
 424 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
 425 {
 426     CV_INSTRUMENT_REGION()
 427
 428     OutputArray _dst(dst);
 429     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 430 }
 431
 432 void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
 433 {
 434     CV_INSTRUMENT_REGION()
 435
 436     OutputArray _dst(dst);
 437     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 438 }
 439
 440 void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
 441 {
 442     CV_INSTRUMENT_REGION()
 443
 444     OutputArray _dst(dst);
 445     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 446 }
 447
 448
 449 /****************************************************************************************\
 450 *                                      add/subtract                                      *
 451 \****************************************************************************************/
 452
 453 namespace cv
 454 {
 455
 456 static int actualScalarDepth(const double* data, int len)
 457 {
 458     int i = 0, minval = INT_MAX, maxval = INT_MIN;
 459     for(; i < len; ++i)
 460     {
 461         int ival = cvRound(data[i]);
 462         if( ival != data[i] )
 463             break;
 464         minval = MIN(minval, ival);
 465         maxval = MAX(maxval, ival);
 466     }
 467     return i < len ? CV_64F :
 468         minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
 469         minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
 470         minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
 471         minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
 472         CV_32S;
 473 }
 474
 475 #ifdef HAVE_OPENCL
 476
 477 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 478                           InputArray _mask, int wtype,
 479                           void* usrdata, int oclop,
 480                           bool haveScalar )
 481 {
 482     const ocl::Device d = ocl::Device::getDefault();
 483     bool doubleSupport = d.doubleFPConfig() > 0;
 484     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
 485     bool haveMask = !_mask.empty();
 486
 487     if ( (haveMask || haveScalar) && cn > 4 )
 488         return false;
 489
 490     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
 491     if (!doubleSupport)
 492         wdepth = std::min(wdepth, CV_32F);
 493
 494     wtype = CV_MAKETYPE(wdepth, cn);
 495     int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
 496     if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
 497         return false;
 498
 499     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
 500     int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
 501
 502     char cvtstr[4][32], opts[1024];
 503     sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
 504             "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
 505             "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
 506             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
 507             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
 508             ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
 509             ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
 510             ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
 511             ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
 512             ocl::typeToStr(wdepth), wdepth,
 513             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
 514             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
 515             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
 516             doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
 517             oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
 518             ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");
 519
 520     size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
 521     const uchar* usrdata_p = (const uchar*)usrdata;
 522     const double* usrdata_d = (const double*)usrdata;
 523     float usrdata_f[3];
 524     int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
 525         oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
 526     if( usrdata && n > 0 && wdepth == CV_32F )
 527     {
 528         for( i = 0; i < n; i++ )
 529             usrdata_f[i] = (float)usrdata_d[i];
 530         usrdata_p = (const uchar*)usrdata_f;
 531     }
 532
 533     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
 534     if (k.empty())
 535         return false;
 536
 537     UMat src1 = _src1.getUMat(), src2;
 538     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
 539
 540     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
 541     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
 542                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
 543     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
 544
 545     if( haveScalar )
 546     {
 547         size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
 548         double buf[4]={0,0,0,0};
 549         Mat src2sc = _src2.getMat();
 550
 551         if( !src2sc.empty() )
 552             convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
 553         ocl::KernelArg scalararg = ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, buf, esz);
 554
 555         if( !haveMask )
 556         {
 557             if(n == 0)
 558                 k.args(src1arg, dstarg, scalararg);
 559             else if(n == 1)
 560                 k.args(src1arg, dstarg, scalararg,
 561                        ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, usrdata_p, usrdata_esz));
 562             else
 563                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
 564         }
 565         else
 566             k.args(src1arg, maskarg, dstarg, scalararg);
 567     }
 568     else
 569     {
 570         src2 = _src2.getUMat();
 571         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
 572
 573         if( !haveMask )
 574         {
 575             if (n == 0)
 576                 k.args(src1arg, src2arg, dstarg);
 577             else if (n == 1)
 578                 k.args(src1arg, src2arg, dstarg,
 579                        ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, usrdata_p, usrdata_esz));
 580             else if (n == 3)
 581                 k.args(src1arg, src2arg, dstarg,
 582                        ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, usrdata_p, usrdata_esz),
 583                        ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
 584                        ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
 585             else
 586                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
 587         }
 588         else
 589             k.args(src1arg, src2arg, maskarg, dstarg);
 590     }
 591
 592     size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI };
 593     return k.run(2, globalsize, NULL, false);
 594 }
 595
 596 #endif
 597
 598 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 599                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
 600                       void* usrdata=0, int oclop=-1 )
 601 {
 602     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
 603     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
 604     bool haveMask = !_mask.empty();
 605     bool reallocate = false;
 606     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
 607     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
 608     int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
 609     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
 610     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
 611 #ifdef HAVE_OPENCL
 612     bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
 613 #endif
 614     bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
 615     bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
 616
 617     if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
 618         !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
 619                        (_dst.fixedType() && _dst.type() == type1)) &&
 620         ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
 621     {
 622         _dst.createSameSize(*psrc1, type1);
 623         CV_OCL_RUN(use_opencl,
 624             ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
 625                           (!usrdata ? type1 : std::max(depth1, CV_32F)),
 626                           usrdata, oclop, false))
 627
 628         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
 629         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
 630         tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
 631         return;
 632     }
 633
 634     bool haveScalar = false, swapped12 = false;
 635
 636     if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
 637         (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
 638         (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
 639     {
 640         if( checkScalar(*psrc1, type2, kind1, kind2) )
 641         {
 642             // src1 is a scalar; swap it with src2
 643             swap(psrc1, psrc2);
 644             swap(sz1, sz2);
 645             swap(type1, type2);
 646             swap(depth1, depth2);
 647             swap(cn, cn2);
 648             swap(dims1, dims2);
 649             swapped12 = true;
 650             if( oclop == OCL_OP_SUB )
 651                 oclop = OCL_OP_RSUB;
 652             if ( oclop == OCL_OP_DIV_SCALE )
 653                 oclop = OCL_OP_RDIV_SCALE;
 654         }
 655         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
 656             CV_Error( CV_StsUnmatchedSizes,
 657                      "The operation is neither 'array op array' "
 658                      "(where arrays have the same size and the same number of channels), "
 659                      "nor 'array op scalar', nor 'scalar op array'" );
 660         haveScalar = true;
 661         CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));
 662
 663         if (!muldiv)
 664         {
 665             Mat sc = psrc2->getMat();
 666             depth2 = actualScalarDepth(sc.ptr<double>(), sz2 == Size(1, 1) ? cn2 : cn);
 667             if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
 668                 depth2 = CV_32F;
 669         }
 670         else
 671             depth2 = CV_64F;
 672     }
 673
 674     if( dtype < 0 )
 675     {
 676         if( _dst.fixedType() )
 677             dtype = _dst.type();
 678         else
 679         {
 680             if( !haveScalar && type1 != type2 )
 681                 CV_Error(CV_StsBadArg,
 682                      "When the input arrays in add/subtract/multiply/divide functions have different types, "
 683                      "the output array type must be explicitly specified");
 684             dtype = type1;
 685         }
 686     }
 687     dtype = CV_MAT_DEPTH(dtype);
 688
 689     if( depth1 == depth2 && dtype == depth1 )
 690         wtype = dtype;
 691     else if( !muldiv )
 692     {
 693         wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
 694                 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
 695         wtype = std::max(wtype, dtype);
 696
 697         // when the result of addition should be converted to an integer type,
 698         // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation,
 699         // instead of converting the other input to floating-point and then converting the operation result back to integers.
 700         if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
 701             wtype = CV_32S;
 702     }
 703     else
 704     {
 705         wtype = std::max(depth1, std::max(depth2, CV_32F));
 706         wtype = std::max(wtype, dtype);
 707     }
 708
 709     dtype = CV_MAKETYPE(dtype, cn);
 710     wtype = CV_MAKETYPE(wtype, cn);
 711
 712     if( haveMask )
 713     {
 714         int mtype = _mask.type();
 715         CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
 716         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
 717     }
 718
 719     _dst.createSameSize(*psrc1, dtype);
 720     if( reallocate )
 721         _dst.setTo(0.);
 722
 723     CV_OCL_RUN(use_opencl,
 724                ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
 725                usrdata, oclop, haveScalar))
 726
 727     BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
 728     BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
 729     BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
 730
 731     size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
 732     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
 733     size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
 734     BinaryFunc copymask = getCopyMaskFunc(dsz);
 735     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
 736
 737     AutoBuffer<uchar> _buf;
 738     uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
 739     size_t bufesz = (cvtsrc1 ? wsz : 0) +
 740                     (cvtsrc2 || haveScalar ? wsz : 0) +
 741                     (cvtdst ? wsz : 0) +
 742                     (haveMask ? dsz : 0);
 743     BinaryFuncC func = tab[CV_MAT_DEPTH(wtype)];
 744
 745     if( !haveScalar )
 746     {
 747         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
 748         uchar* ptrs[4];
 749
 750         NAryMatIterator it(arrays, ptrs);
 751         size_t total = it.size, blocksize = total;
 752
 753         if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
 754             blocksize = std::min(blocksize, blocksize0);
 755
 756         _buf.allocate(bufesz*blocksize + 64);
 757         buf = _buf.data();
 758         if( cvtsrc1 )
 759             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
 760         if( cvtsrc2 )
 761             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
 762         wbuf = maskbuf = buf;
 763         if( cvtdst )
 764             buf = alignPtr(buf + blocksize*wsz, 16);
 765         if( haveMask )
 766             maskbuf = buf;
 767
 768         for( size_t i = 0; i < it.nplanes; i++, ++it )
 769         {
 770             for( size_t j = 0; j < total; j += blocksize )
 771             {
 772                 int bsz = (int)MIN(total - j, blocksize);
 773                 Size bszn(bsz*cn, 1);
 774                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
 775                 uchar* dptr = ptrs[2];
 776                 if( cvtsrc1 )
 777                 {
 778                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
 779                     sptr1 = buf1;
 780                 }
 781                 if( ptrs[0] == ptrs[1] )
 782                     sptr2 = sptr1;
 783                 else if( cvtsrc2 )
 784                 {
 785                     cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
 786                     sptr2 = buf2;
 787                 }
 788
 789                 if( !haveMask && !cvtdst )
 790                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
 791                 else
 792                 {
 793                     func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata );
 794                     if( !haveMask )
 795                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
 796                     else if( !cvtdst )
 797                     {
 798                         copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
 799                         ptrs[3] += bsz;
 800                     }
 801                     else
 802                     {
 803                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
 804                         copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
 805                         ptrs[3] += bsz;
 806                     }
 807                 }
 808                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
 809             }
 810         }
 811     }
 812     else
 813     {
 814         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
 815         uchar* ptrs[3];
 816
 817         NAryMatIterator it(arrays, ptrs);
 818         size_t total = it.size, blocksize = std::min(total, blocksize0);
 819
 820         _buf.allocate(bufesz*blocksize + 64);
 821         buf = _buf.data();
 822         if( cvtsrc1 )
 823             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
 824         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
 825         wbuf = maskbuf = buf;
 826         if( cvtdst )
 827             buf = alignPtr(buf + blocksize*wsz, 16);
 828         if( haveMask )
 829             maskbuf = buf;
 830
 831         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
 832
 833         for( size_t i = 0; i < it.nplanes; i++, ++it )
 834         {
 835             for( size_t j = 0; j < total; j += blocksize )
 836             {
 837                 int bsz = (int)MIN(total - j, blocksize);
 838                 Size bszn(bsz*cn, 1);
 839                 const uchar *sptr1 = ptrs[0];
 840                 const uchar* sptr2 = buf2;
 841                 uchar* dptr = ptrs[1];
 842
 843                 if( cvtsrc1 )
 844                 {
 845                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
 846                     sptr1 = buf1;
 847                 }
 848
 849                 if( swapped12 )
 850                     std::swap(sptr1, sptr2);
 851
 852                 if( !haveMask && !cvtdst )
 853                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
 854                 else
 855                 {
 856                     func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata );
 857                     if( !haveMask )
 858                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
 859                     else if( !cvtdst )
 860                     {
 861                         copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
 862                         ptrs[2] += bsz;
 863                     }
 864                     else
 865                     {
 866                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
 867                         copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
 868                         ptrs[2] += bsz;
 869                     }
 870                 }
 871                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
 872             }
 873         }
 874     }
 875 }
 876
 877 static BinaryFuncC* getAddTab()
 878 {
 879     static BinaryFuncC addTab[] =
 880     {
 881         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s),
 882         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s),
 883         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32s),
 884         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32f), (BinaryFuncC)cv::hal::add64f,
 885         0
 886     };
 887
 888     return addTab;
 889 }
 890
 891 static BinaryFuncC* getSubTab()
 892 {
 893     static BinaryFuncC subTab[] =
 894     {
 895         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s),
 896         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s),
 897         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32s),
 898         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32f), (BinaryFuncC)cv::hal::sub64f,
 899         0
 900     };
 901
 902     return subTab;
 903 }
 904
 905 static BinaryFuncC* getAbsDiffTab()
 906 {
 907     static BinaryFuncC absDiffTab[] =
 908     {
 909         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s),
 910         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s),
 911         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32s),
 912         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32f), (BinaryFuncC)cv::hal::absdiff64f,
 913         0
 914     };
 915
 916     return absDiffTab;
 917 }
 918
 919 }
 920
 921 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
 922           InputArray mask, int dtype )
 923 {
 924     CV_INSTRUMENT_REGION()
 925
 926     arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
 927 }
 928
 929 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
 930                InputArray mask, int dtype )
 931 {
 932     CV_INSTRUMENT_REGION()
 933
 934 #ifdef HAVE_TEGRA_OPTIMIZATION
 935     if (tegra::useTegra())
 936     {
 937         int kind1 = _src1.kind(), kind2 = _src2.kind();
 938         Mat src1 = _src1.getMat(), src2 = _src2.getMat();
 939         bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
 940         bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);
 941
 942         if (!src1Scalar && !src2Scalar &&
 943             src1.depth() == CV_8U && src2.type() == src1.type() &&
 944             src1.dims == 2 && src2.size() == src1.size() &&
 945             mask.empty())
 946         {
 947             if (dtype < 0)
 948             {
 949                 if (_dst.fixedType())
 950                 {
 951                     dtype = _dst.depth();
 952                 }
 953                 else
 954                 {
 955                     dtype = src1.depth();
 956                 }
 957             }
 958
 959             dtype = CV_MAT_DEPTH(dtype);
 960
 961             if (!_dst.fixedType() || dtype == _dst.depth())
 962             {
 963                 _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));
 964
 965                 if (dtype == CV_16S)
 966                 {
 967                     Mat dst = _dst.getMat();
 968                     if(tegra::subtract_8u8u16s(src1, src2, dst))
 969                         return;
 970                 }
 971                 else if (dtype == CV_32F)
 972                 {
 973                     Mat dst = _dst.getMat();
 974                     if(tegra::subtract_8u8u32f(src1, src2, dst))
 975                         return;
 976                 }
 977                 else if (dtype == CV_8S)
 978                 {
 979                     Mat dst = _dst.getMat();
 980                     if(tegra::subtract_8u8u8s(src1, src2, dst))
 981                         return;
 982                 }
 983             }
 984         }
 985     }
 986 #endif
 987     arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
 988 }
 989
 990 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
 991 {
 992     CV_INSTRUMENT_REGION()
 993
 994     arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
 995 }
 996
 997 /****************************************************************************************\
 998 *                                    multiply/divide                                     *
 999 \****************************************************************************************/
1000
1001 namespace cv
1002 {
1003
1004 static BinaryFuncC* getMulTab()
1005 {
1006     static BinaryFuncC mulTab[] =
1007     {
1008         (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u,
1009         (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f,
1010         (BinaryFuncC)cv::hal::mul64f, 0
1011     };
1012
1013     return mulTab;
1014 }
1015
1016 static BinaryFuncC* getDivTab()
1017 {
1018     static BinaryFuncC divTab[] =
1019     {
1020         (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u,
1021         (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f,
1022         (BinaryFuncC)cv::hal::div64f, 0
1023     };
1024
1025     return divTab;
1026 }
1027
1028 static BinaryFuncC* getRecipTab()
1029 {
1030     static BinaryFuncC recipTab[] =
1031     {
1032         (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u,
1033         (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f,
1034         (BinaryFuncC)cv::hal::recip64f, 0
1035     };
1036
1037     return recipTab;
1038 }
1039
1040 }
1041
1042 void cv::multiply(InputArray src1, InputArray src2,
1043                   OutputArray dst, double scale, int dtype)
1044 {
1045     CV_INSTRUMENT_REGION()
1046
1047     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
1048               true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
1049 }
1050
1051 void cv::divide(InputArray src1, InputArray src2,
1052                 OutputArray dst, double scale, int dtype)
1053 {
1054     CV_INSTRUMENT_REGION()
1055
1056     arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
1057 }
1058
1059 void cv::divide(double scale, InputArray src2,
1060                 OutputArray dst, int dtype)
1061 {
1062     CV_INSTRUMENT_REGION()
1063
1064     arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
1065 }
1066
1067 /****************************************************************************************\
1068 *                                      addWeighted                                       *
1069 \****************************************************************************************/
1070
1071 namespace cv
1072 {
1073
1074 static BinaryFuncC* getAddWeightedTab()
1075 {
1076     static BinaryFuncC addWeightedTab[] =
1077     {
1078         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u),
1079         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f,
1080         (BinaryFuncC)cv::hal::addWeighted64f, 0
1081     };
1082
1083     return addWeightedTab;
1084 }
1085
1086 }
1087
1088 void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
1089                       double beta, double gamma, OutputArray dst, int dtype )
1090 {
1091     CV_INSTRUMENT_REGION()
1092
1093     double scalars[] = {alpha, beta, gamma};
1094     arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
1095 }
1096
1097
1098 /****************************************************************************************\
1099 *                                          compare                                       *
1100 \****************************************************************************************/
1101
1102 namespace cv
1103 {
1104
1105 static BinaryFuncC getCmpFunc(int depth)
1106 {
1107     static BinaryFuncC cmpTab[] =
1108     {
1109         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s),
1110         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s),
1111         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32s),
1112         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32f), (BinaryFuncC)cv::hal::cmp64f,
1113         0
1114     };
1115
1116     return cmpTab[depth];
1117 }
1118
// Returns, as a double, the lower bound stored for element depth index
// `depth` (slots 0..6 in the 8u, 8s, 16u, 16s, 32s, 32f, 64f order; slot 7
// is 0). `depth` is used as a raw index; no range check is performed.
static double getMinVal(int depth)
{
    static const double lowerBound[] =
    {
        0,          // 8-bit unsigned
        -128,       // 8-bit signed
        0,          // 16-bit unsigned
        -32768,     // 16-bit signed
        INT_MIN,    // 32-bit signed
        -FLT_MAX,   // 32-bit float
        -DBL_MAX,   // 64-bit float
        0           // unused slot
    };

    return lowerBound[depth];
}
1124
// Returns, as a double, the upper bound stored for element depth index
// `depth` (slots 0..6 in the 8u, 8s, 16u, 16s, 32s, 32f, 64f order; slot 7
// is 0). `depth` is used as a raw index; no range check is performed.
static double getMaxVal(int depth)
{
    static const double upperBound[] =
    {
        255,        // 8-bit unsigned
        127,        // 8-bit signed
        65535,      // 16-bit unsigned
        32767,      // 16-bit signed
        INT_MAX,    // 32-bit signed
        FLT_MAX,    // 32-bit float
        DBL_MAX,    // 64-bit float
        0           // unused slot
    };

    return upperBound[depth];
}
1130
1131 #ifdef HAVE_OPENCL
1132
1133 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
1134 {
1135     const ocl::Device& dev = ocl::Device::getDefault();
1136     bool doubleSupport = dev.doubleFPConfig() > 0;
1137     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
1138             type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);
1139
1140     if (!doubleSupport && depth1 == CV_64F)
1141         return false;
1142
1143     if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
1144             return false;
1145
1146     int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
1147     // Workaround for bug with "?:" operator in AMD OpenCL compiler
1148     if (depth1 >= CV_16U)
1149         kercn = 1;
1150
1151     int scalarcn = kercn == 3 ? 4 : kercn;
1152     const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
1153     char cvt[40];
1154
1155     String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
1156                          " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
1157                          " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
1158                          haveScalar ? "UNARY_OP" : "BINARY_OP",
1159                          ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
1160                          ocl::typeToStr(CV_8UC(kercn)), kercn,
1161                          ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
1162                          operationMap[op], ocl::typeToStr(depth1),
1163                          ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
1164                          ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
1165                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
1166
1167     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
1168     if (k.empty())
1169         return false;
1170
1171     UMat src1 = _src1.getUMat();
1172     Size size = src1.size();
1173     _dst.create(size, CV_8UC(cn));
1174     UMat dst = _dst.getUMat();
1175
1176     if (haveScalar)
1177     {
1178         size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
1179         double buf[4] = { 0, 0, 0, 0 };
1180         Mat src2 = _src2.getMat();
1181
1182         if( depth1 > CV_32S )
1183             convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
1184         else
1185         {
1186             double fval = 0;
1187             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0);
1188             if( fval < getMinVal(depth1) )
1189                 return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
1190
1191             if( fval > getMaxVal(depth1) )
1192                 return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
1193
1194             int ival = cvRound(fval);
1195             if( fval != ival )
1196             {
1197                 if( op == CMP_LT || op == CMP_GE )
1198                     ival = cvCeil(fval);
1199                 else if( op == CMP_LE || op == CMP_GT )
1200                     ival = cvFloor(fval);
1201                 else
1202                     return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
1203             }
1204             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
1205         }
1206
1207         ocl::KernelArg scalararg = ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, buf, esz);
1208
1209         k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
1210                ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
1211     }
1212     else
1213     {
1214         UMat src2 = _src2.getUMat();
1215
1216         k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
1217                ocl::KernelArg::ReadOnlyNoSize(src2),
1218                ocl::KernelArg::WriteOnly(dst, cn, kercn));
1219     }
1220
1221     size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
1222     return k.run(2, globalsize, NULL, false);
1223 }
1224
1225 #endif
1226
1227 }
1228
1229 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
1230 {
1231     CV_INSTRUMENT_REGION()
1232
1233     CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
1234                op == CMP_NE || op == CMP_GE || op == CMP_GT );
1235
1236     CV_Assert(_src1.empty() == _src2.empty());
1237     if (_src1.empty() && _src2.empty())
1238     {
1239         _dst.release();
1240         return;
1241     }
1242
1243     bool haveScalar = false;
1244
1245     if ((_src1.isMatx() + _src2.isMatx()) == 1
1246             || !_src1.sameSize(_src2)
1247             || _src1.type() != _src2.type())
1248     {
1249         bool is_src1_scalar = checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind());
1250         bool is_src2_scalar = checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind());
1251
1252         if (is_src1_scalar && !is_src2_scalar)
1253         {
1254             op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
1255                 op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
1256             // src1 is a scalar; swap it with src2
1257             compare(_src2, _src1, _dst, op);
1258             return;
1259         }
1260         else if( (is_src1_scalar && is_src2_scalar) || (!is_src1_scalar && !is_src2_scalar) )
1261             CV_Error( CV_StsUnmatchedSizes,
1262                      "The operation is neither 'array op array' (where arrays have the same size and the same type), "
1263                      "nor 'array op scalar', nor 'scalar op array'" );
1264         haveScalar = true;
1265     }
1266
1267     CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
1268                ocl_compare(_src1, _src2, _dst, op, haveScalar))
1269
1270     int kind1 = _src1.kind(), kind2 = _src2.kind();
1271     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
1272
1273     if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
1274     {
1275         int cn = src1.channels();
1276         _dst.create(src1.size(), CV_8UC(cn));
1277         Mat dst = _dst.getMat();
1278         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
1279         getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, &op);
1280         return;
1281     }
1282
1283     int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
1284
1285     _dst.create(src1.dims, src1.size, CV_8UC(cn));
1286     src1 = src1.reshape(1); src2 = src2.reshape(1);
1287     Mat dst = _dst.getMat().reshape(1);
1288
1289     size_t esz = std::max(src1.elemSize(), (size_t)1);
1290     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
1291     BinaryFuncC func = getCmpFunc(depth1);
1292
1293     if( !haveScalar )
1294     {
1295         const Mat* arrays[] = { &src1, &src2, &dst, 0 };
1296         uchar* ptrs[3];
1297
1298         NAryMatIterator it(arrays, ptrs);
1299         size_t total = it.size;
1300
1301         for( size_t i = 0; i < it.nplanes; i++, ++it )
1302             func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, (int)total, 1, &op );
1303     }
1304     else
1305     {
1306         const Mat* arrays[] = { &src1, &dst, 0 };
1307         uchar* ptrs[2];
1308
1309         NAryMatIterator it(arrays, ptrs);
1310         size_t total = it.size, blocksize = std::min(total, blocksize0);
1311
1312         AutoBuffer<uchar> _buf(blocksize*esz);
1313         uchar *buf = _buf.data();
1314
1315         if( depth1 > CV_32S )
1316             convertAndUnrollScalar( src2, depth1, buf, blocksize );
1317         else
1318         {
1319             double fval=0;
1320             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0);
1321             if( fval < getMinVal(depth1) )
1322             {
1323                 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
1324                 return;
1325             }
1326
1327             if( fval > getMaxVal(depth1) )
1328             {
1329                 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
1330                 return;
1331             }
1332
1333             int ival = cvRound(fval);
1334             if( fval != ival )
1335             {
1336                 if( op == CMP_LT || op == CMP_GE )
1337                     ival = cvCeil(fval);
1338                 else if( op == CMP_LE || op == CMP_GT )
1339                     ival = cvFloor(fval);
1340                 else
1341                 {
1342                     dst = Scalar::all(op == CMP_NE ? 255 : 0);
1343                     return;
1344                 }
1345             }
1346             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
1347         }
1348
1349         for( size_t i = 0; i < it.nplanes; i++, ++it )
1350         {
1351             for( size_t j = 0; j < total; j += blocksize )
1352             {
1353                 int bsz = (int)MIN(total - j, blocksize);
1354                 func( ptrs[0], 0, buf, 0, ptrs[1], 0, bsz, 1, &op);
1355                 ptrs[0] += bsz*esz;
1356                 ptrs[1] += bsz;
1357             }
1358         }
1359     }
1360 }
1361
1362 /****************************************************************************************\
1363 *                                        inRange                                         *
1364 \****************************************************************************************/
1365
1366 namespace cv
1367 {
1368
// Generic fallback for the vectorised in-range test: it processes nothing
// and reports 0 elements handled, so the caller's scalar loop covers the
// whole row. Specialisations return how many leading elements they wrote.
template <typename T>
struct InRange_SIMD
{
    int operator () (const T * /*src1*/, const T * /*src2*/, const T * /*src3*/,
        uchar * /*dst*/, int /*len*/) const
    {
        const int processed = 0;
        return processed;
    }
};
1377
1378 #if CV_SIMD128
1379
1380 template <>
1381 struct InRange_SIMD<uchar>
1382 {
1383     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
1384         uchar * dst, int len) const
1385     {
1386         int x = 0;
1387         const int width = v_uint8x16::nlanes;
1388
1389         for (; x <= len - width; x += width)
1390         {
1391             v_uint8x16 values = v_load(src1 + x);
1392             v_uint8x16 low = v_load(src2 + x);
1393             v_uint8x16 high = v_load(src3 + x);
1394
1395             v_store(dst + x, (values >= low) & (high >= values));
1396         }
1397         return x;
1398     }
1399 };
1400
1401 template <>
1402 struct InRange_SIMD<schar>
1403 {
1404     int operator () (const schar * src1, const schar * src2, const schar * src3,
1405         uchar * dst, int len) const
1406     {
1407         int x = 0;
1408         const int width = v_int8x16::nlanes;
1409
1410         for (; x <= len - width; x += width)
1411         {
1412             v_int8x16 values = v_load(src1 + x);
1413             v_int8x16 low = v_load(src2 + x);
1414             v_int8x16 high = v_load(src3 + x);
1415
1416             v_store((schar*)(dst + x), (values >= low) & (high >= values));
1417         }
1418         return x;
1419     }
1420 };
1421
1422 template <>
1423 struct InRange_SIMD<ushort>
1424 {
1425     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
1426         uchar * dst, int len) const
1427     {
1428         int x = 0;
1429         const int width = v_uint16x8::nlanes * 2;
1430
1431         for (; x <= len - width; x += width)
1432         {
1433             v_uint16x8 values1 = v_load(src1 + x);
1434             v_uint16x8 low1 = v_load(src2 + x);
1435             v_uint16x8 high1 = v_load(src3 + x);
1436
1437             v_uint16x8 values2 = v_load(src1 + x + v_uint16x8::nlanes);
1438             v_uint16x8 low2 = v_load(src2 + x + v_uint16x8::nlanes);
1439             v_uint16x8 high2 = v_load(src3 + x + v_uint16x8::nlanes);
1440
1441             v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
1442         }
1443         return x;
1444     }
1445 };
1446
1447 template <>
1448 struct InRange_SIMD<short>
1449 {
1450     int operator () (const short * src1, const short * src2, const short * src3,
1451         uchar * dst, int len) const
1452     {
1453         int x = 0;
1454         const int width = (int)v_int16x8::nlanes * 2;
1455
1456         for (; x <= len - width; x += width)
1457         {
1458             v_int16x8 values1 = v_load(src1 + x);
1459             v_int16x8 low1 = v_load(src2 + x);
1460             v_int16x8 high1 = v_load(src3 + x);
1461
1462             v_int16x8 values2 = v_load(src1 + x + v_int16x8::nlanes);
1463             v_int16x8 low2 = v_load(src2 + x + v_int16x8::nlanes);
1464             v_int16x8 high2 = v_load(src3 + x + v_int16x8::nlanes);
1465
1466             v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
1467         }
1468         return x;
1469     }
1470 };
1471
1472 template <>
1473 struct InRange_SIMD<int>
1474 {
1475     int operator () (const int * src1, const int * src2, const int * src3,
1476         uchar * dst, int len) const
1477     {
1478         int x = 0;
1479         const int width = (int)v_int32x4::nlanes * 2;
1480
1481         for (; x <= len - width; x += width)
1482         {
1483             v_int32x4 values1 = v_load(src1 + x);
1484             v_int32x4 low1 = v_load(src2 + x);
1485             v_int32x4 high1 = v_load(src3 + x);
1486
1487             v_int32x4 values2 = v_load(src1 + x + v_int32x4::nlanes);
1488             v_int32x4 low2 = v_load(src2 + x + v_int32x4::nlanes);
1489             v_int32x4 high2 = v_load(src3 + x + v_int32x4::nlanes);
1490
1491             v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
1492         }
1493         return x;
1494     }
1495 };
1496
1497 template <>
1498 struct InRange_SIMD<float>
1499 {
1500     int operator () (const float * src1, const float * src2, const float * src3,
1501         uchar * dst, int len) const
1502     {
1503         int x = 0;
1504         const int width = (int)v_float32x4::nlanes * 2;
1505
1506         for (; x <= len - width; x += width)
1507         {
1508             v_float32x4 values1 = v_load(src1 + x);
1509             v_float32x4 low1 = v_load(src2 + x);
1510             v_float32x4 high1 = v_load(src3 + x);
1511
1512             v_float32x4 values2 = v_load(src1 + x + v_float32x4::nlanes);
1513             v_float32x4 low2 = v_load(src2 + x + v_float32x4::nlanes);
1514             v_float32x4 high2 = v_load(src3 + x + v_float32x4::nlanes);
1515
1516             v_pack_store(dst + x, v_pack(v_reinterpret_as_u32((values1 >= low1) & (high1 >= values1)), v_reinterpret_as_u32((values2 >= low2) & (high2 >= values2))));
1517         }
1518         return x;
1519     }
1520 };
1521
1522 #endif
1523
1524 template <typename T>
1525 static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
1526          const T* src3, size_t step3, uchar* dst, size_t step,
1527          Size size)
1528 {
1529     step1 /= sizeof(src1[0]);
1530     step2 /= sizeof(src2[0]);
1531     step3 /= sizeof(src3[0]);
1532
1533     InRange_SIMD<T> vop;
1534
1535     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
1536     {
1537         int x = vop(src1, src2, src3, dst, size.width);
1538         #if CV_ENABLE_UNROLLED
1539         for( ; x <= size.width - 4; x += 4 )
1540         {
1541             int t0, t1;
1542             t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
1543             t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
1544             dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
1545             t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
1546             t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
1547             dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
1548         }
1549         #endif
1550         for( ; x < size.width; x++ )
1551             dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
1552     }
1553 }
1554
1555
1556 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
1557                       const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
1558 {
1559     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
1560 }
1561
1562 static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
1563                       const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
1564 {
1565     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
1566 }
1567
1568 static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
1569                        const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
1570 {
1571     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
1572 }
1573
1574 static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
1575                        const short* src3, size_t step3, uchar* dst, size_t step, Size size)
1576 {
1577     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
1578 }
1579
1580 static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
1581                        const int* src3, size_t step3, uchar* dst, size_t step, Size size)
1582 {
1583     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
1584 }
1585
// Non-template entry point for float data: forwards every argument unchanged to inRange_.
// src1 is the tested data, src2/src3 are the lower/upper bounds, dst receives the mask.
static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
                       const float* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}
1591
// Non-template entry point for double data: forwards every argument unchanged to inRange_.
// src1 is the tested data, src2/src3 are the lower/upper bounds, dst receives the mask.
static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
                       const double* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}
1597
1598 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
1599 {
1600     int k = cn % 4 ? cn % 4 : 4;
1601     size_t i, j;
1602     if( k == 1 )
1603         for( i = j = 0; i < len; i++, j += cn )
1604             dst[i] = src[j];
1605     else if( k == 2 )
1606         for( i = j = 0; i < len; i++, j += cn )
1607             dst[i] = src[j] & src[j+1];
1608     else if( k == 3 )
1609         for( i = j = 0; i < len; i++, j += cn )
1610             dst[i] = src[j] & src[j+1] & src[j+2];
1611     else
1612         for( i = j = 0; i < len; i++, j += cn )
1613             dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
1614
1615     for( ; k < cn; k += 4 )
1616     {
1617         for( i = 0, j = k; i < len; i++, j += cn )
1618             dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
1619     }
1620 }
1621
// Common, type-erased signature under which the per-depth inRange* kernels are stored and
// called: every data pointer is passed as uchar* regardless of the real element type.
// src1 is the tested data, src2/src3 the lower/upper bounds, dst the output mask.
typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                             const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
1624
1625 static InRangeFunc getInRangeFunc(int depth)
1626 {
1627     static InRangeFunc inRangeTab[] =
1628     {
1629         (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
1630         (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
1631         (InRangeFunc)inRange64f, 0
1632     };
1633
1634     return inRangeTab[depth];
1635 }
1636
1637 #ifdef HAVE_OPENCL
1638
// OpenCL implementation of inRange.
// Returns true when the "inrange" kernel was built and run; returns false when this path
// does not apply (only one bound is a scalar, CV_64F data without device double support,
// array bounds whose depth differs from src, or the kernel could not be created).
// Raises CV_StsUnmatchedSizes when a bound is neither a same-size/same-type array nor a scalar.
static bool ocl_inRange( InputArray _src, InputArray _lowerb,
                         InputArray _upperb, OutputArray _dst )
{
    const ocl::Device & d = ocl::Device::getDefault();
    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
    int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
    int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
    // Each work item handles 4 rows on Intel devices and 1 row elsewhere.
    int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
    bool lbScalar = false, ubScalar = false;

    // A bound that does not match src in size and type must qualify as a scalar.
    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != lsize || stype != ltype )
    {
        if( !checkScalar(_lowerb, stype, lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != usize || stype != utype )
    {
        if( !checkScalar(_upperb, stype, ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    // Mixed scalar/array bounds are not handled by this path.
    if (lbScalar != ubScalar)
        return false;

    bool doubleSupport = d.doubleFPConfig() > 0,
            haveScalar = lbScalar && ubScalar;

    if ( (!doubleSupport && sdepth == CV_64F) ||
         (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
        return false;

    // kercn is the number of source values one work item loads per row: exactly cn with
    // scalar bounds, otherwise the predicted vector width (capped at 4, at least cn).
    // If it is not a multiple of cn it falls back to cn, so colsPerWI is always exact.
    int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
    if (kercn % cn != 0)
        kercn = cn;
    int colsPerWI = kercn / cn;
    String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
                           haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
                           ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);

    ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
    if (ker.empty())
        return false;

    _dst.create(ssize, CV_8UC1);
    UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
    Mat lscalar, uscalar;

    if (lbScalar && ubScalar)
    {
        lscalar = _lowerb.getMat();
        uscalar = _upperb.getMat();

        size_t esz = src.elemSize();
        // NOTE(review): 36 only sizes the scratch buffer below; the reason for this
        // particular value is not visible here.
        size_t blocksize = 36;

        AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
        uchar *buf = alignPtr(_buf.data() + blocksize*cn, 16);

        // For source depths below CV_32S with bounds of another depth, the bounds are first
        // converted to int so they can be validated against the source value range.
        if( ldepth != sdepth && sdepth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
            sccvtfunc(lscalar.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(uscalar.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));

            for( int k = 0; k < cn; k++ )
            {
                // An empty or unreachable interval is replaced by [minval+1, minval],
                // whose lower bound exceeds its upper bound.
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lscalar = Mat(cn, 1, CV_32S, ilbuf);
            uscalar = Mat(cn, 1, CV_32S, iubuf);
        }

        lscalar.convertTo(lscalar, stype);
        uscalar.convertTo(uscalar, stype);
    }
    else
    {
        // Array bounds are handed to the kernel as UMat arguments further down.
        lscalaru = _lowerb.getUMat();
        uscalaru = _upperb.getUMat();
    }

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);

    if (haveScalar)
    {
        // Copy the converted scalar bounds into UMats and pass them as read-only pointers.
        lscalar.copyTo(lscalaru);
        uscalar.copyTo(uscalaru);

        ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
               ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
    }
    else
        ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
               ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);

    // One work item per colsPerWI columns and per rowsPerWI rows (rows rounded up).
    size_t globalsize[2] = { (size_t)ssize.width / colsPerWI, ((size_t)ssize.height + rowsPerWI - 1) / rowsPerWI };
    return ker.run(2, globalsize, NULL, false);
}
1752
1753 #endif
1754
1755 }
1756
1757 void cv::inRange(InputArray _src, InputArray _lowerb,
1758                  InputArray _upperb, OutputArray _dst)
1759 {
1760     CV_INSTRUMENT_REGION()
1761
1762     CV_Assert(! _src.empty());
1763
1764     CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
1765                _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
1766                ocl_inRange(_src, _lowerb, _upperb, _dst))
1767
1768     int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
1769     Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
1770
1771     bool lbScalar = false, ubScalar = false;
1772
1773     if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
1774         src.size != lb.size || src.type() != lb.type() )
1775     {
1776         if( !checkScalar(lb, src.type(), lkind, skind) )
1777             CV_Error( CV_StsUnmatchedSizes,
1778                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
1779         lbScalar = true;
1780     }
1781
1782     if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
1783         src.size != ub.size || src.type() != ub.type() )
1784     {
1785         if( !checkScalar(ub, src.type(), ukind, skind) )
1786             CV_Error( CV_StsUnmatchedSizes,
1787                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
1788         ubScalar = true;
1789     }
1790
1791     CV_Assert(lbScalar == ubScalar);
1792
1793     int cn = src.channels(), depth = src.depth();
1794
1795     size_t esz = src.elemSize();
1796     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
1797
1798     _dst.create(src.dims, src.size, CV_8UC1);
1799     Mat dst = _dst.getMat();
1800     InRangeFunc func = getInRangeFunc(depth);
1801
1802     const Mat* arrays_sc[] = { &src, &dst, 0 };
1803     const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
1804     uchar* ptrs[4];
1805
1806     NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
1807     size_t total = it.size, blocksize = std::min(total, blocksize0);
1808
1809     AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
1810     uchar *buf = _buf.data(), *mbuf = buf, *lbuf = 0, *ubuf = 0;
1811     buf = alignPtr(buf + blocksize*cn, 16);
1812
1813     if( lbScalar && ubScalar )
1814     {
1815         lbuf = buf;
1816         ubuf = buf = alignPtr(buf + blocksize*esz, 16);
1817
1818         CV_Assert( lb.type() == ub.type() );
1819         int scdepth = lb.depth();
1820
1821         if( scdepth != depth && depth < CV_32S )
1822         {
1823             int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
1824             int* iubuf = ilbuf + cn;
1825
1826             BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
1827             sccvtfunc(lb.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
1828             sccvtfunc(ub.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
1829             int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
1830
1831             for( int k = 0; k < cn; k++ )
1832             {
1833                 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
1834                     ilbuf[k] = minval+1, iubuf[k] = minval;
1835             }
1836             lb = Mat(cn, 1, CV_32S, ilbuf);
1837             ub = Mat(cn, 1, CV_32S, iubuf);
1838         }
1839
1840         convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
1841         convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
1842     }
1843
1844     for( size_t i = 0; i < it.nplanes; i++, ++it )
1845     {
1846         for( size_t j = 0; j < total; j += blocksize )
1847         {
1848             int bsz = (int)MIN(total - j, blocksize);
1849             size_t delta = bsz*esz;
1850             uchar *lptr = lbuf, *uptr = ubuf;
1851             if( !lbScalar )
1852             {
1853                 lptr = ptrs[2];
1854                 ptrs[2] += delta;
1855             }
1856             if( !ubScalar )
1857             {
1858                 int idx = !lbScalar ? 3 : 2;
1859                 uptr = ptrs[idx];
1860                 ptrs[idx] += delta;
1861             }
1862             func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
1863             if( cn > 1 )
1864                 inRangeReduce(mbuf, ptrs[1], bsz, cn);
1865             ptrs[0] += delta;
1866             ptrs[1] += bsz;
1867         }
1868     }
1869 }
1870
1871 /****************************************************************************************\
1872 *                                Earlier API: cvAdd etc.                                 *
1873 \****************************************************************************************/
1874
1875 CV_IMPL void
1876 cvNot( const CvArr* srcarr, CvArr* dstarr )
1877 {
1878     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
1879     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1880     cv::bitwise_not( src, dst );
1881 }
1882
1883
1884 CV_IMPL void
1885 cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
1886 {
1887     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
1888         dst = cv::cvarrToMat(dstarr), mask;
1889     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
1890     if( maskarr )
1891         mask = cv::cvarrToMat(maskarr);
1892     cv::bitwise_and( src1, src2, dst, mask );
1893 }
1894
1895
1896 CV_IMPL void
1897 cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
1898 {
1899     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
1900         dst = cv::cvarrToMat(dstarr), mask;
1901     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
1902     if( maskarr )
1903         mask = cv::cvarrToMat(maskarr);
1904     cv::bitwise_or( src1, src2, dst, mask );
1905 }
1906
1907
1908 CV_IMPL void
1909 cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
1910 {
1911     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
1912         dst = cv::cvarrToMat(dstarr), mask;
1913     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
1914     if( maskarr )
1915         mask = cv::cvarrToMat(maskarr);
1916     cv::bitwise_xor( src1, src2, dst, mask );
1917 }
1918
1919
1920 CV_IMPL void
1921 cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
1922 {
1923     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
1924     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1925     if( maskarr )
1926         mask = cv::cvarrToMat(maskarr);
1927     cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
1928 }
1929
1930
1931 CV_IMPL void
1932 cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
1933 {
1934     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
1935     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1936     if( maskarr )
1937         mask = cv::cvarrToMat(maskarr);
1938     cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
1939 }
1940
1941
1942 CV_IMPL void
1943 cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
1944 {
1945     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
1946     CV_Assert( src.size == dst.size && src.type() == dst.type() );
1947     if( maskarr )
1948         mask = cv::cvarrToMat(maskarr);
1949     cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
1950 }
1951
1952
1953 CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
1954 {
1955     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
1956         dst = cv::cvarrToMat(dstarr), mask;
1957     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
1958     if( maskarr )
1959         mask = cv::cvarrToMat(maskarr);
1960     cv::add( src1, src2, dst, mask, dst.type() );
1961 }
1962
1963
1964 CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
1965 {
1966     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
1967         dst = cv::cvarrToMat(dstarr), mask;
1968     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
1969     if( maskarr )
1970         mask = cv::cvarrToMat(maskarr);
1971     cv::subtract( src1, src2, dst, mask, dst.type() );
1972 }
1973
1974
1975 CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
1976 {
1977     cv::Mat src1 = cv::cvarrToMat(srcarr1),
1978         dst = cv::cvarrToMat(dstarr), mask;
1979     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
1980     if( maskarr )
1981         mask = cv::cvarrToMat(maskarr);
1982     cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
1983 }
1984
1985
1986 CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
1987 {
1988     cv::Mat src1 = cv::cvarrToMat(srcarr1),
1989         dst = cv::cvarrToMat(dstarr), mask;
1990     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
1991     if( maskarr )
1992         mask = cv::cvarrToMat(maskarr);
1993     cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
1994 }
1995
1996
1997 CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
1998                     CvArr* dstarr, double scale )
1999 {
2000     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
2001         dst = cv::cvarrToMat(dstarr);
2002     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
2003     cv::multiply( src1, src2, dst, scale, dst.type() );
2004 }
2005
2006
2007 CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
2008                     CvArr* dstarr, double scale )
2009 {
2010     cv::Mat src2 = cv::cvarrToMat(srcarr2),
2011         dst = cv::cvarrToMat(dstarr), mask;
2012     CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );
2013
2014     if( srcarr1 )
2015         cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
2016     else
2017         cv::divide( scale, src2, dst, dst.type() );
2018 }
2019
2020
2021 CV_IMPL void
2022 cvAddWeighted( const CvArr* srcarr1, double alpha,
2023                const CvArr* srcarr2, double beta,
2024                double gamma, CvArr* dstarr )
2025 {
2026     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
2027         dst = cv::cvarrToMat(dstarr);
2028     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
2029     cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
2030 }
2031
2032
2033 CV_IMPL  void
2034 cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
2035 {
2036     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2037     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2038
2039     cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
2040 }
2041
2042
2043 CV_IMPL void
2044 cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
2045 {
2046     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2047     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2048
2049     cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
2050 }
2051
2052
2053 CV_IMPL void
2054 cvInRange( const void* srcarr1, const void* srcarr2,
2055            const void* srcarr3, void* dstarr )
2056 {
2057     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2058     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
2059
2060     cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
2061 }
2062
2063
2064 CV_IMPL void
2065 cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
2066 {
2067     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2068     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
2069
2070     cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
2071 }
2072
2073
2074 CV_IMPL void
2075 cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
2076 {
2077     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2078     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
2079
2080     cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
2081 }
2082
2083
2084 CV_IMPL void
2085 cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
2086 {
2087     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2088     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
2089
2090     cv::compare( src1, value, dst, cmp_op );
2091 }
2092
2093
2094 CV_IMPL void
2095 cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
2096 {
2097     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2098     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2099
2100     cv::min( src1, cv::cvarrToMat(srcarr2), dst );
2101 }
2102
2103
2104 CV_IMPL void
2105 cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
2106 {
2107     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2108     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2109
2110     cv::max( src1, cv::cvarrToMat(srcarr2), dst );
2111 }
2112
2113
2114 CV_IMPL void
2115 cvMinS( const void* srcarr1, double value, void* dstarr )
2116 {
2117     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2118     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2119
2120     cv::min( src1, value, dst );
2121 }
2122
2123
2124 CV_IMPL void
2125 cvMaxS( const void* srcarr1, double value, void* dstarr )
2126 {
2127     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
2128     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
2129
2130     cv::max( src1, value, dst );
2131 }
2132
2133
2134
2135 namespace cv { namespace hal {
2136
2137 //=======================================
2138
// IPP dispatch helpers for the binary HAL functions below. When ARITHM_USE_IPP is not 1,
// all four CALL_IPP_BIN_* macros expand to nothing.
#if (ARITHM_USE_IPP == 1)
// For single-row input, overwrites all three steps with the row length in bytes
// (width*elemSize); multi-row input is left untouched.
static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
    if( height == 1 )
        step1 = step2 = step = width*elemSize;
}
// The macros below expand inside a HAL function body and use its locals by name:
// src1, src2, dst, step1, step2, step, width and height. Under CV_IPP_CHECK() (defined
// outside this section) they call the IPP function `fun`; a non-negative status records
// the IPP usage and returns from the enclosing function, otherwise setIppErrorStatus()
// is called and execution falls through to the code after the macro.
//
// CALL_IPP_BIN_E_12: operands in (src1, src2) order, plus a trailing 0 argument
// (presumably the scale factor of the "Sfs" IPP variants - confirm against the IPP docs).
#define CALL_IPP_BIN_E_12(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }

// CALL_IPP_BIN_E_21: same as CALL_IPP_BIN_E_12 but with the operands swapped (src2, src1).
#define CALL_IPP_BIN_E_21(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }

// CALL_IPP_BIN_12: operands in (src1, src2) order, without the trailing 0 argument.
#define CALL_IPP_BIN_12(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height))) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }

// CALL_IPP_BIN_21: operands swapped (src2, src1), without the trailing 0 argument.
#define CALL_IPP_BIN_21(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }

#else
#define CALL_IPP_BIN_E_12(fun)
#define CALL_IPP_BIN_E_21(fun)
#define CALL_IPP_BIN_12(fun)
#define CALL_IPP_BIN_21(fun)
#endif
2199
2200
2201 //=======================================
2202 // Add
2203 //=======================================
2204
// Element-wise addition of two 2D uchar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add8u (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiAdd_8u_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpAdd.
// The trailing unnamed void* parameter is not used.
void add8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add8u, cv_hal_add8u, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_12(ippiAdd_8u_C1RSfs)
    (vBinOp<uchar, cv::OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
2213
// Element-wise addition of two 2D schar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add8s (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp kernel
// with cv::OpAdd. The trailing unnamed void* parameter is not used.
void add8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add8s, cv_hal_add8s, src1, step1, src2, step2, dst, step, width, height)
    vBinOp<schar, cv::OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
2221
// Element-wise addition of two 2D ushort arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add16u (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiAdd_16u_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpAdd.
// The trailing unnamed void* parameter is not used.
void add16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add16u, cv_hal_add16u, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_12(ippiAdd_16u_C1RSfs)
    (vBinOp<ushort, cv::OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
2230
// Element-wise addition of two 2D short arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add16s (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiAdd_16s_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpAdd.
// The trailing unnamed void* parameter is not used.
void add16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add16s, cv_hal_add16s, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_12(ippiAdd_16s_C1RSfs)
    (vBinOp<short, cv::OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
2239
// Element-wise addition of two 2D int arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add32s (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp32 kernel
// with cv::OpAdd. The trailing unnamed void* parameter is not used.
void add32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add32s, cv_hal_add32s, src1, step1, src2, step2, dst, step, width, height)
    vBinOp32<int, cv::OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
2247
// Element-wise addition of two 2D float arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add32f (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiAdd_32f_C1R when IPP is
// compiled in, and finally the generic vBinOp32 kernel with cv::OpAdd.
// The trailing unnamed void* parameter is not used.
void add32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add32f, cv_hal_add32f, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_12(ippiAdd_32f_C1R)
    (vBinOp32<float, cv::OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
2256
// Element-wise addition of two 2D double arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_add64f (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp64 kernel
// with cv::OpAdd. The trailing unnamed void* parameter is not used.
void add64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(add64f, cv_hal_add64f, src1, step1, src2, step2, dst, step, width, height)
    vBinOp64<double, cv::OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
2264
2265 //=======================================
2266 // Subtract
2267 //=======================================
2268
// Element-wise subtraction of two 2D uchar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub8u (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiSub_8u_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpSub.
// The IPP call receives src2 before src1 (the _21 macro) - presumably because ippiSub
// subtracts its first operand from its second; confirm against the IPP reference.
void sub8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub8u, cv_hal_sub8u, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_21(ippiSub_8u_C1RSfs)
    (vBinOp<uchar, cv::OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
2277
// Element-wise subtraction of two 2D schar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub8s (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp kernel
// with cv::OpSub. The trailing unnamed void* parameter is not used.
void sub8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub8s, cv_hal_sub8s, src1, step1, src2, step2, dst, step, width, height)
    vBinOp<schar, cv::OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
2285
// Element-wise subtraction of two 2D ushort arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub16u (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiSub_16u_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpSub.
// The IPP call receives src2 before src1 (the _21 macro) - presumably because ippiSub
// subtracts its first operand from its second; confirm against the IPP reference.
void sub16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub16u, cv_hal_sub16u, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_21(ippiSub_16u_C1RSfs)
    (vBinOp<ushort, cv::OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
2294
// Element-wise subtraction of two 2D short arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub16s (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiSub_16s_C1RSfs when IPP is
// compiled in, and finally the generic vBinOp kernel with cv::OpSub.
// The IPP call receives src2 before src1 (the _21 macro) - presumably because ippiSub
// subtracts its first operand from its second; confirm against the IPP reference.
void sub16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub16s, cv_hal_sub16s, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_E_21(ippiSub_16s_C1RSfs)
    (vBinOp<short, cv::OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
2303
// Element-wise subtraction of two 2D int arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub32s (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp32 kernel
// with cv::OpSub. The trailing unnamed void* parameter is not used.
void sub32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub32s, cv_hal_sub32s, src1, step1, src2, step2, dst, step, width, height)
    vBinOp32<int, cv::OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
2311
// Element-wise subtraction of two 2D float arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub32f (macro defined outside this section;
// presumably returns when the HAL handles the call), then ippiSub_32f_C1R when IPP is
// compiled in, and finally the generic vBinOp32 kernel with cv::OpSub.
// The IPP call receives src2 before src1 (the _21 macro) - presumably because ippiSub
// subtracts its first operand from its second; confirm against the IPP reference.
void sub32f( const float* src1, size_t step1,
                   const float* src2, size_t step2,
                   float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub32f, cv_hal_sub32f, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_BIN_21(ippiSub_32f_C1R)
    (vBinOp32<float, cv::OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
2320
// Element-wise subtraction of two 2D double arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_sub64f (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp64 kernel
// with cv::OpSub. The trailing unnamed void* parameter is not used.
void sub64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(sub64f, cv_hal_sub64f, src1, step1, src2, step2, dst, step, width, height)
    vBinOp64<double, cv::OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
2328
2329 //=======================================
2330
// IPP dispatch helper for the min/max HAL functions; expands to nothing when
// ARITHM_USE_IPP is not 1. It expands inside a HAL function body and uses its locals by
// name: src1, src2, dst, step1, step2, step, width and height.
// Under CV_IPP_CHECK() (defined outside this section) the 1D IPP function `fun` is called
// once per row, advancing the three pointers by their byte steps. If every row succeeds,
// the IPP usage is recorded and the enclosing function returns; on the first negative
// status the loop stops, setIppErrorStatus() is called and execution falls through to
// the code after the macro. The casts to type* drop the const qualifier of src1/src2.
#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_MIN_MAX(fun, type) \
    CV_IPP_CHECK() \
    { \
        type* s1 = (type*)src1; \
        type* s2 = (type*)src2; \
        type* d  = dst; \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        int i = 0; \
        for(; i < height; i++) \
        { \
            if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \
                break; \
            s1 = (type*)((uchar*)s1 + step1); \
            s2 = (type*)((uchar*)s2 + step2); \
            d  = (type*)((uchar*)d + step); \
        } \
        if (i == height) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }
#else
#define CALL_IPP_MIN_MAX(fun, type)
#endif
2358
2359 //=======================================
2360 // Max
2361 //=======================================
2362
// Element-wise maximum of two 2D uchar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_max8u (macro defined outside this section;
// presumably returns when the HAL handles the call), then row-by-row ippsMaxEvery_8u when
// IPP is compiled in, and finally the generic vBinOp kernel with cv::OpMax.
// The trailing unnamed void* parameter is not used.
void max8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(max8u, cv_hal_max8u, src1, step1, src2, step2, dst, step, width, height)
    CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar)
    vBinOp<uchar, cv::OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}
2371
// Element-wise maximum of two 2D schar arrays (width x height) into dst.
// Dispatch order: CALL_HAL with cv_hal_max8s (macro defined outside this section;
// presumably returns when the HAL handles the call), then the generic vBinOp kernel
// with cv::OpMax. The trailing unnamed void* parameter is not used.
void max8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(max8s, cv_hal_max8s, src1, step1, src2, step2, dst, step, width, height)
    vBinOp<schar, cv::OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
2379
2380 void max16u( const ushort* src1, size_t step1,
2381                     const ushort* src2, size_t step2,
2382                     ushort* dst, size_t step, int width, int height, void* )
2383 {
2384     CALL_HAL(max16u, cv_hal_max16u, src1, step1, src2, step2, dst, step, width, height)
2385     CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort)
2386     vBinOp<ushort, cv::OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
2387 }
2388
2389 void max16s( const short* src1, size_t step1,
2390                     const short* src2, size_t step2,
2391                     short* dst, size_t step, int width, int height, void* )
2392 {
2393     CALL_HAL(max16s, cv_hal_max16s, src1, step1, src2, step2, dst, step, width, height)
2394     vBinOp<short, cv::OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, width, height);
2395 }
2396
2397 void max32s( const int* src1, size_t step1,
2398                     const int* src2, size_t step2,
2399                     int* dst, size_t step, int width, int height, void* )
2400 {
2401     CALL_HAL(max32s, cv_hal_max32s, src1, step1, src2, step2, dst, step, width, height)
2402     vBinOp32<int, cv::OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, width, height);
2403 }
2404
2405 void max32f( const float* src1, size_t step1,
2406                     const float* src2, size_t step2,
2407                     float* dst, size_t step, int width, int height, void* )
2408 {
2409     CALL_HAL(max32f, cv_hal_max32f, src1, step1, src2, step2, dst, step, width, height)
2410     CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float)
2411     vBinOp32<float, cv::OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, width, height);
2412 }
2413
2414 void max64f( const double* src1, size_t step1,
2415                     const double* src2, size_t step2,
2416                     double* dst, size_t step, int width, int height, void* )
2417 {
2418     CALL_HAL(max64f, cv_hal_max64f, src1, step1, src2, step2, dst, step, width, height)
2419     CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double)
2420     vBinOp64<double, cv::OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, width, height);
2421 }
2422
2423 //=======================================
2424 // Min
2425 //=======================================
2426
2427 void min8u( const uchar* src1, size_t step1,
2428                    const uchar* src2, size_t step2,
2429                    uchar* dst, size_t step, int width, int height, void* )
2430 {
2431     CALL_HAL(min8u, cv_hal_min8u, src1, step1, src2, step2, dst, step, width, height)
2432     CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar)
2433     vBinOp<uchar, cv::OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
2434 }
2435
2436 void min8s( const schar* src1, size_t step1,
2437                    const schar* src2, size_t step2,
2438                    schar* dst, size_t step, int width, int height, void* )
2439 {
2440     CALL_HAL(min8s, cv_hal_min8s, src1, step1, src2, step2, dst, step, width, height)
2441     vBinOp<schar, cv::OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, width, height);
2442 }
2443
2444 void min16u( const ushort* src1, size_t step1,
2445                     const ushort* src2, size_t step2,
2446                     ushort* dst, size_t step, int width, int height, void* )
2447 {
2448     CALL_HAL(min16u, cv_hal_min16u, src1, step1, src2, step2, dst, step, width, height)
2449     CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort)
2450     vBinOp<ushort, cv::OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
2451 }
2452
2453 void min16s( const short* src1, size_t step1,
2454                     const short* src2, size_t step2,
2455                     short* dst, size_t step, int width, int height, void* )
2456 {
2457     CALL_HAL(min16s, cv_hal_min16s, src1, step1, src2, step2, dst, step, width, height)
2458     vBinOp<short, cv::OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, width, height);
2459 }
2460
2461 void min32s( const int* src1, size_t step1,
2462                     const int* src2, size_t step2,
2463                     int* dst, size_t step, int width, int height, void* )
2464 {
2465     CALL_HAL(min32s, cv_hal_min32s, src1, step1, src2, step2, dst, step, width, height)
2466     vBinOp32<int, cv::OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, width, height);
2467 }
2468
2469 void min32f( const float* src1, size_t step1,
2470                     const float* src2, size_t step2,
2471                     float* dst, size_t step, int width, int height, void* )
2472 {
2473     CALL_HAL(min32f, cv_hal_min32f, src1, step1, src2, step2, dst, step, width, height)
2474     CALL_IPP_MIN_MAX(ippsMinEvery_32f, float)
2475     vBinOp32<float, cv::OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, width, height);
2476 }
2477
2478 void min64f( const double* src1, size_t step1,
2479                     const double* src2, size_t step2,
2480                     double* dst, size_t step, int width, int height, void* )
2481 {
2482     CALL_HAL(min64f, cv_hal_min64f, src1, step1, src2, step2, dst, step, width, height)
2483     CALL_IPP_MIN_MAX(ippsMinEvery_64f, double)
2484     vBinOp64<double, cv::OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, width, height);
2485 }
2486
2487 //=======================================
2488 // AbsDiff
2489 //=======================================
2490
2491 void absdiff8u( const uchar* src1, size_t step1,
2492                        const uchar* src2, size_t step2,
2493                        uchar* dst, size_t step, int width, int height, void* )
2494 {
2495     CALL_HAL(absdiff8u, cv_hal_absdiff8u, src1, step1, src2, step2, dst, step, width, height)
2496     CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R)
2497     (vBinOp<uchar, cv::OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
2498 }
2499
2500 void absdiff8s( const schar* src1, size_t step1,
2501                        const schar* src2, size_t step2,
2502                        schar* dst, size_t step, int width, int height, void* )
2503 {
2504     CALL_HAL(absdiff8s, cv_hal_absdiff8s, src1, step1, src2, step2, dst, step, width, height)
2505     vBinOp<schar, cv::OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, width, height);
2506 }
2507
2508 void absdiff16u( const ushort* src1, size_t step1,
2509                         const ushort* src2, size_t step2,
2510                         ushort* dst, size_t step, int width, int height, void* )
2511 {
2512     CALL_HAL(absdiff16u, cv_hal_absdiff16u, src1, step1, src2, step2, dst, step, width, height)
2513     CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R)
2514     (vBinOp<ushort, cv::OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
2515 }
2516
2517 void absdiff16s( const short* src1, size_t step1,
2518                         const short* src2, size_t step2,
2519                         short* dst, size_t step, int width, int height, void* )
2520 {
2521     CALL_HAL(absdiff16s, cv_hal_absdiff16s, src1, step1, src2, step2, dst, step, width, height)
2522     vBinOp<short, cv::OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, width, height);
2523 }
2524
2525 void absdiff32s( const int* src1, size_t step1,
2526                         const int* src2, size_t step2,
2527                         int* dst, size_t step, int width, int height, void* )
2528 {
2529     CALL_HAL(absdiff32s, cv_hal_absdiff32s, src1, step1, src2, step2, dst, step, width, height)
2530     vBinOp32<int, cv::OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, width, height);
2531 }
2532
2533 void absdiff32f( const float* src1, size_t step1,
2534                         const float* src2, size_t step2,
2535                         float* dst, size_t step, int width, int height, void* )
2536 {
2537     CALL_HAL(absdiff32f, cv_hal_absdiff32f, src1, step1, src2, step2, dst, step, width, height)
2538     CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R)
2539     (vBinOp32<float, cv::OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, width, height));
2540 }
2541
2542 void absdiff64f( const double* src1, size_t step1,
2543                         const double* src2, size_t step2,
2544                         double* dst, size_t step, int width, int height, void* )
2545 {
2546     CALL_HAL(absdiff64f, cv_hal_absdiff64f, src1, step1, src2, step2, dst, step, width, height)
2547     vBinOp64<double, cv::OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, width, height);
2548 }
2549
2550 //=======================================
2551 // Logical
2552 //=======================================
2553
#if (ARITHM_USE_IPP == 1)
/* IPP fast path for a single-source (unary) image operation.
 * Expects src1, src2, dst, step1, step2, step, width and height to be in
 * scope where the macro is expanded; src2 is only referenced to mark it as
 * used. `fun` is called once for the whole width x height region. A
 * non-negative status makes the enclosing function return; otherwise
 * setIppErrorStatus() is called and execution continues after the macro. */
#define CALL_IPP_UN(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        (void)src2; \
        if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, dst, (int)step, ippiSize(width, height)) < 0) \
        { \
            setIppErrorStatus(); \
        } \
        else \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
    }
#else
/* Without IPP the macro expands to nothing. */
#define CALL_IPP_UN(fun)
#endif
2569
2570 void and8u( const uchar* src1, size_t step1,
2571                    const uchar* src2, size_t step2,
2572                    uchar* dst, size_t step, int width, int height, void* )
2573 {
2574     CALL_HAL(and8u, cv_hal_and8u, src1, step1, src2, step2, dst, step, width, height)
2575     CALL_IPP_BIN_12(ippiAnd_8u_C1R)
2576     (vBinOp<uchar, cv::OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
2577 }
2578
2579 void or8u( const uchar* src1, size_t step1,
2580                   const uchar* src2, size_t step2,
2581                   uchar* dst, size_t step, int width, int height, void* )
2582 {
2583     CALL_HAL(or8u, cv_hal_or8u, src1, step1, src2, step2, dst, step, width, height)
2584     CALL_IPP_BIN_12(ippiOr_8u_C1R)
2585     (vBinOp<uchar, cv::OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
2586 }
2587
2588 void xor8u( const uchar* src1, size_t step1,
2589                    const uchar* src2, size_t step2,
2590                    uchar* dst, size_t step, int width, int height, void* )
2591 {
2592     CALL_HAL(xor8u, cv_hal_xor8u, src1, step1, src2, step2, dst, step, width, height)
2593     CALL_IPP_BIN_12(ippiXor_8u_C1R)
2594     (vBinOp<uchar, cv::OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
2595 }
2596
2597 void not8u( const uchar* src1, size_t step1,
2598                    const uchar* src2, size_t step2,
2599                    uchar* dst, size_t step, int width, int height, void* )
2600 {
2601     CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height)
2602     CALL_IPP_UN(ippiNot_8u_C1R)
2603     (vBinOp<uchar, cv::OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
2604 }
2605
2606 //=======================================
2607
#if ARITHM_USE_IPP
/* Translates an OpenCV comparison code into the corresponding IppCmpOp.
 * Codes that are not listed below (CMP_NE, for instance) yield (IppCmpOp)-1.
 * That value is outside the enumerators, so callers should first check the
 * code with has_ipp_cmp() instead of inspecting the returned enum. */
inline static IppCmpOp convert_cmp(int _cmpop)
{
    return _cmpop == CMP_EQ ? ippCmpEq :
        _cmpop == CMP_GT ? ippCmpGreater :
        _cmpop == CMP_GE ? ippCmpGreaterEq :
        _cmpop == CMP_LT ? ippCmpLess :
        _cmpop == CMP_LE ? ippCmpLessEq :
        (IppCmpOp)-1;
}

/* True when convert_cmp() maps the comparison code to a real IppCmpOp. */
inline static bool has_ipp_cmp(int _cmpop)
{
    return _cmpop == CMP_EQ || _cmpop == CMP_GT || _cmpop == CMP_GE ||
           _cmpop == CMP_LT || _cmpop == CMP_LE;
}

/* IPP fast path for the compare kernels.
 * Expects src1, src2, dst, step1, step2, step, width, height and _cmpop
 * (pointer to the int comparison code) in scope at the expansion site.
 * The integer code is validated before it is converted: testing the enum
 * with `op >= 0` is always true when the compiler picks an unsigned
 * underlying type for IppCmpOp, which would send unsupported codes to IPP.
 * A non-negative IPP status makes the enclosing function return; otherwise
 * setIppErrorStatus() is called and execution continues after the macro. */
#define CALL_IPP_CMP(fun) \
    CV_IPP_CHECK() \
    { \
        const int ippCmpCode = *(int *)_cmpop; \
        if( has_ipp_cmp(ippCmpCode) ) \
        { \
            IppCmpOp op = convert_cmp(ippCmpCode); \
            fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
            if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return; \
            } \
            setIppErrorStatus(); \
        } \
    }
#else
/* Without IPP the macro expands to nothing. */
#define CALL_IPP_CMP(fun)
#endif
2636
2637 //=======================================
2638 // Compare
2639 //=======================================
2640
2641 void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2642                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2643 {
2644     CALL_HAL(cmp8u, cv_hal_cmp8u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2645     CALL_IPP_CMP(ippiCompare_8u_C1R)
2646   //vz optimized  cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2647     int code = *(int*)_cmpop;
2648     step1 /= sizeof(src1[0]);
2649     step2 /= sizeof(src2[0]);
2650     if( code == CMP_GE || code == CMP_LT )
2651     {
2652         std::swap(src1, src2);
2653         std::swap(step1, step2);
2654         code = code == CMP_GE ? CMP_LE : CMP_GT;
2655     }
2656
2657     if( code == CMP_GT || code == CMP_LE )
2658     {
2659         int m = code == CMP_GT ? 0 : 255;
2660         for( ; height--; src1 += step1, src2 += step2, dst += step )
2661         {
2662             int x =0;
2663 #if CV_SIMD128
2664             if( hasSIMD128() )
2665             {
2666                 v_uint8x16 mask = v_setall_u8((uchar)m);
2667
2668                 for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes )
2669                 {
2670                     v_store(dst + x, (v_load(src1 + x) > v_load(src2 + x)) ^ mask);
2671                 }
2672             }
2673 #endif
2674
2675             for( ; x < width; x++ ){
2676                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2677             }
2678         }
2679     }
2680     else if( code == CMP_EQ || code == CMP_NE )
2681     {
2682         int m = code == CMP_EQ ? 0 : 255;
2683         for( ; height--; src1 += step1, src2 += step2, dst += step )
2684         {
2685             int x = 0;
2686 #if CV_SIMD128
2687             if( hasSIMD128() )
2688             {
2689                 v_uint8x16 mask = v_setall_u8((uchar)m);
2690
2691                 for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes )
2692                 {
2693                     v_store(dst+x, (v_load(src1+x) == v_load(src2+x)) ^ mask);
2694                 }
2695             }
2696 #endif
2697            for( ; x < width; x++ )
2698                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
2699         }
2700     }
2701 }
2702
2703 void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
2704                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2705 {
2706     CALL_HAL(cmp8s, cv_hal_cmp8s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2707     cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2708 }
2709
2710 void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2711                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2712 {
2713     CALL_HAL(cmp16u, cv_hal_cmp16u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2714     CALL_IPP_CMP(ippiCompare_16u_C1R)
2715     cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2716 }
2717
2718 void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
2719                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2720 {
2721     CALL_HAL(cmp16s, cv_hal_cmp16s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2722     CALL_IPP_CMP(ippiCompare_16s_C1R)
2723    //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2724
2725     int code = *(int*)_cmpop;
2726     step1 /= sizeof(src1[0]);
2727     step2 /= sizeof(src2[0]);
2728     if( code == CMP_GE || code == CMP_LT )
2729     {
2730         std::swap(src1, src2);
2731         std::swap(step1, step2);
2732         code = code == CMP_GE ? CMP_LE : CMP_GT;
2733     }
2734
2735     if( code == CMP_GT || code == CMP_LE )
2736     {
2737         int m = code == CMP_GT ? 0 : 255;
2738         for( ; height--; src1 += step1, src2 += step2, dst += step )
2739         {
2740             int x =0;
2741 #if CV_SIMD128
2742             if( hasSIMD128() )
2743             {
2744                 v_uint8x16 mask = v_setall_u8((uchar)m);
2745                 const int dWidth = v_uint8x16::nlanes;
2746
2747                 for( ; x <= width - dWidth; x += dWidth )
2748                 {
2749                     v_int16x8 in1 = v_load(src1 + x);
2750                     v_int16x8 in2 = v_load(src2 + x);
2751                     v_uint16x8 t1 = v_reinterpret_as_u16(in1 > in2);
2752
2753                     in1 = v_load(src1 + x + v_uint16x8::nlanes);
2754                     in2 = v_load(src2 + x + v_uint16x8::nlanes);
2755                     v_uint16x8 t2 = v_reinterpret_as_u16(in1 > in2);
2756
2757                     v_store(dst+x, (v_pack(t1, t2)) ^ mask);
2758                 }
2759             }
2760 #endif
2761             for( ; x < width; x++ ){
2762                  dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2763             }
2764         }
2765     }
2766     else if( code == CMP_EQ || code == CMP_NE )
2767     {
2768         int m = code == CMP_EQ ? 0 : 255;
2769         for( ; height--; src1 += step1, src2 += step2, dst += step )
2770         {
2771             int x = 0;
2772 #if CV_SIMD128
2773             if( hasSIMD128() )
2774             {
2775                 v_uint8x16 mask = v_setall_u8((uchar)m);
2776                 const int dWidth = v_uint8x16::nlanes;
2777
2778                 for( ; x <= width - dWidth; x += dWidth )
2779                 {
2780                     v_int16x8 in1 = v_load(src1 + x);
2781                     v_int16x8 in2 = v_load(src2 + x);
2782                     v_uint16x8 t1 = v_reinterpret_as_u16(in1 == in2);
2783
2784                     in1 = v_load(src1 + x + 8);
2785                     in2 = v_load(src2 + x + 8);
2786                     v_uint16x8 t2 = v_reinterpret_as_u16(in1 == in2);
2787
2788                     v_store(dst+x, (v_pack(t1, t2)^ mask));
2789                 }
2790             }
2791 #endif
2792             for( ; x < width; x++ )
2793                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
2794         }
2795     }
2796 }
2797
2798 void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
2799                    uchar* dst, size_t step, int width, int height, void* _cmpop)
2800 {
2801     CALL_HAL(cmp32s, cv_hal_cmp32s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2802     cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2803 }
2804
2805 void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
2806                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2807 {
2808     CALL_HAL(cmp32f, cv_hal_cmp32f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2809     CALL_IPP_CMP(ippiCompare_32f_C1R)
2810     cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2811 }
2812
2813 void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
2814                   uchar* dst, size_t step, int width, int height, void* _cmpop)
2815 {
2816     CALL_HAL(cmp64f, cv_hal_cmp64f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
2817     cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
2818 }
2819
2820 //=======================================
2821
#if defined HAVE_IPP
/* IPP fast path for the multiply kernels whose IPP function takes a trailing
 * scale-factor argument (0 is passed).
 * Expects src1, src2, dst, step1, step2, step, width, height and a float
 * `fscale` in scope at the expansion site. IPP is only tried when fscale is
 * within FLT_EPSILON of 1. A non-negative status makes the enclosing function
 * return; otherwise setIppErrorStatus() is called and execution continues
 * after the macro. */
#define CALL_IPP_MUL(fun) \
    CV_IPP_CHECK() \
    { \
        if (std::fabs(fscale - 1) <= FLT_EPSILON) \
        { \
            if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) < 0) \
            { \
                setIppErrorStatus(); \
            } \
            else \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return; \
            } \
        } \
    }

/* Same as CALL_IPP_MUL, for IPP functions without the trailing scale-factor
 * argument. */
#define CALL_IPP_MUL_2(fun) \
    CV_IPP_CHECK() \
    { \
        if (std::fabs(fscale - 1) <= FLT_EPSILON) \
        { \
            if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)) < 0) \
            { \
                setIppErrorStatus(); \
            } \
            else \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return; \
            } \
        } \
    }

#else
/* Without IPP both macros expand to nothing. */
#define CALL_IPP_MUL(fun)
#define CALL_IPP_MUL_2(fun)
#endif
2855
2856 //=======================================
2857 // Multiply
2858 //=======================================
2859
2860 void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2861                    uchar* dst, size_t step, int width, int height, void* scale)
2862 {
2863     CALL_HAL(mul8u, cv_hal_mul8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2864     float fscale = (float)*(const double*)scale;
2865     CALL_IPP_MUL(ippiMul_8u_C1RSfs)
2866     mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
2867 }
2868
2869 void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
2870                    schar* dst, size_t step, int width, int height, void* scale)
2871 {
2872     CALL_HAL(mul8s, cv_hal_mul8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2873     mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale);
2874 }
2875
2876 void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2877                     ushort* dst, size_t step, int width, int height, void* scale)
2878 {
2879     CALL_HAL(mul16u, cv_hal_mul16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2880     float fscale = (float)*(const double*)scale;
2881     CALL_IPP_MUL(ippiMul_16u_C1RSfs)
2882     mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
2883 }
2884
2885 void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
2886                     short* dst, size_t step, int width, int height, void* scale)
2887 {
2888     CALL_HAL(mul16s, cv_hal_mul16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2889     float fscale = (float)*(const double*)scale;
2890     CALL_IPP_MUL(ippiMul_16s_C1RSfs)
2891     mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
2892 }
2893
2894 void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
2895                     int* dst, size_t step, int width, int height, void* scale)
2896 {
2897     CALL_HAL(mul32s, cv_hal_mul32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2898     mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2899 }
2900
2901 void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
2902                     float* dst, size_t step, int width, int height, void* scale)
2903 {
2904     CALL_HAL(mul32f, cv_hal_mul32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2905     float fscale = (float)*(const double*)scale;
2906     CALL_IPP_MUL_2(ippiMul_32f_C1R)
2907     mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
2908 }
2909
2910 void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
2911                     double* dst, size_t step, int width, int height, void* scale)
2912 {
2913     CALL_HAL(mul64f, cv_hal_mul64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2914     mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2915 }
2916
2917 //=======================================
2918 // Divide
2919 //=======================================
2920
2921 void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
2922                    uchar* dst, size_t step, int width, int height, void* scale)
2923 {
2924     CALL_HAL(div8u, cv_hal_div8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2925     if( src1 )
2926         div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2927     else
2928         recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
2929 }
2930
2931 void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
2932                   schar* dst, size_t step, int width, int height, void* scale)
2933 {
2934     CALL_HAL(div8s, cv_hal_div8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2935     div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2936 }
2937
2938 void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
2939                     ushort* dst, size_t step, int width, int height, void* scale)
2940 {
2941     CALL_HAL(div16u, cv_hal_div16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2942     div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2943 }
2944
2945 void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
2946                     short* dst, size_t step, int width, int height, void* scale)
2947 {
2948     CALL_HAL(div16s, cv_hal_div16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2949     div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2950 }
2951
2952 void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
2953                     int* dst, size_t step, int width, int height, void* scale)
2954 {
2955     CALL_HAL(div32s, cv_hal_div32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2956     div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2957 }
2958
2959 void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
2960                     float* dst, size_t step, int width, int height, void* scale)
2961 {
2962     CALL_HAL(div32f, cv_hal_div32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2963     div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2964 }
2965
2966 void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
2967                     double* dst, size_t step, int width, int height, void* scale)
2968 {
2969     CALL_HAL(div64f, cv_hal_div64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
2970     div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
2971 }
2972
2973 //=======================================
2974 // Reciprocal
2975 //=======================================
2976
2977 void recip8u( const uchar*, size_t, const uchar* src2, size_t step2,
2978                   uchar* dst, size_t step, int width, int height, void* scale)
2979 {
2980     CALL_HAL(recip8u, cv_hal_recip8u, src2, step2, dst, step, width, height, *(const double*)scale)
2981     recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
2982 }
2983
2984 void recip8s( const schar*, size_t, const schar* src2, size_t step2,
2985                   schar* dst, size_t step, int width, int height, void* scale)
2986 {
2987     CALL_HAL(recip8s, cv_hal_recip8s, src2, step2, dst, step, width, height, *(const double*)scale)
2988     recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
2989 }
2990
2991 void recip16u( const ushort*, size_t, const ushort* src2, size_t step2,
2992                    ushort* dst, size_t step, int width, int height, void* scale)
2993 {
2994     CALL_HAL(recip16u, cv_hal_recip16u, src2, step2, dst, step, width, height, *(const double*)scale)
2995     recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
2996 }
2997
2998 void recip16s( const short*, size_t, const short* src2, size_t step2,
2999                    short* dst, size_t step, int width, int height, void* scale)
3000 {
3001     CALL_HAL(recip16s, cv_hal_recip16s, src2, step2, dst, step, width, height, *(const double*)scale)
3002     recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
3003 }
3004
3005 void recip32s( const int*, size_t, const int* src2, size_t step2,
3006                    int* dst, size_t step, int width, int height, void* scale)
3007 {
3008     CALL_HAL(recip32s, cv_hal_recip32s, src2, step2, dst, step, width, height, *(const double*)scale)
3009     recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
3010 }
3011
3012 void recip32f( const float*, size_t, const float* src2, size_t step2,
3013                    float* dst, size_t step, int width, int height, void* scale)
3014 {
3015     CALL_HAL(recip32f, cv_hal_recip32f, src2, step2, dst, step, width, height, *(const double*)scale)
3016     recip_f(src2, step2, dst, step, width, height, *(const double*)scale);
3017 }
3018
3019 void recip64f( const double*, size_t, const double* src2, size_t step2,
3020                    double* dst, size_t step, int width, int height, void* scale)
3021 {
3022     CALL_HAL(recip64f, cv_hal_recip64f, src2, step2, dst, step, width, height, *(const double*)scale)
3023     recip_f(src2, step2, dst, step, width, height, *(const double*)scale);
3024 }
3025
3026 //=======================================
3027 // Add weighted
3028 //=======================================
3029
3030 void
3031 addWeighted8u( const uchar* src1, size_t step1,
3032                const uchar* src2, size_t step2,
3033                uchar* dst, size_t step, int width, int height,
3034                void* scalars )
3035 {
3036     CALL_HAL(addWeighted8u, cv_hal_addWeighted8u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3037     const double* scalars_ = (const double*)scalars;
3038     float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2];
3039
3040     for( ; height--; src1 += step1, src2 += step2, dst += step )
3041     {
3042         int x = 0;
3043
3044 #if CV_SIMD128
3045         if( hasSIMD128() )
3046         {
3047             v_float32x4 g = v_setall_f32(gamma);
3048             v_float32x4 a = v_setall_f32(alpha);
3049             v_float32x4 b = v_setall_f32(beta);
3050
3051             for( ; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes )
3052             {
3053                 v_uint16x8 in1_16 = v_load_expand(src1 + x);
3054                 v_int32x4 in1_32_l, in1_32_h;
3055                 v_expand(v_reinterpret_as_s16(in1_16), in1_32_l, in1_32_h);
3056                 v_float32x4 in1_f_l = v_cvt_f32(in1_32_l);
3057                 v_float32x4 in1_f_h = v_cvt_f32(in1_32_h);
3058
3059                 v_uint16x8 in2_16 = v_load_expand(src2 + x);
3060                 v_int32x4 in2_32_l, in2_32_h;
3061                 v_expand(v_reinterpret_as_s16(in2_16), in2_32_l, in2_32_h);
3062                 v_float32x4 in2_f_l = v_cvt_f32(in2_32_l);
3063                 v_float32x4 in2_f_h = v_cvt_f32(in2_32_h);
3064
3065                 v_int32x4 out_l = v_round(in1_f_l * a + in2_f_l * b + g);
3066                 v_int32x4 out_h = v_round(in1_f_h * a + in2_f_h * b + g);
3067
3068                 v_int16x8 out_16 = v_pack(out_l, out_h);
3069                 v_pack_u_store(dst + x, out_16);
3070             }
3071         }
3072 #endif
3073         #if CV_ENABLE_UNROLLED
3074         for( ; x <= width - 4; x += 4 )
3075         {
3076             float t0, t1;
3077             t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
3078             t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
3079
3080             dst[x] = saturate_cast<uchar>(t0);
3081             dst[x+1] = saturate_cast<uchar>(t1);
3082
3083             t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
3084             t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
3085
3086             dst[x+2] = saturate_cast<uchar>(t0);
3087             dst[x+3] = saturate_cast<uchar>(t1);
3088         }
3089         #endif
3090
3091         for( ; x < width; x++ )
3092         {
3093             float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
3094             dst[x] = saturate_cast<uchar>(t0);
3095         }
3096     }
3097 }
3098
3099 void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3100                            schar* dst, size_t step, int width, int height, void* scalars )
3101 {
3102     CALL_HAL(addWeighted8s, cv_hal_addWeighted8s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3103     addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
3104 }
3105
3106 void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3107                             ushort* dst, size_t step, int width, int height, void* scalars )
3108 {
3109     CALL_HAL(addWeighted16u, cv_hal_addWeighted16u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3110     addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
3111 }
3112
3113 void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
3114                             short* dst, size_t step, int width, int height, void* scalars )
3115 {
3116     CALL_HAL(addWeighted16s, cv_hal_addWeighted16s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3117     addWeighted_<short, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
3118 }
3119
3120 void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
3121                             int* dst, size_t step, int width, int height, void* scalars )
3122 {
3123     CALL_HAL(addWeighted32s, cv_hal_addWeighted32s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3124     addWeighted_<int, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
3125 }
3126
3127 void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
3128                             float* dst, size_t step, int width, int height, void* scalars )
3129 {
3130     CALL_HAL(addWeighted32f, cv_hal_addWeighted32f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3131     addWeighted_<float, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
3132 }
3133
3134 void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
3135                             double* dst, size_t step, int width, int height, void* scalars )
3136 {
3137     CALL_HAL(addWeighted64f, cv_hal_addWeighted64f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
3138     addWeighted_<double, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
3139 }
3140
3141 }} // cv::hal::
3142
3143 /* End of file. */