modules/core/src/convert.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "precomp.hpp"
  44 #include "opencl_kernels_core.hpp"
  45
  46 #ifdef __APPLE__
  47 #undef CV_NEON
  48 #define CV_NEON 0
  49 #endif
  50
  51 namespace cv
  52 {
  53
  54 /****************************************************************************************\
  55 *                                       split & merge                                    *
  56 \****************************************************************************************/
  57
  58 #if CV_NEON
  59 template<typename T> struct VSplit2;
  60 template<typename T> struct VSplit3;
  61 template<typename T> struct VSplit4;
  62
  63 #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
  64     template<>                                                                    \
  65     struct name<data_type>{                                                       \
  66         void operator()(const data_type* src, data_type* dst0, data_type* dst1){  \
  67             reg_type r = load_func(src);                                          \
  68             store_func(dst0, r.val[0]);                                           \
  69             store_func(dst1, r.val[1]);                                           \
  70         }                                                                         \
  71     }
  72
  73 #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
  74     template<>                                                                    \
  75     struct name<data_type>{                                                       \
  76         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
  77                         data_type* dst2){                                         \
  78             reg_type r = load_func(src);                                          \
  79             store_func(dst0, r.val[0]);                                           \
  80             store_func(dst1, r.val[1]);                                           \
  81             store_func(dst2, r.val[2]);                                           \
  82         }                                                                         \
  83     }
  84
  85 #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
  86     template<>                                                                    \
  87     struct name<data_type>{                                                       \
  88         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
  89                         data_type* dst2, data_type* dst3){                        \
  90             reg_type r = load_func(src);                                          \
  91             store_func(dst0, r.val[0]);                                           \
  92             store_func(dst1, r.val[1]);                                           \
  93             store_func(dst2, r.val[2]);                                           \
  94             store_func(dst3, r.val[3]);                                           \
  95         }                                                                         \
  96     }
  97
  98 SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
  99 SPLIT2_KERNEL_TEMPLATE(VSplit2, schar ,   int8x16x2_t, vld2q_s8 , vst1q_s8 );
 100 SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
 101 SPLIT2_KERNEL_TEMPLATE(VSplit2, short ,   int16x8x2_t, vld2q_s16, vst1q_s16);
 102 SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
 103 SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
 104 SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );
 105
 106 SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
 107 SPLIT3_KERNEL_TEMPLATE(VSplit3, schar ,   int8x16x3_t, vld3q_s8 , vst1q_s8 );
 108 SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
 109 SPLIT3_KERNEL_TEMPLATE(VSplit3, short ,   int16x8x3_t, vld3q_s16, vst1q_s16);
 110 SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
 111 SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
 112 SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );
 113
 114 SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
 115 SPLIT4_KERNEL_TEMPLATE(VSplit4, schar ,   int8x16x4_t, vld4q_s8 , vst1q_s8 );
 116 SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
 117 SPLIT4_KERNEL_TEMPLATE(VSplit4, short ,   int16x8x4_t, vld4q_s16, vst1q_s16);
 118 SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
 119 SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
 120 SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
 121 #endif
 122
 123 template<typename T> static void
 124 split_( const T* src, T** dst, int len, int cn )
 125 {
 126     int k = cn % 4 ? cn % 4 : 4;
 127     int i, j;
 128     if( k == 1 )
 129     {
 130         T* dst0 = dst[0];
 131
 132         if(cn == 1)
 133         {
 134             memcpy(dst0, src, len * sizeof(T));
 135         }
 136         else
 137         {
 138             for( i = 0, j = 0 ; i < len; i++, j += cn )
 139                 dst0[i] = src[j];
 140         }
 141     }
 142     else if( k == 2 )
 143     {
 144         T *dst0 = dst[0], *dst1 = dst[1];
 145         i = j = 0;
 146
 147 #if CV_NEON
 148         if(cn == 2)
 149         {
 150             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 151             int inc_j = 2 * inc_i;
 152
 153             VSplit2<T> vsplit;
 154             for( ; i < len - inc_i; i += inc_i, j += inc_j)
 155                 vsplit(src + j, dst0 + i, dst1 + i);
 156         }
 157 #endif
 158         for( ; i < len; i++, j += cn )
 159         {
 160             dst0[i] = src[j];
 161             dst1[i] = src[j+1];
 162         }
 163     }
 164     else if( k == 3 )
 165     {
 166         T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
 167         i = j = 0;
 168
 169 #if CV_NEON
 170         if(cn == 3)
 171         {
 172             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 173             int inc_j = 3 * inc_i;
 174
 175             VSplit3<T> vsplit;
 176             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
 177                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
 178         }
 179 #endif
 180         for( ; i < len; i++, j += cn )
 181         {
 182             dst0[i] = src[j];
 183             dst1[i] = src[j+1];
 184             dst2[i] = src[j+2];
 185         }
 186     }
 187     else
 188     {
 189         T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
 190         i = j = 0;
 191
 192 #if CV_NEON
 193         if(cn == 4)
 194         {
 195             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 196             int inc_j = 4 * inc_i;
 197
 198             VSplit4<T> vsplit;
 199             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
 200                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
 201         }
 202 #endif
 203         for( ; i < len; i++, j += cn )
 204         {
 205             dst0[i] = src[j]; dst1[i] = src[j+1];
 206             dst2[i] = src[j+2]; dst3[i] = src[j+3];
 207         }
 208     }
 209
 210     for( ; k < cn; k += 4 )
 211     {
 212         T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
 213         for( i = 0, j = k; i < len; i++, j += cn )
 214         {
 215             dst0[i] = src[j]; dst1[i] = src[j+1];
 216             dst2[i] = src[j+2]; dst3[i] = src[j+3];
 217         }
 218     }
 219 }
 220
 221
 222 #if CV_NEON
 223 template<typename T> struct VMerge2;
 224 template<typename T> struct VMerge3;
 225 template<typename T> struct VMerge4;
 226
 227 #define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
 228     template<>                                                                    \
 229     struct name<data_type>{                                                       \
 230         void operator()(const data_type* src0, const data_type* src1,             \
 231                         data_type* dst){                                          \
 232             reg_type r;                                                           \
 233             r.val[0] = load_func(src0);                                           \
 234             r.val[1] = load_func(src1);                                           \
 235             store_func(dst, r);                                                   \
 236         }                                                                         \
 237     }
 238
 239 #define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
 240     template<>                                                                    \
 241     struct name<data_type>{                                                       \
 242         void operator()(const data_type* src0, const data_type* src1,             \
 243                         const data_type* src2, data_type* dst){                   \
 244             reg_type r;                                                           \
 245             r.val[0] = load_func(src0);                                           \
 246             r.val[1] = load_func(src1);                                           \
 247             r.val[2] = load_func(src2);                                           \
 248             store_func(dst, r);                                                   \
 249         }                                                                         \
 250     }
 251
 252 #define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
 253     template<>                                                                    \
 254     struct name<data_type>{                                                       \
 255         void operator()(const data_type* src0, const data_type* src1,             \
 256                         const data_type* src2, const data_type* src3,             \
 257                         data_type* dst){                                          \
 258             reg_type r;                                                           \
 259             r.val[0] = load_func(src0);                                           \
 260             r.val[1] = load_func(src1);                                           \
 261             r.val[2] = load_func(src2);                                           \
 262             r.val[3] = load_func(src3);                                           \
 263             store_func(dst, r);                                                   \
 264         }                                                                         \
 265     }
 266
 267 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
 268 MERGE2_KERNEL_TEMPLATE(VMerge2, schar ,   int8x16x2_t, vld1q_s8 , vst2q_s8 );
 269 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
 270 MERGE2_KERNEL_TEMPLATE(VMerge2, short ,   int16x8x2_t, vld1q_s16, vst2q_s16);
 271 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
 272 MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
 273 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );
 274
 275 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
 276 MERGE3_KERNEL_TEMPLATE(VMerge3, schar ,   int8x16x3_t, vld1q_s8 , vst3q_s8 );
 277 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
 278 MERGE3_KERNEL_TEMPLATE(VMerge3, short ,   int16x8x3_t, vld1q_s16, vst3q_s16);
 279 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
 280 MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
 281 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );
 282
 283 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
 284 MERGE4_KERNEL_TEMPLATE(VMerge4, schar ,   int8x16x4_t, vld1q_s8 , vst4q_s8 );
 285 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
 286 MERGE4_KERNEL_TEMPLATE(VMerge4, short ,   int16x8x4_t, vld1q_s16, vst4q_s16);
 287 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
 288 MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
 289 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
 290 #endif
 291
 292 template<typename T> static void
 293 merge_( const T** src, T* dst, int len, int cn )
 294 {
 295     int k = cn % 4 ? cn % 4 : 4;
 296     int i, j;
 297     if( k == 1 )
 298     {
 299         const T* src0 = src[0];
 300         for( i = j = 0; i < len; i++, j += cn )
 301             dst[j] = src0[i];
 302     }
 303     else if( k == 2 )
 304     {
 305         const T *src0 = src[0], *src1 = src[1];
 306         i = j = 0;
 307 #if CV_NEON
 308         if(cn == 2)
 309         {
 310             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 311             int inc_j = 2 * inc_i;
 312
 313             VMerge2<T> vmerge;
 314             for( ; i < len - inc_i; i += inc_i, j += inc_j)
 315                 vmerge(src0 + i, src1 + i, dst + j);
 316         }
 317 #endif
 318         for( ; i < len; i++, j += cn )
 319         {
 320             dst[j] = src0[i];
 321             dst[j+1] = src1[i];
 322         }
 323     }
 324     else if( k == 3 )
 325     {
 326         const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
 327         i = j = 0;
 328 #if CV_NEON
 329         if(cn == 3)
 330         {
 331             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 332             int inc_j = 3 * inc_i;
 333
 334             VMerge3<T> vmerge;
 335             for( ; i < len - inc_i; i += inc_i, j += inc_j)
 336                 vmerge(src0 + i, src1 + i, src2 + i, dst + j);
 337         }
 338 #endif
 339         for( ; i < len; i++, j += cn )
 340         {
 341             dst[j] = src0[i];
 342             dst[j+1] = src1[i];
 343             dst[j+2] = src2[i];
 344         }
 345     }
 346     else
 347     {
 348         const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
 349         i = j = 0;
 350 #if CV_NEON
 351         if(cn == 4)
 352         {
 353             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
 354             int inc_j = 4 * inc_i;
 355
 356             VMerge4<T> vmerge;
 357             for( ; i < len - inc_i; i += inc_i, j += inc_j)
 358                 vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
 359         }
 360 #endif
 361         for( ; i < len; i++, j += cn )
 362         {
 363             dst[j] = src0[i]; dst[j+1] = src1[i];
 364             dst[j+2] = src2[i]; dst[j+3] = src3[i];
 365         }
 366     }
 367
 368     for( ; k < cn; k += 4 )
 369     {
 370         const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
 371         for( i = 0, j = k; i < len; i++, j += cn )
 372         {
 373             dst[j] = src0[i]; dst[j+1] = src1[i];
 374             dst[j+2] = src2[i]; dst[j+3] = src3[i];
 375         }
 376     }
 377 }
 378
 379 static void split8u(const uchar* src, uchar** dst, int len, int cn )
 380 {
 381     split_(src, dst, len, cn);
 382 }
 383
 384 static void split16u(const ushort* src, ushort** dst, int len, int cn )
 385 {
 386     split_(src, dst, len, cn);
 387 }
 388
 389 static void split32s(const int* src, int** dst, int len, int cn )
 390 {
 391     split_(src, dst, len, cn);
 392 }
 393
 394 static void split64s(const int64* src, int64** dst, int len, int cn )
 395 {
 396     split_(src, dst, len, cn);
 397 }
 398
 399 static void merge8u(const uchar** src, uchar* dst, int len, int cn )
 400 {
 401     merge_(src, dst, len, cn);
 402 }
 403
 404 static void merge16u(const ushort** src, ushort* dst, int len, int cn )
 405 {
 406     merge_(src, dst, len, cn);
 407 }
 408
 409 static void merge32s(const int** src, int* dst, int len, int cn )
 410 {
 411     merge_(src, dst, len, cn);
 412 }
 413
 414 static void merge64s(const int64** src, int64* dst, int len, int cn )
 415 {
 416     merge_(src, dst, len, cn);
 417 }
 418
 419 typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
 420 typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
 421
 422 static SplitFunc getSplitFunc(int depth)
 423 {
 424     static SplitFunc splitTab[] =
 425     {
 426         (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u),
 427         (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0
 428     };
 429
 430     return splitTab[depth];
 431 }
 432
 433 static MergeFunc getMergeFunc(int depth)
 434 {
 435     static MergeFunc mergeTab[] =
 436     {
 437         (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u),
 438         (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0
 439     };
 440
 441     return mergeTab[depth];
 442 }
 443
 444 }
 445
 446 void cv::split(const Mat& src, Mat* mv)
 447 {
 448     int k, depth = src.depth(), cn = src.channels();
 449     if( cn == 1 )
 450     {
 451         src.copyTo(mv[0]);
 452         return;
 453     }
 454
 455     SplitFunc func = getSplitFunc(depth);
 456     CV_Assert( func != 0 );
 457
 458     int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
 459     int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
 460     AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
 461     const Mat** arrays = (const Mat**)(uchar*)_buf;
 462     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
 463
 464     arrays[0] = &src;
 465     for( k = 0; k < cn; k++ )
 466     {
 467         mv[k].create(src.dims, src.size, depth);
 468         arrays[k+1] = &mv[k];
 469     }
 470
 471     NAryMatIterator it(arrays, ptrs, cn+1);
 472     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
 473
 474     for( size_t i = 0; i < it.nplanes; i++, ++it )
 475     {
 476         for( int j = 0; j < total; j += blocksize )
 477         {
 478             int bsz = std::min(total - j, blocksize);
 479             func( ptrs[0], &ptrs[1], bsz, cn );
 480
 481             if( j + blocksize < total )
 482             {
 483                 ptrs[0] += bsz*esz;
 484                 for( k = 0; k < cn; k++ )
 485                     ptrs[k+1] += bsz*esz1;
 486             }
 487         }
 488     }
 489 }
 490
 491 #ifdef HAVE_OPENCL
 492
 493 namespace cv {
 494
 495 static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
 496 {
 497     int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
 498             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
 499
 500     String dstargs, processelem, indexdecl;
 501     for (int i = 0; i < cn; ++i)
 502     {
 503         dstargs += format("DECLARE_DST_PARAM(%d)", i);
 504         indexdecl += format("DECLARE_INDEX(%d)", i);
 505         processelem += format("PROCESS_ELEM(%d)", i);
 506     }
 507
 508     ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
 509                   format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
 510                          " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
 511                          ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
 512                          processelem.c_str(), indexdecl.c_str()));
 513     if (k.empty())
 514         return false;
 515
 516     Size size = _m.size();
 517     _mv.create(cn, 1, depth);
 518     for (int i = 0; i < cn; ++i)
 519         _mv.create(size, depth, i);
 520
 521     std::vector<UMat> dst;
 522     _mv.getUMatVector(dst);
 523
 524     int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
 525     for (int i = 0; i < cn; ++i)
 526         argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
 527     k.set(argidx, rowsPerWI);
 528
 529     size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
 530     return k.run(2, globalsize, NULL, false);
 531 }
 532
 533 }
 534
 535 #endif
 536
 537 void cv::split(InputArray _m, OutputArrayOfArrays _mv)
 538 {
 539     CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
 540                ocl_split(_m, _mv))
 541
 542     Mat m = _m.getMat();
 543     if( m.empty() )
 544     {
 545         _mv.release();
 546         return;
 547     }
 548
 549     CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
 550
 551     Size size = m.size();
 552     int depth = m.depth(), cn = m.channels();
 553     _mv.create(cn, 1, depth);
 554     for (int i = 0; i < cn; ++i)
 555         _mv.create(size, depth, i);
 556
 557     std::vector<Mat> dst;
 558     _mv.getMatVector(dst);
 559
 560     split(m, &dst[0]);
 561 }
 562
 563 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
 564 {
 565     CV_Assert( mv && n > 0 );
 566
 567     int depth = mv[0].depth();
 568     bool allch1 = true;
 569     int k, cn = 0;
 570     size_t i;
 571
 572     for( i = 0; i < n; i++ )
 573     {
 574         CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
 575         allch1 = allch1 && mv[i].channels() == 1;
 576         cn += mv[i].channels();
 577     }
 578
 579     CV_Assert( 0 < cn && cn <= CV_CN_MAX );
 580     _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
 581     Mat dst = _dst.getMat();
 582
 583     if( n == 1 )
 584     {
 585         mv[0].copyTo(dst);
 586         return;
 587     }
 588
 589     if( !allch1 )
 590     {
 591         AutoBuffer<int> pairs(cn*2);
 592         int j, ni=0;
 593
 594         for( i = 0, j = 0; i < n; i++, j += ni )
 595         {
 596             ni = mv[i].channels();
 597             for( k = 0; k < ni; k++ )
 598             {
 599                 pairs[(j+k)*2] = j + k;
 600                 pairs[(j+k)*2+1] = j + k;
 601             }
 602         }
 603         mixChannels( mv, n, &dst, 1, &pairs[0], cn );
 604         return;
 605     }
 606
 607     size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
 608     int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
 609     AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
 610     const Mat** arrays = (const Mat**)(uchar*)_buf;
 611     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
 612
 613     arrays[0] = &dst;
 614     for( k = 0; k < cn; k++ )
 615         arrays[k+1] = &mv[k];
 616
 617     NAryMatIterator it(arrays, ptrs, cn+1);
 618     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
 619     MergeFunc func = getMergeFunc(depth);
 620
 621     for( i = 0; i < it.nplanes; i++, ++it )
 622     {
 623         for( int j = 0; j < total; j += blocksize )
 624         {
 625             int bsz = std::min(total - j, blocksize);
 626             func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );
 627
 628             if( j + blocksize < total )
 629             {
 630                 ptrs[0] += bsz*esz;
 631                 for( int t = 0; t < cn; t++ )
 632                     ptrs[t+1] += bsz*esz1;
 633             }
 634         }
 635     }
 636 }
 637
 638 #ifdef HAVE_OPENCL
 639
 640 namespace cv {
 641
 642 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 643 {
 644     std::vector<UMat> src, ksrc;
 645     _mv.getUMatVector(src);
 646     CV_Assert(!src.empty());
 647
 648     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
 649             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
 650     Size size = src[0].size();
 651
 652     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
 653     {
 654         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
 655                 esz1 = CV_ELEM_SIZE1(idepth);
 656         if (src[i].dims > 2)
 657             return false;
 658
 659         CV_Assert(size == src[i].size() && depth == idepth);
 660
 661         for (int cn = 0; cn < icn; ++cn)
 662         {
 663             UMat tsrc = src[i];
 664             tsrc.offset += cn * esz1;
 665             ksrc.push_back(tsrc);
 666         }
 667     }
 668     int dcn = (int)ksrc.size();
 669
 670     String srcargs, processelem, cndecl, indexdecl;
 671     for (int i = 0; i < dcn; ++i)
 672     {
 673         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
 674         processelem += format("PROCESS_ELEM(%d)", i);
 675         indexdecl += format("DECLARE_INDEX(%d)", i);
 676         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
 677     }
 678
 679     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
 680                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
 681                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
 682                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
 683                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
 684     if (k.empty())
 685         return false;
 686
 687     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
 688     UMat dst = _dst.getUMat();
 689
 690     int argidx = 0;
 691     for (int i = 0; i < dcn; ++i)
 692         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
 693     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
 694     k.set(argidx, rowsPerWI);
 695
 696     size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
 697     return k.run(2, globalsize, NULL, false);
 698 }
 699
 700 }
 701
 702 #endif
 703
 704 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
 705 {
 706     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
 707                ocl_merge(_mv, _dst))
 708
 709     std::vector<Mat> mv;
 710     _mv.getMatVector(mv);
 711     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
 712 }
 713
 714 /****************************************************************************************\
 715 *                       Generalized split/merge: mixing channels                         *
 716 \****************************************************************************************/
 717
 718 namespace cv
 719 {
 720
 721 template<typename T> static void
 722 mixChannels_( const T** src, const int* sdelta,
 723               T** dst, const int* ddelta,
 724               int len, int npairs )
 725 {
 726     int i, k;
 727     for( k = 0; k < npairs; k++ )
 728     {
 729         const T* s = src[k];
 730         T* d = dst[k];
 731         int ds = sdelta[k], dd = ddelta[k];
 732         if( s )
 733         {
 734             for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
 735             {
 736                 T t0 = s[0], t1 = s[ds];
 737                 d[0] = t0; d[dd] = t1;
 738             }
 739             if( i < len )
 740                 d[0] = s[0];
 741         }
 742         else
 743         {
 744             for( i = 0; i <= len - 2; i += 2, d += dd*2 )
 745                 d[0] = d[dd] = 0;
 746             if( i < len )
 747                 d[0] = 0;
 748         }
 749     }
 750 }
 751
 752
 753 static void mixChannels8u( const uchar** src, const int* sdelta,
 754                            uchar** dst, const int* ddelta,
 755                            int len, int npairs )
 756 {
 757     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
 758 }
 759
 760 static void mixChannels16u( const ushort** src, const int* sdelta,
 761                             ushort** dst, const int* ddelta,
 762                             int len, int npairs )
 763 {
 764     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
 765 }
 766
 767 static void mixChannels32s( const int** src, const int* sdelta,
 768                             int** dst, const int* ddelta,
 769                             int len, int npairs )
 770 {
 771     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
 772 }
 773
 774 static void mixChannels64s( const int64** src, const int* sdelta,
 775                             int64** dst, const int* ddelta,
 776                             int len, int npairs )
 777 {
 778     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
 779 }
 780
 781 typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
 782         uchar** dst, const int* ddelta, int len, int npairs );
 783
 784 static MixChannelsFunc getMixchFunc(int depth)
 785 {
 786     static MixChannelsFunc mixchTab[] =
 787     {
 788         (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
 789         (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
 790         (MixChannelsFunc)mixChannels64s, 0
 791     };
 792
 793     return mixchTab[depth];
 794 }
 795
 796 }
 797
 798 void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
 799 {
 800     if( npairs == 0 )
 801         return;
 802     CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );
 803
 804     size_t i, j, k, esz1 = dst[0].elemSize1();
 805     int depth = dst[0].depth();
 806
 807     AutoBuffer<uchar> buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
 808     const Mat** arrays = (const Mat**)(uchar*)buf;
 809     uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
 810     const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
 811     uchar** dsts = (uchar**)(srcs + npairs);
 812     int* tab = (int*)(dsts + npairs);
 813     int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;
 814
 815     for( i = 0; i < nsrcs; i++ )
 816         arrays[i] = &src[i];
 817     for( i = 0; i < ndsts; i++ )
 818         arrays[i + nsrcs] = &dst[i];
 819     ptrs[nsrcs + ndsts] = 0;
 820
 821     for( i = 0; i < npairs; i++ )
 822     {
 823         int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
 824         if( i0 >= 0 )
 825         {
 826             for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
 827                 if( i0 < src[j].channels() )
 828                     break;
 829             CV_Assert(j < nsrcs && src[j].depth() == depth);
 830             tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
 831             sdelta[i] = src[j].channels();
 832         }
 833         else
 834         {
 835             tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
 836             sdelta[i] = 0;
 837         }
 838
 839         for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
 840             if( i1 < dst[j].channels() )
 841                 break;
 842         CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
 843         tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
 844         ddelta[i] = dst[j].channels();
 845     }
 846
 847     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
 848     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
 849     MixChannelsFunc func = getMixchFunc(depth);
 850
 851     for( i = 0; i < it.nplanes; i++, ++it )
 852     {
 853         for( k = 0; k < npairs; k++ )
 854         {
 855             srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
 856             dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
 857         }
 858
 859         for( int t = 0; t < total; t += blocksize )
 860         {
 861             int bsz = std::min(total - t, blocksize);
 862             func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
 863
 864             if( t + blocksize < total )
 865                 for( k = 0; k < npairs; k++ )
 866                 {
 867                     srcs[k] += blocksize*sdelta[k]*esz1;
 868                     dsts[k] += blocksize*ddelta[k]*esz1;
 869                 }
 870         }
 871     }
 872 }
 873
 874 #ifdef HAVE_OPENCL
 875
 876 namespace cv {
 877
 878 static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
 879 {
 880     int totalChannels = 0;
 881     for (size_t i = 0, size = um.size(); i < size; ++i)
 882     {
 883         int ccn = um[i].channels();
 884         totalChannels += ccn;
 885
 886         if (totalChannels == cn)
 887         {
 888             idx = (int)(i + 1);
 889             cnidx = 0;
 890             return;
 891         }
 892         else if (totalChannels > cn)
 893         {
 894             idx = (int)i;
 895             cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
 896             return;
 897         }
 898     }
 899
 900     idx = cnidx = -1;
 901 }
 902
 903 static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
 904                             const int* fromTo, size_t npairs)
 905 {
 906     std::vector<UMat> src, dst;
 907     _src.getUMatVector(src);
 908     _dst.getUMatVector(dst);
 909
 910     size_t nsrc = src.size(), ndst = dst.size();
 911     CV_Assert(nsrc > 0 && ndst > 0);
 912
 913     Size size = src[0].size();
 914     int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
 915             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
 916
 917     for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
 918         CV_Assert(src[i].size() == size && src[i].depth() == depth);
 919     for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
 920         CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
 921
 922     String declsrc, decldst, declproc, declcn, indexdecl;
 923     std::vector<UMat> srcargs(npairs), dstargs(npairs);
 924
 925     for (size_t i = 0; i < npairs; ++i)
 926     {
 927         int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
 928         int src_idx, src_cnidx, dst_idx, dst_cnidx;
 929
 930         getUMatIndex(src, scn, src_idx, src_cnidx);
 931         getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
 932
 933         CV_Assert(dst_idx >= 0 && src_idx >= 0);
 934
 935         srcargs[i] = src[src_idx];
 936         srcargs[i].offset += src_cnidx * esz;
 937
 938         dstargs[i] = dst[dst_idx];
 939         dstargs[i].offset += dst_cnidx * esz;
 940
 941         declsrc += format("DECLARE_INPUT_MAT(%d)", i);
 942         decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
 943         indexdecl += format("DECLARE_INDEX(%d)", i);
 944         declproc += format("PROCESS_ELEM(%d)", i);
 945         declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
 946     }
 947
 948     ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
 949                   format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
 950                          " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
 951                          ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
 952                          declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
 953     if (k.empty())
 954         return false;
 955
 956     int argindex = 0;
 957     for (size_t i = 0; i < npairs; ++i)
 958         argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
 959     for (size_t i = 0; i < npairs; ++i)
 960         argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
 961     argindex = k.set(argindex, size.height);
 962     argindex = k.set(argindex, size.width);
 963     k.set(argindex, rowsPerWI);
 964
 965     size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
 966     return k.run(2, globalsize, NULL, false);
 967 }
 968
 969 }
 970
 971 #endif
 972
 973 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
 974                  const int* fromTo, size_t npairs)
 975 {
 976     if (npairs == 0 || fromTo == NULL)
 977         return;
 978
 979     CV_OCL_RUN(dst.isUMatVector(),
 980                ocl_mixChannels(src, dst, fromTo, npairs))
 981
 982     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
 983             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
 984             src.kind() != _InputArray::STD_VECTOR_UMAT;
 985     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
 986             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
 987             dst.kind() != _InputArray::STD_VECTOR_UMAT;
 988     int i;
 989     int nsrc = src_is_mat ? 1 : (int)src.total();
 990     int ndst = dst_is_mat ? 1 : (int)dst.total();
 991
 992     CV_Assert(nsrc > 0 && ndst > 0);
 993     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
 994     Mat* buf = _buf;
 995     for( i = 0; i < nsrc; i++ )
 996         buf[i] = src.getMat(src_is_mat ? -1 : i);
 997     for( i = 0; i < ndst; i++ )
 998         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
 999     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
1000 }
1001
1002 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
1003                      const std::vector<int>& fromTo)
1004 {
1005     if (fromTo.empty())
1006         return;
1007
1008     CV_OCL_RUN(dst.isUMatVector(),
1009                ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
1010
1011     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
1012             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
1013             src.kind() != _InputArray::STD_VECTOR_UMAT;
1014     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
1015             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
1016             dst.kind() != _InputArray::STD_VECTOR_UMAT;
1017     int i;
1018     int nsrc = src_is_mat ? 1 : (int)src.total();
1019     int ndst = dst_is_mat ? 1 : (int)dst.total();
1020
1021     CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
1022     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
1023     Mat* buf = _buf;
1024     for( i = 0; i < nsrc; i++ )
1025         buf[i] = src.getMat(src_is_mat ? -1 : i);
1026     for( i = 0; i < ndst; i++ )
1027         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
1028     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
1029 }
1030
1031 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
1032 {
1033     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1034     CV_Assert( 0 <= coi && coi < cn );
1035     int ch[] = { coi, 0 };
1036
1037     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
1038     {
1039         UMat src = _src.getUMat();
1040         _dst.create(src.dims, &src.size[0], depth);
1041         UMat dst = _dst.getUMat();
1042         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
1043         return;
1044     }
1045
1046     Mat src = _src.getMat();
1047     _dst.create(src.dims, &src.size[0], depth);
1048     Mat dst = _dst.getMat();
1049     mixChannels(&src, 1, &dst, 1, ch, 1);
1050 }
1051
1052 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
1053 {
1054     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
1055     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
1056     CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
1057     CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
1058
1059     int ch[] = { 0, coi };
1060     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
1061     {
1062         UMat src = _src.getUMat(), dst = _dst.getUMat();
1063         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
1064         return;
1065     }
1066
1067     Mat src = _src.getMat(), dst = _dst.getMat();
1068     mixChannels(&src, 1, &dst, 1, ch, 1);
1069 }
1070
1071 /****************************************************************************************\
1072 *                                convertScale[Abs]                                       *
1073 \****************************************************************************************/
1074
1075 namespace cv
1076 {
1077
// Fallback vector kernel for |src*scale + shift|: handles nothing and reports
// zero processed elements, so the caller's scalar loop covers the whole row.
// Specializations return how many leading elements they have written.
template<typename T, typename DT, typename WT>
struct cvtScaleAbs_SIMD
{
    int operator () (const T * /*src*/, DT * /*dst*/, int /*width*/,
                     WT /*scale*/, WT /*shift*/) const
    {
        const int processed = 0;
        return processed;
    }
};
1086
1087 #if CV_SSE2
1088
1089 template <>
1090 struct cvtScaleAbs_SIMD<uchar, uchar, float>
1091 {
1092     int operator () (const uchar * src, uchar * dst, int width,
1093                      float scale, float shift) const
1094     {
1095         int x = 0;
1096
1097         if (USE_SSE2)
1098         {
1099             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1100                 v_zero_f = _mm_setzero_ps();
1101             __m128i v_zero_i = _mm_setzero_si128();
1102
1103             for ( ; x <= width - 16; x += 16)
1104             {
1105                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1106                 __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
1107                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
1108                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1109                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
1110                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1111                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
1112                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
1113                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
1114                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
1115
1116                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
1117                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
1118                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
1119             }
1120         }
1121
1122         return x;
1123     }
1124 };
1125
1126 template <>
1127 struct cvtScaleAbs_SIMD<ushort, uchar, float>
1128 {
1129     int operator () (const ushort * src, uchar * dst, int width,
1130                      float scale, float shift) const
1131     {
1132         int x = 0;
1133
1134         if (USE_SSE2)
1135         {
1136             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1137                 v_zero_f = _mm_setzero_ps();
1138             __m128i v_zero_i = _mm_setzero_si128();
1139
1140             for ( ; x <= width - 8; x += 8)
1141             {
1142                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1143                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
1144                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1145                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
1146                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1147
1148                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
1149                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1150             }
1151         }
1152
1153         return x;
1154     }
1155 };
1156
1157 template <>
1158 struct cvtScaleAbs_SIMD<short, uchar, float>
1159 {
1160     int operator () (const short * src, uchar * dst, int width,
1161                      float scale, float shift) const
1162     {
1163         int x = 0;
1164
1165         if (USE_SSE2)
1166         {
1167             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1168                 v_zero_f = _mm_setzero_ps();
1169             __m128i v_zero_i = _mm_setzero_si128();
1170
1171             for ( ; x <= width - 8; x += 8)
1172             {
1173                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1174                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
1175                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1176                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
1177                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1178
1179                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
1180                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1181             }
1182         }
1183
1184         return x;
1185     }
1186 };
1187
1188 template <>
1189 struct cvtScaleAbs_SIMD<int, uchar, float>
1190 {
1191     int operator () (const int * src, uchar * dst, int width,
1192                      float scale, float shift) const
1193     {
1194         int x = 0;
1195
1196         if (USE_SSE2)
1197         {
1198             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1199                 v_zero_f = _mm_setzero_ps();
1200             __m128i v_zero_i = _mm_setzero_si128();
1201
1202             for ( ; x <= width - 8; x += 4)
1203             {
1204                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1205                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
1206                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1207
1208                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
1209                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1210             }
1211         }
1212
1213         return x;
1214     }
1215 };
1216
1217 template <>
1218 struct cvtScaleAbs_SIMD<float, uchar, float>
1219 {
1220     int operator () (const float * src, uchar * dst, int width,
1221                      float scale, float shift) const
1222     {
1223         int x = 0;
1224
1225         if (USE_SSE2)
1226         {
1227             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1228                 v_zero_f = _mm_setzero_ps();
1229             __m128i v_zero_i = _mm_setzero_si128();
1230
1231             for ( ; x <= width - 8; x += 4)
1232             {
1233                 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
1234                 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
1235
1236                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
1237                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
1238             }
1239         }
1240
1241         return x;
1242     }
1243 };
1244
1245 #elif CV_NEON
1246
1247 template <>
1248 struct cvtScaleAbs_SIMD<uchar, uchar, float>
1249 {
1250     int operator () (const uchar * src, uchar * dst, int width,
1251                      float scale, float shift) const
1252     {
1253         int x = 0;
1254         float32x4_t v_shift = vdupq_n_f32(shift);
1255
1256         for ( ; x <= width - 16; x += 16)
1257         {
1258             uint8x16_t v_src = vld1q_u8(src + x);
1259             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
1260
1261             uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
1262             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1263             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1264
1265             v_quat = vmovl_u16(vget_high_u16(v_half));
1266             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1267             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1268
1269             v_half = vmovl_u8(vget_high_u8(v_src));
1270
1271             v_quat = vmovl_u16(vget_low_u16(v_half));
1272             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1273             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
1274
1275             v_quat = vmovl_u16(vget_high_u16(v_half));
1276             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1277             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
1278
1279             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1280                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1281             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
1282                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
1283
1284             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
1285         }
1286
1287         return x;
1288     }
1289 };
1290
1291 template <>
1292 struct cvtScaleAbs_SIMD<schar, uchar, float>
1293 {
1294     int operator () (const schar * src, uchar * dst, int width,
1295                      float scale, float shift) const
1296     {
1297         int x = 0;
1298         float32x4_t v_shift = vdupq_n_f32(shift);
1299
1300         for ( ; x <= width - 16; x += 16)
1301         {
1302             int8x16_t v_src = vld1q_s8(src + x);
1303             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
1304
1305             int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
1306             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1307             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1308
1309             v_quat = vmovl_s16(vget_high_s16(v_half));
1310             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1311             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1312
1313             v_half = vmovl_s8(vget_high_s8(v_src));
1314
1315             v_quat = vmovl_s16(vget_low_s16(v_half));
1316             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1317             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
1318
1319             v_quat = vmovl_s16(vget_high_s16(v_half));
1320             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1321             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
1322
1323             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1324                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1325             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
1326                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
1327
1328             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
1329         }
1330
1331         return x;
1332     }
1333 };
1334
1335 template <>
1336 struct cvtScaleAbs_SIMD<ushort, uchar, float>
1337 {
1338     int operator () (const ushort * src, uchar * dst, int width,
1339                      float scale, float shift) const
1340     {
1341         int x = 0;
1342         float32x4_t v_shift = vdupq_n_f32(shift);
1343
1344         for ( ; x <= width - 8; x += 8)
1345         {
1346             uint16x8_t v_src = vld1q_u16(src + x);
1347
1348             uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
1349             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
1350             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1351
1352             v_half = vmovl_u16(vget_high_u16(v_src));
1353             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
1354             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1355
1356             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1357                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1358
1359             vst1_u8(dst + x, vqmovn_u16(v_dst));
1360         }
1361
1362         return x;
1363     }
1364 };
1365
1366 template <>
1367 struct cvtScaleAbs_SIMD<short, uchar, float>
1368 {
1369     int operator () (const short * src, uchar * dst, int width,
1370                      float scale, float shift) const
1371     {
1372         int x = 0;
1373         float32x4_t v_shift = vdupq_n_f32(shift);
1374
1375         for ( ; x <= width - 8; x += 8)
1376         {
1377             int16x8_t v_src = vld1q_s16(src + x);
1378
1379             int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
1380             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
1381             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1382
1383             v_half = vmovl_s16(vget_high_s16(v_src));
1384             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
1385             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1386
1387             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1388                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1389
1390             vst1_u8(dst + x, vqmovn_u16(v_dst));
1391         }
1392
1393         return x;
1394     }
1395 };
1396
1397 template <>
1398 struct cvtScaleAbs_SIMD<int, uchar, float>
1399 {
1400     int operator () (const int * src, uchar * dst, int width,
1401                      float scale, float shift) const
1402     {
1403         int x = 0;
1404         float32x4_t v_shift = vdupq_n_f32(shift);
1405
1406         for ( ; x <= width - 8; x += 8)
1407         {
1408             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
1409             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1410             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
1411
1412             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
1413             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1414             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
1415
1416             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
1417             vst1_u8(dst + x, vqmovn_u16(v_dst));
1418         }
1419
1420         return x;
1421     }
1422 };
1423
1424 template <>
1425 struct cvtScaleAbs_SIMD<float, uchar, float>
1426 {
1427     int operator () (const float * src, uchar * dst, int width,
1428                      float scale, float shift) const
1429     {
1430         int x = 0;
1431         float32x4_t v_shift = vdupq_n_f32(shift);
1432
1433         for ( ; x <= width - 8; x += 8)
1434         {
1435             float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
1436             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1437             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
1438
1439             float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
1440             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1441             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
1442
1443             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
1444             vst1_u8(dst + x, vqmovn_u16(v_dst));
1445         }
1446
1447         return x;
1448     }
1449 };
1450
1451 #endif
1452
1453 template<typename T, typename DT, typename WT> static void
1454 cvtScaleAbs_( const T* src, size_t sstep,
1455               DT* dst, size_t dstep, Size size,
1456               WT scale, WT shift )
1457 {
1458     sstep /= sizeof(src[0]);
1459     dstep /= sizeof(dst[0]);
1460     cvtScaleAbs_SIMD<T, DT, WT> vop;
1461
1462     for( ; size.height--; src += sstep, dst += dstep )
1463     {
1464         int x = vop(src, dst, size.width, scale, shift);
1465
1466         #if CV_ENABLE_UNROLLED
1467         for( ; x <= size.width - 4; x += 4 )
1468         {
1469             DT t0, t1;
1470             t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift));
1471             t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
1472             dst[x] = t0; dst[x+1] = t1;
1473             t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
1474             t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
1475             dst[x+2] = t0; dst[x+3] = t1;
1476         }
1477         #endif
1478         for( ; x < size.width; x++ )
1479             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
1480     }
1481 }
1482
// Fallback vector kernel for src*scale + shift: handles nothing and reports zero
// processed elements. Specializations return how many leading elements they
// have written.
template <typename T, typename DT, typename WT>
struct cvtScale_SIMD
{
    int operator () (const T * /*src*/, DT * /*dst*/, int /*width*/,
                     WT /*scale*/, WT /*shift*/) const
    {
        const int processed = 0;
        return processed;
    }
};
1491
1492 #if CV_NEON
1493
1494 // from uchar
1495
1496 template <>
1497 struct cvtScale_SIMD<uchar, uchar, float>
1498 {
1499     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
1500     {
1501         int x = 0;
1502         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1503
1504         for ( ; x <= width - 8; x += 8)
1505         {
1506             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1507             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1508             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1509
1510             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1511                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1512             vst1_u8(dst + x, vqmovn_u16(v_dst));
1513         }
1514
1515         return x;
1516     }
1517 };
1518
1519 template <>
1520 struct cvtScale_SIMD<uchar, schar, float>
1521 {
1522     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
1523     {
1524         int x = 0;
1525         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1526
1527         for ( ; x <= width - 8; x += 8)
1528         {
1529             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1530             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1531             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1532
1533             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1534                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1535             vst1_s8(dst + x, vqmovn_s16(v_dst));
1536         }
1537
1538         return x;
1539     }
1540 };
1541
1542 template <>
1543 struct cvtScale_SIMD<uchar, ushort, float>
1544 {
1545     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
1546     {
1547         int x = 0;
1548         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1549
1550         for ( ; x <= width - 8; x += 8)
1551         {
1552             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1553             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1554             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1555
1556             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1557                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1558             vst1q_u16(dst + x, v_dst);
1559         }
1560
1561         return x;
1562     }
1563 };
1564
1565 template <>
1566 struct cvtScale_SIMD<uchar, short, float>
1567 {
1568     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
1569     {
1570         int x = 0;
1571         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1572
1573         for ( ; x <= width - 8; x += 8)
1574         {
1575             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1576             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1577             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1578
1579             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1580                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1581             vst1q_s16(dst + x, v_dst);
1582         }
1583
1584         return x;
1585     }
1586 };
1587
1588 template <>
1589 struct cvtScale_SIMD<uchar, int, float>
1590 {
1591     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
1592     {
1593         int x = 0;
1594         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1595
1596         for ( ; x <= width - 8; x += 8)
1597         {
1598             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1599             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1600             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1601
1602             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
1603             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
1604         }
1605
1606         return x;
1607     }
1608 };
1609
1610 template <>
1611 struct cvtScale_SIMD<uchar, float, float>
1612 {
1613     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
1614     {
1615         int x = 0;
1616         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1617
1618         for ( ; x <= width - 8; x += 8)
1619         {
1620             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
1621             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
1622             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
1623         }
1624
1625         return x;
1626     }
1627 };
1628
1629 // from schar
1630
1631 template <>
1632 struct cvtScale_SIMD<schar, uchar, float>
1633 {
1634     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
1635     {
1636         int x = 0;
1637         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1638
1639         for ( ; x <= width - 8; x += 8)
1640         {
1641             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1642             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1643             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1644
1645             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1646                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1647             vst1_u8(dst + x, vqmovn_u16(v_dst));
1648         }
1649
1650         return x;
1651     }
1652 };
1653
1654 template <>
1655 struct cvtScale_SIMD<schar, schar, float>
1656 {
1657     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
1658     {
1659         int x = 0;
1660         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1661
1662         for ( ; x <= width - 8; x += 8)
1663         {
1664             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1665             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1666             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1667
1668             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1669                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1670             vst1_s8(dst + x, vqmovn_s16(v_dst));
1671         }
1672
1673         return x;
1674     }
1675 };
1676
1677 template <>
1678 struct cvtScale_SIMD<schar, ushort, float>
1679 {
1680     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
1681     {
1682         int x = 0;
1683         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1684
1685         for ( ; x <= width - 8; x += 8)
1686         {
1687             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1688             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1689             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1690
1691             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1692                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1693             vst1q_u16(dst + x, v_dst);
1694         }
1695
1696         return x;
1697     }
1698 };
1699
1700 template <>
1701 struct cvtScale_SIMD<schar, short, float>
1702 {
1703     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
1704     {
1705         int x = 0;
1706         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1707
1708         for ( ; x <= width - 8; x += 8)
1709         {
1710             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1711             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1712             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1713
1714             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1715                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1716             vst1q_s16(dst + x, v_dst);
1717         }
1718
1719         return x;
1720     }
1721 };
1722
1723 template <>
1724 struct cvtScale_SIMD<schar, int, float>
1725 {
1726     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
1727     {
1728         int x = 0;
1729         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1730
1731         for ( ; x <= width - 8; x += 8)
1732         {
1733             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1734             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1735             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1736
1737             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
1738             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
1739         }
1740
1741         return x;
1742     }
1743 };
1744
1745 template <>
1746 struct cvtScale_SIMD<schar, float, float>
1747 {
1748     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
1749     {
1750         int x = 0;
1751         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1752
1753         for ( ; x <= width - 8; x += 8)
1754         {
1755             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
1756             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
1757             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
1758         }
1759
1760         return x;
1761     }
1762 };
1763
1764 // from ushort
1765
1766 template <>
1767 struct cvtScale_SIMD<ushort, uchar, float>
1768 {
1769     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
1770     {
1771         int x = 0;
1772         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1773
1774         for ( ; x <= width - 8; x += 8)
1775         {
1776             uint16x8_t v_src = vld1q_u16(src + x);
1777             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1778             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1779
1780             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1781                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1782             vst1_u8(dst + x, vqmovn_u16(v_dst));
1783         }
1784
1785         return x;
1786     }
1787 };
1788
1789 template <>
1790 struct cvtScale_SIMD<ushort, schar, float>
1791 {
1792     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
1793     {
1794         int x = 0;
1795         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1796
1797         for ( ; x <= width - 8; x += 8)
1798         {
1799             uint16x8_t v_src = vld1q_u16(src + x);
1800             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1801             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1802
1803             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1804                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1805             vst1_s8(dst + x, vqmovn_s16(v_dst));
1806         }
1807
1808         return x;
1809     }
1810 };
1811
1812 template <>
1813 struct cvtScale_SIMD<ushort, ushort, float>
1814 {
1815     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
1816     {
1817         int x = 0;
1818         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1819
1820         for ( ; x <= width - 8; x += 8)
1821         {
1822             uint16x8_t v_src = vld1q_u16(src + x);
1823             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1824             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1825
1826             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1827                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1828             vst1q_u16(dst + x, v_dst);
1829         }
1830
1831         return x;
1832     }
1833 };
1834
1835 template <>
1836 struct cvtScale_SIMD<ushort, short, float>
1837 {
1838     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
1839     {
1840         int x = 0;
1841         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1842
1843         for ( ; x <= width - 8; x += 8)
1844         {
1845             uint16x8_t v_src = vld1q_u16(src + x);
1846             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1847             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1848
1849             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1850                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1851             vst1q_s16(dst + x, v_dst);
1852         }
1853
1854         return x;
1855     }
1856 };
1857
1858 template <>
1859 struct cvtScale_SIMD<ushort, int, float>
1860 {
1861     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
1862     {
1863         int x = 0;
1864         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1865
1866         for ( ; x <= width - 8; x += 8)
1867         {
1868             uint16x8_t v_src = vld1q_u16(src + x);
1869             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
1870             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
1871
1872             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
1873             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
1874         }
1875
1876         return x;
1877     }
1878 };
1879
1880 template <>
1881 struct cvtScale_SIMD<ushort, float, float>
1882 {
1883     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
1884     {
1885         int x = 0;
1886         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1887
1888         for ( ; x <= width - 8; x += 8)
1889         {
1890             uint16x8_t v_src = vld1q_u16(src + x);
1891             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
1892             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
1893         }
1894
1895         return x;
1896     }
1897 };
1898
1899 // from short
1900
1901 template <>
1902 struct cvtScale_SIMD<short, uchar, float>
1903 {
1904     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
1905     {
1906         int x = 0;
1907         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1908
1909         for ( ; x <= width - 8; x += 8)
1910         {
1911             int16x8_t v_src = vld1q_s16(src + x);
1912             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1913             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1914
1915             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1916                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1917             vst1_u8(dst + x, vqmovn_u16(v_dst));
1918         }
1919
1920         return x;
1921     }
1922 };
1923
1924 template <>
1925 struct cvtScale_SIMD<short, schar, float>
1926 {
1927     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
1928     {
1929         int x = 0;
1930         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1931
1932         for ( ; x <= width - 8; x += 8)
1933         {
1934             int16x8_t v_src = vld1q_s16(src + x);
1935             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1936             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1937
1938             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
1939                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
1940             vst1_s8(dst + x, vqmovn_s16(v_dst));
1941         }
1942
1943         return x;
1944     }
1945 };
1946
1947 template <>
1948 struct cvtScale_SIMD<short, ushort, float>
1949 {
1950     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
1951     {
1952         int x = 0;
1953         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1954
1955         for ( ; x <= width - 8; x += 8)
1956         {
1957             int16x8_t v_src = vld1q_s16(src + x);
1958             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
1959             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
1960
1961             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
1962                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
1963             vst1q_u16(dst + x, v_dst);
1964         }
1965
1966         return x;
1967     }
1968 };
1969
1970 template <>
1971 struct cvtScale_SIMD<short, float, float>
1972 {
1973     int operator () (const short * src, float * dst, int width, float scale, float shift) const
1974     {
1975         int x = 0;
1976         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1977
1978         for ( ; x <= width - 8; x += 8)
1979         {
1980             int16x8_t v_src = vld1q_s16(src + x);
1981             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
1982             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
1983         }
1984
1985         return x;
1986     }
1987 };
1988
1989 // from int
1990
1991 template <>
1992 struct cvtScale_SIMD<int, uchar, float>
1993 {
1994     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
1995     {
1996         int x = 0;
1997         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
1998
1999         for ( ; x <= width - 8; x += 8)
2000         {
2001             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
2002             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
2003
2004             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2005                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2006             vst1_u8(dst + x, vqmovn_u16(v_dst));
2007         }
2008
2009         return x;
2010     }
2011 };
2012
2013 template <>
2014 struct cvtScale_SIMD<int, schar, float>
2015 {
2016     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
2017     {
2018         int x = 0;
2019         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2020
2021         for ( ; x <= width - 8; x += 8)
2022         {
2023             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
2024             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
2025
2026             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2027                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2028             vst1_s8(dst + x, vqmovn_s16(v_dst));
2029         }
2030
2031         return x;
2032     }
2033 };
2034
2035 template <>
2036 struct cvtScale_SIMD<int, ushort, float>
2037 {
2038     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
2039     {
2040         int x = 0;
2041         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2042
2043         for ( ; x <= width - 8; x += 8)
2044         {
2045             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
2046             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
2047
2048             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2049                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2050             vst1q_u16(dst + x, v_dst);
2051         }
2052
2053         return x;
2054     }
2055 };
2056
2057 template <>
2058 struct cvtScale_SIMD<int, short, float>
2059 {
2060     int operator () (const int * src, short * dst, int width, float scale, float shift) const
2061     {
2062         int x = 0;
2063         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2064
2065         for ( ; x <= width - 8; x += 8)
2066         {
2067             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
2068             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
2069
2070             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2071                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2072             vst1q_s16(dst + x, v_dst);
2073         }
2074
2075         return x;
2076     }
2077 };
2078
2079 // from float
2080
2081 template <>
2082 struct cvtScale_SIMD<float, uchar, float>
2083 {
2084     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
2085     {
2086         int x = 0;
2087         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2088
2089         for ( ; x <= width - 8; x += 8)
2090         {
2091             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
2092             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
2093
2094             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2095                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2096             vst1_u8(dst + x, vqmovn_u16(v_dst));
2097         }
2098
2099         return x;
2100     }
2101 };
2102
2103 template <>
2104 struct cvtScale_SIMD<float, schar, float>
2105 {
2106     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
2107     {
2108         int x = 0;
2109         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2110
2111         for ( ; x <= width - 8; x += 8)
2112         {
2113             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
2114             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
2115
2116             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2117                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2118             vst1_s8(dst + x, vqmovn_s16(v_dst));
2119         }
2120
2121         return x;
2122     }
2123 };
2124
2125 template <>
2126 struct cvtScale_SIMD<float, ushort, float>
2127 {
2128     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
2129     {
2130         int x = 0;
2131         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2132
2133         for ( ; x <= width - 8; x += 8)
2134         {
2135             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
2136             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
2137
2138             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2139                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2140             vst1q_u16(dst + x, v_dst);
2141         }
2142
2143         return x;
2144     }
2145 };
2146
2147 template <>
2148 struct cvtScale_SIMD<float, short, float>
2149 {
2150     int operator () (const float * src, short * dst, int width, float scale, float shift) const
2151     {
2152         int x = 0;
2153         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2154
2155         for ( ; x <= width - 8; x += 8)
2156         {
2157             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
2158             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
2159
2160             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2161                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2162             vst1q_s16(dst + x, v_dst);
2163         }
2164
2165         return x;
2166     }
2167 };
2168
2169 template <>
2170 struct cvtScale_SIMD<float, int, float>
2171 {
2172     int operator () (const float * src, int * dst, int width, float scale, float shift) const
2173     {
2174         int x = 0;
2175         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2176
2177         for ( ; x <= width - 4; x += 4)
2178             vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
2179
2180         return x;
2181     }
2182 };
2183
2184 template <>
2185 struct cvtScale_SIMD<float, float, float>
2186 {
2187     int operator () (const float * src, float * dst, int width, float scale, float shift) const
2188     {
2189         int x = 0;
2190         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
2191
2192         for ( ; x <= width - 4; x += 4)
2193             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
2194
2195         return x;
2196     }
2197 };
2198
2199 #endif
2200
2201 template<typename T, typename DT, typename WT> static void
2202 cvtScale_( const T* src, size_t sstep,
2203            DT* dst, size_t dstep, Size size,
2204            WT scale, WT shift )
2205 {
2206     sstep /= sizeof(src[0]);
2207     dstep /= sizeof(dst[0]);
2208
2209     cvtScale_SIMD<T, DT, WT> vop;
2210
2211     for( ; size.height--; src += sstep, dst += dstep )
2212     {
2213         int x = vop(src, dst, size.width, scale, shift);
2214
2215         #if CV_ENABLE_UNROLLED
2216         for( ; x <= size.width - 4; x += 4 )
2217         {
2218             DT t0, t1;
2219             t0 = saturate_cast<DT>(src[x]*scale + shift);
2220             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
2221             dst[x] = t0; dst[x+1] = t1;
2222             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
2223             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
2224             dst[x+2] = t0; dst[x+3] = t1;
2225         }
2226         #endif
2227
2228         for( ; x < size.width; x++ )
2229             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
2230     }
2231 }
2232
2233 //vz optimized template specialization
2234 template<> void
2235 cvtScale_<short, short, float>( const short* src, size_t sstep,
2236            short* dst, size_t dstep, Size size,
2237            float scale, float shift )
2238 {
2239     sstep /= sizeof(src[0]);
2240     dstep /= sizeof(dst[0]);
2241
2242     for( ; size.height--; src += sstep, dst += dstep )
2243     {
2244         int x = 0;
2245         #if CV_SSE2
2246             if(USE_SSE2)
2247             {
2248                 __m128 scale128 = _mm_set1_ps (scale);
2249                 __m128 shift128 = _mm_set1_ps (shift);
2250                 for(; x <= size.width - 8; x += 8 )
2251                 {
2252                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
2253                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
2254                     __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
2255                     __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
2256                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
2257                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
2258                     r0 = _mm_cvtps_epi32(rf0);
2259                     r1 = _mm_cvtps_epi32(rf1);
2260                     r0 = _mm_packs_epi32(r0, r1);
2261                     _mm_storeu_si128((__m128i*)(dst + x), r0);
2262                 }
2263             }
2264         #endif
2265
2266         for(; x < size.width; x++ )
2267             dst[x] = saturate_cast<short>(src[x]*scale + shift);
2268     }
2269 }
2270
2271 template<> void
2272 cvtScale_<short, int, float>( const short* src, size_t sstep,
2273            int* dst, size_t dstep, Size size,
2274            float scale, float shift )
2275 {
2276     sstep /= sizeof(src[0]);
2277     dstep /= sizeof(dst[0]);
2278
2279     for( ; size.height--; src += sstep, dst += dstep )
2280     {
2281         int x = 0;
2282
2283          #if CV_SSE2
2284             if(USE_SSE2)//~5X
2285             {
2286                 __m128 scale128 = _mm_set1_ps (scale);
2287                 __m128 shift128 = _mm_set1_ps (shift);
2288                 for(; x <= size.width - 8; x += 8 )
2289                 {
2290                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
2291                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
2292                     __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
2293                     __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
2294                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
2295                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
2296                     r0 = _mm_cvtps_epi32(rf0);
2297                     r1 = _mm_cvtps_epi32(rf1);
2298
2299                     _mm_storeu_si128((__m128i*)(dst + x), r0);
2300                     _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
2301                 }
2302             }
2303         #endif
2304
2305         //We will wait Haswell
2306         /*
2307         #if CV_AVX
2308             if(USE_AVX)//2X - bad variant
2309             {
2310                 ////TODO:AVX implementation (optimization?) required
2311                 __m256 scale256 = _mm256_set1_ps (scale);
2312                 __m256 shift256 = _mm256_set1_ps (shift);
2313                 for(; x <= size.width - 8; x += 8 )
2314                 {
2315                     __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
2316                     __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
2317                     __m256i res = _mm256_cvtps_epi32(r0);
2318                     _mm256_storeu_si256 ((__m256i*)(dst+x), res);
2319                 }
2320             }
2321         #endif*/
2322
2323         for(; x < size.width; x++ )
2324             dst[x] = saturate_cast<int>(src[x]*scale + shift);
2325     }
2326 }
2327
// Primary Cvt_SIMD template: the fallback used when no vectorised
// specialization exists for the (T, DT) pair. It touches nothing and
// reports that zero elements were converted.
template <typename T, typename DT>
struct Cvt_SIMD
{
    int operator() (const T * /*src*/, DT * /*dst*/, int /*width*/) const
    {
        const int processed = 0;
        return processed;
    }
};
2336
2337 #if CV_NEON
2338
2339 // from uchar
2340
2341 template <>
2342 struct Cvt_SIMD<uchar, schar>
2343 {
2344     int operator() (const uchar * src, schar * dst, int width) const
2345     {
2346         int x = 0;
2347
2348         for ( ; x <= width - 8; x += 8)
2349             vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
2350
2351         return x;
2352     }
2353 };
2354
2355
2356 template <>
2357 struct Cvt_SIMD<uchar, ushort>
2358 {
2359     int operator() (const uchar * src, ushort * dst, int width) const
2360     {
2361         int x = 0;
2362
2363         for ( ; x <= width - 8; x += 8)
2364             vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
2365
2366         return x;
2367     }
2368 };
2369
2370 template <>
2371 struct Cvt_SIMD<uchar, short>
2372 {
2373     int operator() (const uchar * src, short * dst, int width) const
2374     {
2375         int x = 0;
2376
2377         for ( ; x <= width - 8; x += 8)
2378             vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
2379
2380         return x;
2381     }
2382 };
2383
2384 template <>
2385 struct Cvt_SIMD<uchar, int>
2386 {
2387     int operator() (const uchar * src, int * dst, int width) const
2388     {
2389         int x = 0;
2390
2391         for ( ; x <= width - 8; x += 8)
2392         {
2393             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
2394             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
2395             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
2396         }
2397
2398         return x;
2399     }
2400 };
2401
2402 template <>
2403 struct Cvt_SIMD<uchar, float>
2404 {
2405     int operator() (const uchar * src, float * dst, int width) const
2406     {
2407         int x = 0;
2408
2409         for ( ; x <= width - 8; x += 8)
2410         {
2411             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
2412             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
2413             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
2414         }
2415
2416         return x;
2417     }
2418 };
2419
2420 // from schar
2421
2422 template <>
2423 struct Cvt_SIMD<schar, uchar>
2424 {
2425     int operator() (const schar * src, uchar * dst, int width) const
2426     {
2427         int x = 0;
2428
2429         for ( ; x <= width - 8; x += 8)
2430             vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
2431
2432         return x;
2433     }
2434 };
2435
2436 template <>
2437 struct Cvt_SIMD<schar, short>
2438 {
2439     int operator() (const schar * src, short * dst, int width) const
2440     {
2441         int x = 0;
2442
2443         for ( ; x <= width - 8; x += 8)
2444             vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
2445
2446         return x;
2447     }
2448 };
2449
2450 template <>
2451 struct Cvt_SIMD<schar, ushort>
2452 {
2453     int operator() (const schar * src, ushort * dst, int width) const
2454     {
2455         int x = 0;
2456
2457         for ( ; x <= width - 8; x += 8)
2458         {
2459             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
2460             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
2461                                             vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
2462         }
2463
2464         return x;
2465     }
2466 };
2467
2468
2469 template <>
2470 struct Cvt_SIMD<schar, int>
2471 {
2472     int operator() (const schar * src, int * dst, int width) const
2473     {
2474         int x = 0;
2475
2476         for ( ; x <= width - 8; x += 8)
2477         {
2478             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
2479             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
2480             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
2481         }
2482
2483         return x;
2484     }
2485 };
2486
2487 template <>
2488 struct Cvt_SIMD<schar, float>
2489 {
2490     int operator() (const schar * src, float * dst, int width) const
2491     {
2492         int x = 0;
2493
2494         for ( ; x <= width - 8; x += 8)
2495         {
2496             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
2497             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
2498             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
2499         }
2500
2501         return x;
2502     }
2503 };
2504
2505 // from ushort
2506
2507 template <>
2508 struct Cvt_SIMD<ushort, uchar>
2509 {
2510     int operator() (const ushort * src, uchar * dst, int width) const
2511     {
2512         int x = 0;
2513
2514         for ( ; x <= width - 16; x += 16)
2515         {
2516             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
2517             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
2518         }
2519
2520         return x;
2521     }
2522 };
2523
2524 template <>
2525 struct Cvt_SIMD<ushort, schar>
2526 {
2527     int operator() (const ushort * src, schar * dst, int width) const
2528     {
2529         int x = 0;
2530
2531         for ( ; x <= width - 16; x += 16)
2532         {
2533             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
2534             int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
2535             int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
2536             int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
2537             int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
2538
2539             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
2540                                           vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
2541         }
2542
2543         return x;
2544     }
2545 };
2546
2547 template <>
2548 struct Cvt_SIMD<ushort, short>
2549 {
2550     int operator() (const ushort * src, short * dst, int width) const
2551     {
2552         int x = 0;
2553
2554         for ( ; x <= width - 8; x += 8)
2555         {
2556             uint16x8_t v_src = vld1q_u16(src + x);
2557             int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
2558             int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
2559
2560             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
2561         }
2562
2563         return x;
2564     }
2565 };
2566
2567 template <>
2568 struct Cvt_SIMD<ushort, int>
2569 {
2570     int operator() (const ushort * src, int * dst, int width) const
2571     {
2572         int x = 0;
2573
2574         for ( ; x <= width - 8; x += 8)
2575         {
2576             uint16x8_t v_src = vld1q_u16(src + x);
2577             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
2578             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
2579         }
2580
2581         return x;
2582     }
2583 };
2584
2585 template <>
2586 struct Cvt_SIMD<ushort, float>
2587 {
2588     int operator() (const ushort * src, float * dst, int width) const
2589     {
2590         int x = 0;
2591
2592         for ( ; x <= width - 8; x += 8)
2593         {
2594             uint16x8_t v_src = vld1q_u16(src + x);
2595             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
2596             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
2597         }
2598
2599         return x;
2600     }
2601 };
2602
2603 // from short
2604
2605 template <>
2606 struct Cvt_SIMD<short, uchar>
2607 {
2608     int operator() (const short * src, uchar * dst, int width) const
2609     {
2610         int x = 0;
2611
2612         for ( ; x <= width - 16; x += 16)
2613         {
2614             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
2615             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
2616         }
2617
2618         return x;
2619     }
2620 };
2621
2622 template <>
2623 struct Cvt_SIMD<short, schar>
2624 {
2625     int operator() (const short * src, schar * dst, int width) const
2626     {
2627         int x = 0;
2628
2629         for ( ; x <= width - 16; x += 16)
2630         {
2631             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
2632             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
2633         }
2634
2635         return x;
2636     }
2637 };
2638
2639 template <>
2640 struct Cvt_SIMD<short, ushort>
2641 {
2642     int operator() (const short * src, ushort * dst, int width) const
2643     {
2644         int x = 0;
2645
2646         for ( ; x <= width - 8; x += 8)
2647         {
2648             int16x8_t v_src = vld1q_s16(src + x);
2649             uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
2650             uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
2651             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
2652         }
2653
2654         return x;
2655     }
2656 };
2657
2658 template <>
2659 struct Cvt_SIMD<short, int>
2660 {
2661     int operator() (const short * src, int * dst, int width) const
2662     {
2663         int x = 0;
2664
2665         for ( ; x <= width - 8; x += 8)
2666         {
2667             int16x8_t v_src = vld1q_s16(src + x);
2668             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
2669             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
2670         }
2671
2672         return x;
2673     }
2674 };
2675
2676 template <>
2677 struct Cvt_SIMD<short, float>
2678 {
2679     int operator() (const short * src, float * dst, int width) const
2680     {
2681         int x = 0;
2682
2683         for ( ; x <= width - 8; x += 8)
2684         {
2685             int16x8_t v_src = vld1q_s16(src + x);
2686             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
2687             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
2688         }
2689
2690         return x;
2691     }
2692 };
2693
2694 // from int
2695
2696 template <>
2697 struct Cvt_SIMD<int, uchar>
2698 {
2699     int operator() (const int * src, uchar * dst, int width) const
2700     {
2701         int x = 0;
2702
2703         for ( ; x <= width - 16; x += 16)
2704         {
2705             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
2706             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
2707             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
2708             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
2709             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
2710         }
2711
2712         return x;
2713     }
2714 };
2715
2716 template <>
2717 struct Cvt_SIMD<int, schar>
2718 {
2719     int operator() (const int * src, schar * dst, int width) const
2720     {
2721         int x = 0;
2722
2723         for ( ; x <= width - 16; x += 16)
2724         {
2725             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
2726             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
2727             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
2728             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
2729             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
2730         }
2731
2732         return x;
2733     }
2734 };
2735
2736
2737 template <>
2738 struct Cvt_SIMD<int, ushort>
2739 {
2740     int operator() (const int * src, ushort * dst, int width) const
2741     {
2742         int x = 0;
2743
2744         for ( ; x <= width - 8; x += 8)
2745         {
2746             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
2747             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
2748         }
2749
2750         return x;
2751     }
2752 };
2753
2754 template <>
2755 struct Cvt_SIMD<int, short>
2756 {
2757     int operator() (const int * src, short * dst, int width) const
2758     {
2759         int x = 0;
2760
2761         for ( ; x <= width - 8; x += 8)
2762         {
2763             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
2764             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
2765         }
2766
2767         return x;
2768     }
2769 };
2770
2771 template <>
2772 struct Cvt_SIMD<int, float>
2773 {
2774     int operator() (const int * src, float * dst, int width) const
2775     {
2776         int x = 0;
2777
2778         for ( ; x <= width - 4; x += 4)
2779             vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
2780
2781         return x;
2782     }
2783 };
2784
2785 // from float
2786
2787 template <>
2788 struct Cvt_SIMD<float, uchar>
2789 {
2790     int operator() (const float * src, uchar * dst, int width) const
2791     {
2792         int x = 0;
2793
2794         for ( ; x <= width - 16; x += 16)
2795         {
2796             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
2797             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
2798             uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
2799             uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
2800             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
2801             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
2802             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
2803         }
2804
2805         return x;
2806     }
2807 };
2808
2809 template <>
2810 struct Cvt_SIMD<float, schar>
2811 {
2812     int operator() (const float * src, schar * dst, int width) const
2813     {
2814         int x = 0;
2815
2816         for ( ; x <= width - 16; x += 16)
2817         {
2818             int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
2819             int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
2820             int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
2821             int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
2822             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
2823             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
2824             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
2825         }
2826
2827         return x;
2828     }
2829 };
2830
2831
2832 template <>
2833 struct Cvt_SIMD<float, ushort>
2834 {
2835     int operator() (const float * src, ushort * dst, int width) const
2836     {
2837         int x = 0;
2838
2839         for ( ; x <= width - 8; x += 8)
2840         {
2841             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
2842             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
2843             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
2844         }
2845
2846         return x;
2847     }
2848 };
2849
2850 template <>
2851 struct Cvt_SIMD<float, int>
2852 {
2853     int operator() (const float * src, int * dst, int width) const
2854     {
2855         int x = 0;
2856
2857         for ( ; x <= width - 4; x += 4)
2858             vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
2859
2860         return x;
2861     }
2862 };
2863
2864 #endif
2865
2866 template<typename T, typename DT> static void
2867 cvt_( const T* src, size_t sstep,
2868       DT* dst, size_t dstep, Size size )
2869 {
2870     sstep /= sizeof(src[0]);
2871     dstep /= sizeof(dst[0]);
2872     Cvt_SIMD<T, DT> vop;
2873
2874     for( ; size.height--; src += sstep, dst += dstep )
2875     {
2876         int x = vop(src, dst, size.width);
2877         #if CV_ENABLE_UNROLLED
2878         for( ; x <= size.width - 4; x += 4 )
2879         {
2880             DT t0, t1;
2881             t0 = saturate_cast<DT>(src[x]);
2882             t1 = saturate_cast<DT>(src[x+1]);
2883             dst[x] = t0; dst[x+1] = t1;
2884             t0 = saturate_cast<DT>(src[x+2]);
2885             t1 = saturate_cast<DT>(src[x+3]);
2886             dst[x+2] = t0; dst[x+3] = t1;
2887         }
2888         #endif
2889         for( ; x < size.width; x++ )
2890             dst[x] = saturate_cast<DT>(src[x]);
2891     }
2892 }
2893
2894 //vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
2895 template<>  void
2896 cvt_<float, short>( const float* src, size_t sstep,
2897      short* dst, size_t dstep, Size size )
2898 {
2899     sstep /= sizeof(src[0]);
2900     dstep /= sizeof(dst[0]);
2901
2902     for( ; size.height--; src += sstep, dst += dstep )
2903     {
2904         int x = 0;
2905         #if   CV_SSE2
2906         if(USE_SSE2){
2907               for( ; x <= size.width - 8; x += 8 )
2908             {
2909                 __m128 src128 = _mm_loadu_ps (src + x);
2910                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
2911
2912                 src128 = _mm_loadu_ps (src + x + 4);
2913                 __m128i src1_int128 = _mm_cvtps_epi32 (src128);
2914
2915                 src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
2916                 _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
2917             }
2918         }
2919         #endif
2920         for( ; x < size.width; x++ )
2921             dst[x] = saturate_cast<short>(src[x]);
2922     }
2923
2924 }
2925
2926
2927 template<typename T> static void
2928 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
2929 {
2930     sstep /= sizeof(src[0]);
2931     dstep /= sizeof(dst[0]);
2932
2933     for( ; size.height--; src += sstep, dst += dstep )
2934         memcpy(dst, src, size.width*sizeof(src[0]));
2935 }
2936
// Generates cvtScaleAbs<tag>: a wrapper that forwards to `kernel`, passing
// scale[0] and scale[1] cast to `work_t`.  The unnamed second pointer/step
// pair is accepted and ignored.
#define DEF_CVT_SCALE_ABS_FUNC(tag, kernel, src_t, dst_t, work_t) \
static void cvtScaleAbs##tag( const src_t* src, size_t sstep, \
                              const uchar*, size_t, \
                              dst_t* dst, size_t dstep, \
                              Size size, double* scale ) \
{ \
    kernel( src, sstep, dst, dstep, size, \
            (work_t)scale[0], (work_t)scale[1] ); \
}
2943
// Generates cvtScale<tag>: a wrapper that forwards to cvtScale_, passing
// scale[0] and scale[1] cast to `work_t`.  The unnamed second pointer/step
// pair is accepted and ignored.
#define DEF_CVT_SCALE_FUNC(tag, src_t, dst_t, work_t) \
static void cvtScale##tag( const src_t* src, size_t sstep, \
                           const uchar*, size_t, \
                           dst_t* dst, size_t dstep, \
                           Size size, double* scale ) \
{ \
    cvtScale_( src, sstep, dst, dstep, size, \
               (work_t)scale[0], (work_t)scale[1] ); \
}
2950
// Generators for cvt<tag> with an optional IPP fast path.
//
// With HAVE_IPP defined, the generated function calls ippiConvert_<ipp_fn>
// when both pointers are non-null and returns if the status is >= 0;
// otherwise it calls setIppErrorStatus() and falls through to cvt_.
// DEF_CVT_FUNC_F2 is the same except that it passes the two extra
// arguments `ippRndFinancial, 0` to the IPP call.
//
// Without HAVE_IPP both macros generate a plain call to cvt_ and the
// `ipp_fn` argument is unused.
#if defined(HAVE_IPP)
#define DEF_CVT_FUNC_F(tag, src_t, dst_t, ipp_fn) \
static void cvt##tag( const src_t* src, size_t sstep, \
                      const uchar*, size_t, \
                      dst_t* dst, size_t dstep, \
                      Size size, double* ) \
{ \
    if( src && dst ) \
    { \
        if( ippiConvert_##ipp_fn( src, (int)sstep, dst, (int)dstep, \
                                  ippiSize(size.width, size.height) ) >= 0 ) \
            return; \
        setIppErrorStatus(); \
    } \
    cvt_( src, sstep, dst, dstep, size ); \
}

#define DEF_CVT_FUNC_F2(tag, src_t, dst_t, ipp_fn) \
static void cvt##tag( const src_t* src, size_t sstep, \
                      const uchar*, size_t, \
                      dst_t* dst, size_t dstep, \
                      Size size, double* ) \
{ \
    if( src && dst ) \
    { \
        if( ippiConvert_##ipp_fn( src, (int)sstep, dst, (int)dstep, \
                                  ippiSize(size.width, size.height), \
                                  ippRndFinancial, 0 ) >= 0 ) \
            return; \
        setIppErrorStatus(); \
    } \
    cvt_( src, sstep, dst, dstep, size ); \
}
#else
#define DEF_CVT_FUNC_F(tag, src_t, dst_t, ipp_fn) \
static void cvt##tag( const src_t* src, size_t sstep, \
                      const uchar*, size_t, \
                      dst_t* dst, size_t dstep, \
                      Size size, double* ) \
{ \
    cvt_( src, sstep, dst, dstep, size ); \
}
#define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
#endif
2986
// Generates cvt<tag>: a wrapper that forwards straight to cvt_ for the
// given source/destination element types.  The unnamed second pointer/step
// pair and the trailing double* are ignored.
#define DEF_CVT_FUNC(tag, src_t, dst_t) \
static void cvt##tag( const src_t* src, size_t sstep, \
                      const uchar*, size_t, \
                      dst_t* dst, size_t dstep, \
                      Size size, double* ) \
{ \
    cvt_( src, sstep, dst, dstep, size ); \
}
2993
// Generates cvt<tag> for identical source and destination element types:
// the wrapper forwards to cpy_.  The unnamed second pointer/step pair and
// the trailing double* are ignored.
#define DEF_CPY_FUNC(tag, elem_t) \
static void cvt##tag( const elem_t* src, size_t sstep, \
                      const uchar*, size_t, \
                      elem_t* dst, size_t dstep, \
                      Size size, double* ) \
{ \
    cpy_( src, sstep, dst, dstep, size ); \
}
3000
3001
3002 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
3003 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
3004 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
3005 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
3006 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
3007 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
3008 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
3009
3010 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
3011 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
3012 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
3013 DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
3014 DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
3015 DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
3016 DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
3017
3018 DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
3019 DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
3020 DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
3021 DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
3022 DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
3023 DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
3024 DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
3025
3026 DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
3027 DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
3028 DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
3029 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
3030 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
3031 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
3032 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
3033
3034 DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
3035 DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
3036 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
3037 DEF_CVT_SCALE_FUNC(16s,    short, short, float)
3038 DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
3039 DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
3040 DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
3041
3042 DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
3043 DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
3044 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
3045 DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
3046 DEF_CVT_SCALE_FUNC(32s,    int, int, double)
3047 DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
3048 DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
3049
3050 DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
3051 DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
3052 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
3053 DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
3054 DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
3055 DEF_CVT_SCALE_FUNC(32f,    float, float, float)
3056 DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
3057
3058 DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
3059 DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
3060 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
3061 DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
3062 DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
3063 DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
3064 DEF_CVT_SCALE_FUNC(64f,    double, double, double)
3065
3066 DEF_CPY_FUNC(8u,     uchar)
3067 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
3068 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
3069 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
3070 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
3071 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
3072 DEF_CVT_FUNC(64f8u,  double, uchar)
3073
3074 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
3075 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
3076 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
3077 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
3078 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
3079 DEF_CVT_FUNC(64f8s,  double, schar)
3080
3081 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
3082 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
3083 DEF_CPY_FUNC(16u,    ushort)
3084 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
3085 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
3086 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
3087 DEF_CVT_FUNC(64f16u, double, ushort)
3088
3089 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
3090 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
3091 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
3092 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
3093 DEF_CVT_FUNC(32f16s, float, short)
3094 DEF_CVT_FUNC(64f16s, double, short)
3095
3096 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
3097 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
3098 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
3099 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
3100 DEF_CPY_FUNC(32s,    int)
3101 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
3102 DEF_CVT_FUNC(64f32s, double, int)
3103
3104 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
3105 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
3106 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
3107 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
3108 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
3109 DEF_CVT_FUNC(64f32f, double, float)
3110
3111 DEF_CVT_FUNC(8u64f,  uchar, double)
3112 DEF_CVT_FUNC(8s64f,  schar, double)
3113 DEF_CVT_FUNC(16u64f, ushort, double)
3114 DEF_CVT_FUNC(16s64f, short, double)
3115 DEF_CVT_FUNC(32s64f, int, double)
3116 DEF_CVT_FUNC(32f64f, float, double)
3117 DEF_CPY_FUNC(64s,    int64)
3118
3119 static BinaryFunc getCvtScaleAbsFunc(int depth)
3120 {
3121     static BinaryFunc cvtScaleAbsTab[] =
3122     {
3123         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
3124         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
3125         (BinaryFunc)cvtScaleAbs64f8u, 0
3126     };
3127
3128     return cvtScaleAbsTab[depth];
3129 }
3130
3131 BinaryFunc getConvertFunc(int sdepth, int ddepth)
3132 {
3133     static BinaryFunc cvtTab[][8] =
3134     {
3135         {
3136             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
3137             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
3138             (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0
3139         },
3140         {
3141             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
3142             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
3143             (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0
3144         },
3145         {
3146             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
3147             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
3148             (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0
3149         },
3150         {
3151             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
3152             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
3153             (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0
3154         },
3155         {
3156             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
3157             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
3158             (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0
3159         },
3160         {
3161             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
3162             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
3163             (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0
3164         },
3165         {
3166             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
3167             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
3168             (BinaryFunc)(cvt64s), 0
3169         },
3170         {
3171             0, 0, 0, 0, 0, 0, 0, 0
3172         }
3173     };
3174
3175     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
3176 }
3177
3178 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
3179 {
3180     static BinaryFunc cvtScaleTab[][8] =
3181     {
3182         {
3183             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
3184             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
3185             (BinaryFunc)cvtScale64f8u, 0
3186         },
3187         {
3188             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
3189             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
3190             (BinaryFunc)cvtScale64f8s, 0
3191         },
3192         {
3193             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
3194             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
3195             (BinaryFunc)cvtScale64f16u, 0
3196         },
3197         {
3198             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
3199             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
3200             (BinaryFunc)cvtScale64f16s, 0
3201         },
3202         {
3203             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
3204             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
3205             (BinaryFunc)cvtScale64f32s, 0
3206         },
3207         {
3208             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
3209             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
3210             (BinaryFunc)cvtScale64f32f, 0
3211         },
3212         {
3213             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
3214             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
3215             (BinaryFunc)cvtScale64f, 0
3216         },
3217         {
3218             0, 0, 0, 0, 0, 0, 0, 0
3219         }
3220     };
3221
3222     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
3223 }
3224
3225 #ifdef HAVE_OPENCL
3226
3227 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
3228 {
3229     const ocl::Device & d = ocl::Device::getDefault();
3230     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
3231         kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
3232     bool doubleSupport = d.doubleFPConfig() > 0;
3233
3234     if (!doubleSupport && depth == CV_64F)
3235         return false;
3236
3237     char cvt[2][50];
3238     int wdepth = std::max(depth, CV_32F);
3239     String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
3240                          " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
3241                          " -D workT1=%s -D rowsPerWI=%d%s",
3242                          ocl::typeToStr(CV_8UC(kercn)),
3243                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
3244                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
3245                          ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
3246                          ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
3247                          ocl::typeToStr(wdepth), rowsPerWI,
3248                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
3249     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt);
3250     if (k.empty())
3251         return false;
3252
3253     UMat src = _src.getUMat();
3254     _dst.create(src.size(), CV_8UC(cn));
3255     UMat dst = _dst.getUMat();
3256
3257     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
3258             dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
3259
3260     if (wdepth == CV_32F)
3261         k.args(srcarg, dstarg, (float)alpha, (float)beta);
3262     else if (wdepth == CV_64F)
3263         k.args(srcarg, dstarg, alpha, beta);
3264
3265     size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
3266     return k.run(2, globalsize, NULL, false);
3267 }
3268
3269 #endif
3270
3271 }
3272
3273 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
3274 {
3275     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
3276                ocl_convertScaleAbs(_src, _dst, alpha, beta))
3277
3278     Mat src = _src.getMat();
3279     int cn = src.channels();
3280     double scale[] = {alpha, beta};
3281     _dst.create( src.dims, src.size, CV_8UC(cn) );
3282     Mat dst = _dst.getMat();
3283     BinaryFunc func = getCvtScaleAbsFunc(src.depth());
3284     CV_Assert( func != 0 );
3285
3286     if( src.dims <= 2 )
3287     {
3288         Size sz = getContinuousSize(src, dst, cn);
3289         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
3290     }
3291     else
3292     {
3293         const Mat* arrays[] = {&src, &dst, 0};
3294         uchar* ptrs[2];
3295         NAryMatIterator it(arrays, ptrs);
3296         Size sz((int)it.size*cn, 1);
3297
3298         for( size_t i = 0; i < it.nplanes; i++, ++it )
3299             func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale );
3300     }
3301 }
3302
3303 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
3304 {
3305     bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
3306
3307     if( _type < 0 )
3308         _type = _dst.fixedType() ? _dst.type() : type();
3309     else
3310         _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
3311
3312     int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
3313     if( sdepth == ddepth && noScale )
3314     {
3315         copyTo(_dst);
3316         return;
3317     }
3318
3319     Mat src = *this;
3320
3321     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
3322     double scale[] = {alpha, beta};
3323     int cn = channels();
3324     CV_Assert( func != 0 );
3325
3326     if( dims <= 2 )
3327     {
3328         _dst.create( size(), _type );
3329         Mat dst = _dst.getMat();
3330         Size sz = getContinuousSize(src, dst, cn);
3331         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
3332     }
3333     else
3334     {
3335         _dst.create( dims, size, _type );
3336         Mat dst = _dst.getMat();
3337         const Mat* arrays[] = {&src, &dst, 0};
3338         uchar* ptrs[2];
3339         NAryMatIterator it(arrays, ptrs);
3340         Size sz((int)(it.size*cn), 1);
3341
3342         for( size_t i = 0; i < it.nplanes; i++, ++it )
3343             func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
3344     }
3345 }
3346
3347 /****************************************************************************************\
3348 *                                    LUT Transform                                       *
3349 \****************************************************************************************/
3350
3351 namespace cv
3352 {
3353
// Table lookup over len pixels of cn interleaved 8-bit channels.
//   lutcn == 1 : one table shared by all channels, dst[i] = lut[src[i]].
//   otherwise  : the table is indexed as lut[value*cn + channel], i.e. the
//                per-channel tables are interleaved with cn entries per value.
template<typename T> static void
LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
{
    const int total = len*cn;

    if( lutcn != 1 )
    {
        for( int base = 0; base < total; base += cn )
        {
            const uchar* s = src + base;
            T* d = dst + base;
            for( int c = 0; c < cn; c++ )
                d[c] = lut[s[c]*cn + c];
        }
        return;
    }

    for( int i = 0; i < total; i++ )
        dst[i] = lut[src[i]];
}
3369
3370 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
3371 {
3372     LUT8u_( src, lut, dst, len, cn, lutcn );
3373 }
3374
3375 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
3376 {
3377     LUT8u_( src, lut, dst, len, cn, lutcn );
3378 }
3379
3380 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
3381 {
3382     LUT8u_( src, lut, dst, len, cn, lutcn );
3383 }
3384
3385 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
3386 {
3387     LUT8u_( src, lut, dst, len, cn, lutcn );
3388 }
3389
3390 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
3391 {
3392     LUT8u_( src, lut, dst, len, cn, lutcn );
3393 }
3394
3395 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
3396 {
3397     LUT8u_( src, lut, dst, len, cn, lutcn );
3398 }
3399
3400 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
3401 {
3402     LUT8u_( src, lut, dst, len, cn, lutcn );
3403 }
3404
3405 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
3406
3407 static LUTFunc lutTab[] =
3408 {
3409     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
3410     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
3411 };
3412
3413 #ifdef HAVE_OPENCL
3414
3415 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
3416 {
3417     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
3418
3419     UMat src = _src.getUMat(), lut = _lut.getUMat();
3420     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
3421     UMat dst = _dst.getUMat();
3422     int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;
3423
3424     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
3425                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
3426                          ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
3427     if (k.empty())
3428         return false;
3429
3430     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
3431         ocl::KernelArg::WriteOnly(dst, dcn, kercn));
3432
3433     size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
3434     return k.run(2, globalSize, NULL, false);
3435 }
3436
3437 #endif
3438
3439 #if defined(HAVE_IPP)
3440 namespace ipp {
3441
3442 #if 0 // there are no performance benefits (PR #2653)
3443 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
3444 {
3445 public:
3446     bool* ok;
3447     const Mat& src_;
3448     const Mat& lut_;
3449     Mat& dst_;
3450
3451     typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
3452                           IppiSize roiSize, const void* pTable, int nBitSize);
3453     IppFn fn;
3454
3455     int width;
3456
3457     IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
3458         : ok(_ok), src_(src), lut_(lut), dst_(dst)
3459     {
3460         width = dst.cols * dst.channels();
3461
3462         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
3463
3464         fn =
3465                 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
3466                 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
3467                 NULL;
3468
3469         *ok = (fn != NULL);
3470     }
3471
3472     void operator()( const cv::Range& range ) const
3473     {
3474         if (!*ok)
3475             return;
3476
3477         const int row0 = range.start;
3478         const int row1 = range.end;
3479
3480         Mat src = src_.rowRange(row0, row1);
3481         Mat dst = dst_.rowRange(row0, row1);
3482
3483         IppiSize sz = { width, dst.rows };
3484
3485         CV_DbgAssert(fn != NULL);
3486         if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
3487         {
3488             setIppErrorStatus();
3489             *ok = false;
3490         }
3491     }
3492 private:
3493     IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
3494     IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
3495 };
3496 #endif
3497
3498 class IppLUTParallelBody_LUTCN : public ParallelLoopBody
3499 {
3500 public:
3501     bool *ok;
3502     const Mat& src_;
3503     const Mat& lut_;
3504     Mat& dst_;
3505
3506     int lutcn;
3507
3508     uchar* lutBuffer;
3509     uchar* lutTable[4];
3510
3511     IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
3512         : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
3513     {
3514         lutcn = lut.channels();
3515         IppiSize sz256 = {256, 1};
3516
3517         size_t elemSize1 = dst.elemSize1();
3518         CV_DbgAssert(elemSize1 == 1);
3519         lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
3520         lutTable[0] = lutBuffer + 0;
3521         lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
3522         lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
3523         lutTable[3] = lutBuffer + 3 * 256 * elemSize1;
3524
3525         CV_DbgAssert(lutcn == 3 || lutcn == 4);
3526         if (lutcn == 3)
3527         {
3528             IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
3529             if (status < 0)
3530             {
3531                 setIppErrorStatus();
3532                 return;
3533             }
3534         }
3535         else if (lutcn == 4)
3536         {
3537             IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
3538             if (status < 0)
3539             {
3540                 setIppErrorStatus();
3541                 return;
3542             }
3543         }
3544
3545         *ok = true;
3546     }
3547
3548     ~IppLUTParallelBody_LUTCN()
3549     {
3550         if (lutBuffer != NULL)
3551             ippFree(lutBuffer);
3552         lutBuffer = NULL;
3553         lutTable[0] = NULL;
3554     }
3555
3556     void operator()( const cv::Range& range ) const
3557     {
3558         if (!*ok)
3559             return;
3560
3561         const int row0 = range.start;
3562         const int row1 = range.end;
3563
3564         Mat src = src_.rowRange(row0, row1);
3565         Mat dst = dst_.rowRange(row0, row1);
3566
3567         if (lutcn == 3)
3568         {
3569             if (ippiLUTPalette_8u_C3R(
3570                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
3571                     ippiSize(dst.size()), lutTable, 8) >= 0)
3572                 return;
3573         }
3574         else if (lutcn == 4)
3575         {
3576             if (ippiLUTPalette_8u_C4R(
3577                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
3578                     ippiSize(dst.size()), lutTable, 8) >= 0)
3579                 return;
3580         }
3581         setIppErrorStatus();
3582         *ok = false;
3583     }
3584 private:
3585     IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
3586     IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
3587 };
3588 } // namespace ipp
3589 #endif // IPP
3590
3591 class LUTParallelBody : public ParallelLoopBody
3592 {
3593 public:
3594     bool* ok;
3595     const Mat& src_;
3596     const Mat& lut_;
3597     Mat& dst_;
3598
3599     LUTFunc func;
3600
3601     LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
3602         : ok(_ok), src_(src), lut_(lut), dst_(dst)
3603     {
3604         func = lutTab[lut.depth()];
3605         *ok = (func != NULL);
3606     }
3607
3608     void operator()( const cv::Range& range ) const
3609     {
3610         CV_DbgAssert(*ok);
3611
3612         const int row0 = range.start;
3613         const int row1 = range.end;
3614
3615         Mat src = src_.rowRange(row0, row1);
3616         Mat dst = dst_.rowRange(row0, row1);
3617
3618         int cn = src.channels();
3619         int lutcn = lut_.channels();
3620
3621         const Mat* arrays[] = {&src, &dst, 0};
3622         uchar* ptrs[2];
3623         NAryMatIterator it(arrays, ptrs);
3624         int len = (int)it.size;
3625
3626         for( size_t i = 0; i < it.nplanes; i++, ++it )
3627             func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
3628     }
3629 private:
3630     LUTParallelBody(const LUTParallelBody&);
3631     LUTParallelBody& operator=(const LUTParallelBody&);
3632 };
3633
3634 }
3635
3636 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
3637 {
3638     int cn = _src.channels(), depth = _src.depth();
3639     int lutcn = _lut.channels();
3640
3641     CV_Assert( (lutcn == cn || lutcn == 1) &&
3642         _lut.total() == 256 && _lut.isContinuous() &&
3643         (depth == CV_8U || depth == CV_8S) );
3644
3645     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
3646                ocl_LUT(_src, _lut, _dst))
3647
3648     Mat src = _src.getMat(), lut = _lut.getMat();
3649     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
3650     Mat dst = _dst.getMat();
3651
3652     if (_src.dims() <= 2)
3653     {
3654         bool ok = false;
3655         Ptr<ParallelLoopBody> body;
3656 #if defined(HAVE_IPP)
3657         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
3658 #if 0 // there are no performance benefits (PR #2653)
3659         if (lutcn == 1)
3660         {
3661             ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
3662             body.reset(p);
3663         }
3664         else
3665 #endif
3666         if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
3667         {
3668             ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
3669             body.reset(p);
3670         }
3671 #endif
3672         if (body == NULL || ok == false)
3673         {
3674             ok = false;
3675             ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
3676             body.reset(p);
3677         }
3678         if (body != NULL && ok)
3679         {
3680             Range all(0, dst.rows);
3681             if (dst.total()>>18)
3682                 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
3683             else
3684                 (*body)(all);
3685             if (ok)
3686                 return;
3687         }
3688     }
3689
3690     LUTFunc func = lutTab[lut.depth()];
3691     CV_Assert( func != 0 );
3692
3693     const Mat* arrays[] = {&src, &dst, 0};
3694     uchar* ptrs[2];
3695     NAryMatIterator it(arrays, ptrs);
3696     int len = (int)it.size;
3697
3698     for( size_t i = 0; i < it.nplanes; i++, ++it )
3699         func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
3700 }
3701
3702 namespace cv {
3703
3704 #ifdef HAVE_OPENCL
3705
3706 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
3707                            double scale, double delta )
3708 {
3709     UMat src = _src.getUMat();
3710
3711     if( _mask.empty() )
3712         src.convertTo( _dst, dtype, scale, delta );
3713     else if (src.channels() <= 4)
3714     {
3715         const ocl::Device & dev = ocl::Device::getDefault();
3716
3717         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
3718                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
3719                 rowsPerWI = dev.isIntel() ? 4 : 1;
3720
3721         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
3722         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
3723                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
3724                 haveDelta = std::fabs(delta) > DBL_EPSILON,
3725                 doubleSupport = dev.doubleFPConfig() > 0;
3726
3727         if (!haveScale && !haveDelta && stype == dtype)
3728         {
3729             _src.copyTo(_dst, _mask);
3730             return true;
3731         }
3732         if (haveZeroScale)
3733         {
3734             _dst.setTo(Scalar(delta), _mask);
3735             return true;
3736         }
3737
3738         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
3739             return false;
3740
3741         char cvt[2][40];
3742         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
3743                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
3744                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
3745                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
3746                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
3747                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
3748                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
3749                              haveScale ? " -D HAVE_SCALE" : "",
3750                              haveDelta ? " -D HAVE_DELTA" : "",
3751                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
3752
3753         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
3754         if (k.empty())
3755             return false;
3756
3757         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
3758
3759         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
3760                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
3761                 dstarg = ocl::KernelArg::ReadWrite(dst);
3762
3763         if (haveScale)
3764         {
3765             if (haveDelta)
3766                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
3767             else
3768                 k.args(srcarg, maskarg, dstarg, fscale);
3769         }
3770         else
3771         {
3772             if (haveDelta)
3773                 k.args(srcarg, maskarg, dstarg, fdelta);
3774             else
3775                 k.args(srcarg, maskarg, dstarg);
3776         }
3777
3778         size_t globalsize[2] = { src.cols, (src.rows + rowsPerWI - 1) / rowsPerWI };
3779         return k.run(2, globalsize, NULL, false);
3780     }
3781     else
3782     {
3783         UMat temp;
3784         src.convertTo( temp, dtype, scale, delta );
3785         temp.copyTo( _dst, _mask );
3786     }
3787
3788     return true;
3789 }
3790
3791 #endif
3792
3793 }
3794
3795 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
3796                     int norm_type, int rtype, InputArray _mask )
3797 {
3798     double scale = 1, shift = 0;
3799     if( norm_type == CV_MINMAX )
3800     {
3801         double smin = 0, smax = 0;
3802         double dmin = MIN( a, b ), dmax = MAX( a, b );
3803         minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
3804         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
3805         shift = dmin - smin*scale;
3806     }
3807     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
3808     {
3809         scale = norm( _src, norm_type, _mask );
3810         scale = scale > DBL_EPSILON ? a/scale : 0.;
3811         shift = 0;
3812     }
3813     else
3814         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
3815
3816     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
3817     if( rtype < 0 )
3818         rtype = _dst.fixedType() ? _dst.depth() : depth;
3819     _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
3820
3821     CV_OCL_RUN(_dst.isUMat(),
3822                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
3823
3824     Mat src = _src.getMat(), dst = _dst.getMat();
3825     if( _mask.empty() )
3826         src.convertTo( dst, rtype, scale, shift );
3827     else
3828     {
3829         Mat temp;
3830         src.convertTo( temp, rtype, scale, shift );
3831         temp.copyTo( dst, _mask );
3832     }
3833 }
3834
3835 CV_IMPL void
3836 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
3837 {
3838     void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
3839     cv::Mat src = cv::cvarrToMat(srcarr);
3840     int i, j, nz = 0;
3841     for( i = 0; i < 4; i++ )
3842         nz += dptrs[i] != 0;
3843     CV_Assert( nz > 0 );
3844     std::vector<cv::Mat> dvec(nz);
3845     std::vector<int> pairs(nz*2);
3846
3847     for( i = j = 0; i < 4; i++ )
3848     {
3849         if( dptrs[i] != 0 )
3850         {
3851             dvec[j] = cv::cvarrToMat(dptrs[i]);
3852             CV_Assert( dvec[j].size() == src.size() );
3853             CV_Assert( dvec[j].depth() == src.depth() );
3854             CV_Assert( dvec[j].channels() == 1 );
3855             CV_Assert( i < src.channels() );
3856             pairs[j*2] = i;
3857             pairs[j*2+1] = j;
3858             j++;
3859         }
3860     }
3861     if( nz == src.channels() )
3862         cv::split( src, dvec );
3863     else
3864     {
3865         cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
3866     }
3867 }
3868
3869
3870 CV_IMPL void
3871 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
3872          const void* srcarr3, void* dstarr )
3873 {
3874     const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
3875     cv::Mat dst = cv::cvarrToMat(dstarr);
3876     int i, j, nz = 0;
3877     for( i = 0; i < 4; i++ )
3878         nz += sptrs[i] != 0;
3879     CV_Assert( nz > 0 );
3880     std::vector<cv::Mat> svec(nz);
3881     std::vector<int> pairs(nz*2);
3882
3883     for( i = j = 0; i < 4; i++ )
3884     {
3885         if( sptrs[i] != 0 )
3886         {
3887             svec[j] = cv::cvarrToMat(sptrs[i]);
3888             CV_Assert( svec[j].size == dst.size &&
3889                 svec[j].depth() == dst.depth() &&
3890                 svec[j].channels() == 1 && i < dst.channels() );
3891             pairs[j*2] = j;
3892             pairs[j*2+1] = i;
3893             j++;
3894         }
3895     }
3896
3897     if( nz == dst.channels() )
3898         cv::merge( svec, dst );
3899     else
3900     {
3901         cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
3902     }
3903 }
3904
3905
3906 CV_IMPL void
3907 cvMixChannels( const CvArr** src, int src_count,
3908                CvArr** dst, int dst_count,
3909                const int* from_to, int pair_count )
3910 {
3911     cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);
3912
3913     int i;
3914     for( i = 0; i < src_count; i++ )
3915         buf[i] = cv::cvarrToMat(src[i]);
3916     for( i = 0; i < dst_count; i++ )
3917         buf[i+src_count] = cv::cvarrToMat(dst[i]);
3918     cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
3919 }
3920
3921 CV_IMPL void
3922 cvConvertScaleAbs( const void* srcarr, void* dstarr,
3923                    double scale, double shift )
3924 {
3925     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3926     CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
3927     cv::convertScaleAbs( src, dst, scale, shift );
3928 }
3929
3930 CV_IMPL void
3931 cvConvertScale( const void* srcarr, void* dstarr,
3932                 double scale, double shift )
3933 {
3934     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3935
3936     CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
3937     src.convertTo(dst, dst.type(), scale, shift);
3938 }
3939
3940 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
3941 {
3942     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);
3943
3944     CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
3945     cv::LUT( src, lut, dst );
3946 }
3947
3948 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
3949                           double a, double b, int norm_type, const CvArr* maskarr )
3950 {
3951     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
3952     if( maskarr )
3953         mask = cv::cvarrToMat(maskarr);
3954     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
3955     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
3956 }
3957
3958 /* End of file. */