/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_ARITHM_SIMD_HPP__
#define __OPENCV_ARITHM_SIMD_HPP__

namespace cv {

struct NOP {};

#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif
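
// The rest of this header follows one pattern per architecture (AVX2, SSE2, NEON):
// FUNCTOR_TEMPLATE declares a primary template, FUNCTOR_LOADSTORE(_CAST) specializes
// it with load/store wrappers around the native register type, and
// FUNCTOR_CLOSURE_1arg/2arg specialize the element-wise functors (VAdd, VSub, ...)
// that the arithmetic kernels in this module instantiate per element type.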
#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name) \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif

#endif // CV_SSE2 || CV_NEON
#if CV_AVX2

#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template <>                                                                                   \
    struct name<template_arg>{                                                                    \
        typedef register_type reg_type;                                                           \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore256<template_arg>::reg_type operator()(              \
                const VLoadStore256<template_arg>::reg_type & a,       \
                const VLoadStore256<template_arg>::reg_type & b) const \
        {                                                              \
            body;                                                      \
        }                                                              \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore256<template_arg>::reg_type operator()(              \
                const VLoadStore256<template_arg>::reg_type & a,       \
                const VLoadStore256<template_arg>::reg_type & ) const  \
        {                                                              \
            body;                                                      \
        }                                                              \
    }
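
// For illustration (an expansion sketch, not generated code): with the macros above,
// FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps(a, b)) produces roughly
//
//     template <> struct VAdd<float>
//     {
//         VLoadStore256<float>::reg_type operator()(
//                 const VLoadStore256<float>::reg_type & a,
//                 const VLoadStore256<float>::reg_type & b) const
//         { return _mm256_add_ps(a, b); }
//     };
//
// so a kernel can write op(load(p1), load(p2)) uniformly for every element type.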
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); // unsigned min; epi16 would mis-order values above 0x7fff
FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));

static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
                                                           0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
                                                           0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
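
// ANDing with these masks clears the IEEE sign bit: 0x7fffffff per float lane, and
// { 0xffffffff, 0x7fffffff } per double lane (the high 32-bit word of a little-endian
// double holds its sign bit). VAbsDiff uses them below to compute |a - b|.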
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m256i d = _mm256_subs_epi8(a, b);
        __m256i m = _mm256_cmpgt_epi8(b, a);
        return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m256i M = _mm256_max_epi16(a, b);
        __m256i m = _mm256_min_epi16(a, b);
        return _mm256_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m256i d = _mm256_sub_epi32(a, b);
        __m256i m = _mm256_cmpgt_epi32(b, a);
        return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
    );
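
// Notes on the integer absdiff idioms: for unsigned types the saturating subtraction
// clamps at zero, so subs(a, b) + subs(b, a) yields |a - b| with one addend always 0.
// For signed types, m = cmpgt(b, a) is all ones exactly where a - b is negative, and
// (d ^ m) - m is two's-complement negation under that mask, i.e. |a - b|.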
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));

#elif CV_SSE2
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template <>                                                                                   \
    struct name<template_arg>{                                                                    \
        typedef register_type reg_type;                                                           \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                const VLoadStore128<template_arg>::reg_type & a,       \
                const VLoadStore128<template_arg>::reg_type & b) const \
        {                                                              \
            body;                                                      \
        }                                                              \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                const VLoadStore128<template_arg>::reg_type & a,       \
                const VLoadStore128<template_arg>::reg_type & ) const  \
        {                                                              \
            body;                                                      \
        }                                                              \
    }
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);

FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
        __m128i m = _mm_cmpgt_epi8(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int,
        __m128i m = _mm_cmpgt_epi32(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int,
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
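
// SSE2 only provides min/max directly for epu8 and epi16; the other lane types are
// emulated: a ^ ((a ^ b) & (a > b)) selects b where a > b (min; swap the comparison
// operands for max), subs_epu16(a, subs_epu16(a, b)) is the unsigned 16-bit min, and
// adds_epu16(subs_epu16(a, b), b) is the unsigned 16-bit max, all branch-free.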
static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
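
// These mirror the 32-byte-aligned AVX2 masks above at 16-byte alignment; only one of
// the #if CV_AVX2 / #elif CV_SSE2 branches is ever compiled, so the names cannot collide.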
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m128i d = _mm_subs_epi8(a, b);
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_subs_epi8(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m128i M = _mm_max_epi16(a, b);
        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm_and_ps(_mm_sub_ps(a, b), *(const __m128*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a, b), *(const __m128d*)v64f_absmask);
    );
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));

#endif

#if CV_NEON
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)         \
    template <>                                                \
    struct name<template_arg>                                  \
    {                                                          \
        VLoadStore128<template_arg>::reg_type operator()(      \
                VLoadStore128<template_arg>::reg_type a,       \
                VLoadStore128<template_arg>::reg_type b) const \
        {                                                      \
            return body;                                       \
        }                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)         \
    template <>                                                \
    struct name<template_arg>                                  \
    {                                                          \
        VLoadStore128<template_arg>::reg_type operator()(      \
                VLoadStore128<template_arg>::reg_type a,       \
                VLoadStore128<template_arg>::reg_type ) const  \
        {                                                      \
            return body;                                       \
        }                                                      \
    }
FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a));

#endif
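
// A minimal sketch of how a caller consumes this layer (illustrative only; the real
// kernels in this module add alignment handling, strides and multi-row loops):
//
//     template<typename T, class Op>
//     static void binOpSketch(const T * src1, const T * src2, T * dst, int len)
//     {
//         typedef VLoadStore128<T> VLS;
//         Op op;
//         const int step = 16 / (int)sizeof(T);  // elements per 128-bit register
//         int x = 0;
//         for( ; x <= len - step; x += step )
//             VLS::store(dst + x, op(VLS::load(src1 + x), VLS::load(src2 + x)));
//         // the remaining len - x elements are handled by scalar code
//     }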
template <typename T>
struct Cmp_SIMD
{
    explicit Cmp_SIMD(int)
    {
    }

    int operator () (const T *, const T *, uchar *, int) const
    {
        return 0;
    }
};
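
// Every Cmp_SIMD/Mul_SIMD/Div_SIMD/Recip_SIMD/AddWeighted_SIMD functor below follows
// the same contract: operator() returns the number of elements it processed (0 when
// the required instruction set is unavailable), and the caller finishes the tail,
// or the whole row, with scalar code.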
#if CV_NEON

template <>
struct Cmp_SIMD<schar>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdupq_n_u8(255);
    }

    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_LE)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_EQ)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_NE)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));

        return x;
    }

    int code;
    uint8x16_t v_mask;
};
template <>
struct Cmp_SIMD<ushort>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};
template <>
struct Cmp_SIMD<int>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};
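
// vcgtq/vcleq/vceqq produce full-width lane masks (all ones or all zeros); the
// vmovn_u32 -> vcombine_u16 -> vmovn_u16 chain narrows two 32-bit masks per iteration
// down to the 8-bit 0xFF/0x00 values the uchar destination expects.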
template <>
struct Cmp_SIMD<float>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const float * src1, const float * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};

#elif CV_SSE2
template <>
struct Cmp_SIMD<schar>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        haveSSE = checkHardwareSupport(CV_CPU_SSE2);

        v_mask = _mm_set1_epi8(-1);
    }

    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        if (code == CMP_GT)
            for ( ; x <= width - 16; x += 16)
                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
        else if (code == CMP_LE)
            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 16; x += 16)
                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
        else if (code == CMP_NE)
            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
            }

        return x;
    }

    int code;
    __m128i v_mask;
    bool haveSSE;
};
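
// SSE2 has no direct "less-or-equal" or "not-equal" integer comparison, so CMP_LE and
// CMP_NE are computed as the complement of CMP_GT and CMP_EQ: xoring the mask bytes
// with v_mask (all ones) flips every lane.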
template <>
struct Cmp_SIMD<int>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        haveSSE = checkHardwareSupport(CV_CPU_SSE2);

        v_mask = _mm_set1_epi32(0xffffffff);
    }

    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
            }

        return x;
    }

    int code;
    __m128i v_mask;
    bool haveSSE;
};

#endif
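
// _mm_packs_epi32/_mm_packs_epi16 use signed saturation, which maps the 32-bit mask
// values 0 and -1 to the byte values 0x00 and 0xFF unchanged; the v_mask operand in
// the second pack merely fills the upper half, which _mm_storel_epi64 discards.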
template <typename T, typename WT>
struct Mul_SIMD
{
    int operator() (const T *, const T *, T *, int, WT) const
    {
        return 0;
    }
};
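
// WT is the intermediate "working type" (float for the specializations below):
// sources are widened, multiplied and optionally scaled in floating point, then
// rounded and saturated back to T.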
#if CV_NEON

template <>
struct Mul_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1_u8(dst + x, vqmovn_u16(v_dst));
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1_u8(dst + x, vqmovn_u16(v_dst));
            }
        }

        return x;
    }
};
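
// cv_vrndq_u32_f32/cv_vrndq_s32_f32 (round-to-nearest float-to-int helpers defined
// elsewhere in the core module) feed vqmovn's saturating narrow, reproducing the
// saturate_cast<> behaviour of the scalar path.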
template <>
struct Mul_SIMD<schar, float>
{
    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1_s8(dst + x, vqmovn_s16(v_dst));
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1_s8(dst + x, vqmovn_s16(v_dst));
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1q_u16(dst + x, v_dst);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1q_u16(dst + x, v_dst);
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<short, float>
{
    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1q_s16(dst + x, v_dst);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1q_s16(dst + x, v_dst);
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                v_dst1 = vmulq_f32(v_dst1, v_scale);

                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        }

        return x;
    }
};

#elif CV_SSE2

#if CV_SSE4_1
template <>
struct Mul_SIMD<ushort, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale != 1.0f )
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
                                           _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
                                           _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
            }
        }

        return x;
    }

    bool haveSSE;
};

#endif
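
// _mm_packus_epi32 (unsigned saturating pack) only exists from SSE4.1 on, hence the
// runtime CV_CPU_SSE4_1 check; the scale == 1 case simply falls through to scalar code.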
template <>
struct Mul_SIMD<schar, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));

                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
            }
        else
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));

                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
            }
        }

        return x;
    }

    bool haveSSE;
};
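
// Pre-SSE4.1 sign-extension idiom used above: unpacking with zero places the source
// bytes in the high half of each wider lane (_mm_unpacklo_epi8(v_zero, v_src)), and an
// arithmetic right shift brings them back down with the sign preserved, first 8->16
// bit and then 16->32 bit.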
template <>
struct Mul_SIMD<short, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale != 1.0f )
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
            }
        }

        return x;
    }

    bool haveSSE;
};

#endif
template <typename T>
struct Div_SIMD
{
    int operator() (const T *, const T *, T *, int, double) const
    {
        return 0;
    }
};

template <typename T>
struct Recip_SIMD
{
    int operator() (const T *, T *, int, double) const
    {
        return 0;
    }
};
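
// The divide/reciprocal specializations below are written with OpenCV's universal
// intrinsics (v_float32x4, v_load_expand, v_select, ...), so a single implementation
// serves both SSE2 and NEON; haveSIMD gates them at run time.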
#if CV_SIMD128

template <>
struct Div_SIMD<uchar>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src1 = v_load_expand(src1 + x);
            v_uint16x8 v_src2 = v_load_expand(src2 + x);

            v_uint32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
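
// OpenCV defines x / 0 == 0 for these element types; v_select substitutes zero
// wherever the divisor lane is zero, keeping the loop branch-free.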
template <>
struct Div_SIMD<schar>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src1 = v_load_expand(src1 + x);
            v_int16x8 v_src2 = v_load_expand(src2 + x);

            v_int32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<ushort>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src1 = v_load(src1 + x);
            v_uint16x8 v_src2 = v_load(src2 + x);

            v_uint32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<short>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src1 = v_load(src1 + x);
            v_int16x8 v_src2 = v_load(src2 + x);

            v_int32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<int>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int32x4 v_zero = v_setzero_s32();

        for ( ; x <= width - 8; x += 8)
        {
            v_int32x4 t0 = v_load(src1 + x);
            v_int32x4 t1 = v_load(src1 + x + 4);
            v_int32x4 t2 = v_load(src2 + x);
            v_int32x4 t3 = v_load(src2 + x + 4);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);
            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);

            res0 = v_select(t2 == v_zero, v_zero, res0);
            res1 = v_select(t3 == v_zero, v_zero, res1);
            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
template <>
struct Div_SIMD<float>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_float32x4 v_zero = v_setzero_f32();

        for ( ; x <= width - 8; x += 8)
        {
            v_float32x4 f0 = v_load(src1 + x);
            v_float32x4 f1 = v_load(src1 + x + 4);
            v_float32x4 f2 = v_load(src2 + x);
            v_float32x4 f3 = v_load(src2 + x + 4);

            v_float32x4 res0 = f0 * v_scale / f2;
            v_float32x4 res1 = f1 * v_scale / f3;

            res0 = v_select(f2 == v_zero, v_zero, res0);
            res1 = v_select(f3 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
///////////////////////// RECIPROCAL //////////////////////

template <>
struct Recip_SIMD<uchar>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const uchar * src2, uchar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src2 = v_load_expand(src2 + x);

            v_uint32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<schar>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const schar * src2, schar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src2 = v_load_expand(src2 + x);

            v_int32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<ushort>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const ushort * src2, ushort * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src2 = v_load(src2 + x);

            v_uint32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<short>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const short * src2, short * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src2 = v_load(src2 + x);

            v_int32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<int>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const int * src2, int * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int32x4 v_zero = v_setzero_s32();

        for ( ; x <= width - 8; x += 8)
        {
            v_int32x4 t0 = v_load(src2 + x);
            v_int32x4 t1 = v_load(src2 + x + 4);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);

            res0 = v_select(t0 == v_zero, v_zero, res0);
            res1 = v_select(t1 == v_zero, v_zero, res1);
            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<float>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const float * src2, float * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_float32x4 v_zero = v_setzero_f32();

        for ( ; x <= width - 8; x += 8)
        {
            v_float32x4 f0 = v_load(src2 + x);
            v_float32x4 f1 = v_load(src2 + x + 4);

            v_float32x4 res0 = v_scale / f0;
            v_float32x4 res1 = v_scale / f1;

            res0 = v_select(f0 == v_zero, v_zero, res0);
            res1 = v_select(f1 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct Div_SIMD<double>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float64x2 v_scale = v_setall_f64(scale);
        v_float64x2 v_zero = v_setzero_f64();

        for ( ; x <= width - 4; x += 4)
        {
            v_float64x2 f0 = v_load(src1 + x);
            v_float64x2 f1 = v_load(src1 + x + 2);
            v_float64x2 f2 = v_load(src2 + x);
            v_float64x2 f3 = v_load(src2 + x + 2);

            v_float64x2 res0 = f0 * v_scale / f2;
            v_float64x2 res1 = f1 * v_scale / f3;

            // zero the result where the divisor (f2/f3, not the numerator) is zero
            res0 = v_select(f2 == v_zero, v_zero, res0);
            res1 = v_select(f3 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 2, res1);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<double>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const double * src2, double * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float64x2 v_scale = v_setall_f64(scale);
        v_float64x2 v_zero = v_setzero_f64();

        for ( ; x <= width - 4; x += 4)
        {
            v_float64x2 f0 = v_load(src2 + x);
            v_float64x2 f1 = v_load(src2 + x + 2);

            v_float64x2 res0 = v_scale / f0;
            v_float64x2 res1 = v_scale / f1;

            res0 = v_select(f0 == v_zero, v_zero, res0);
            res1 = v_select(f1 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 2, res1);
        }

        return x;
    }
};

#endif // CV_SIMD128_64F

#endif // CV_SIMD128
template <typename T, typename WT>
struct AddWeighted_SIMD
{
    int operator() (const T *, const T *, T *, int, WT, WT, WT) const
    {
        return 0;
    }
};
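
// AddWeighted computes dst = saturate(src1 * alpha + src2 * beta + gamma), again in
// float working precision.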
#if CV_SSE2

template <>
struct AddWeighted_SIMD<schar, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));

            __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
            __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                              _mm_cvtps_epi32(v_dstf1));

            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
        }

        return x;
    }

    bool haveSSE2;
};
template <>
struct AddWeighted_SIMD<short, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                   _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE2;
};

#if CV_SSE4_1
template <>
struct AddWeighted_SIMD<ushort, float>
{
    AddWeighted_SIMD()
    {
        haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE4_1)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                    _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE4_1;
};

#endif

#elif CV_NEON
template <>
struct AddWeighted_SIMD<schar, float>
{
    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int8x8_t in1 = vld1_s8(src1 + x);
            int16x8_t in1_16 = vmovl_s8(in1);
            float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
            float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));

            int8x8_t in2 = vld1_s8(src2 + x);
            int16x8_t in2_16 = vmovl_s8(in2);
            float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
            float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));

            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
            out_f_l = vaddq_f32(out_f_l, g);
            out_f_h = vaddq_f32(out_f_h, g);

            int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
            int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));

            int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
            int8x8_t out = vqmovn_s16(out_16);

            vst1_s8(dst + x, out);
        }

        return x;
    }
};
template <>
struct AddWeighted_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
            uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
            uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
        }

        return x;
    }
};
template <>
struct AddWeighted_SIMD<short, float>
{
    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
            int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
            int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
        }

        return x;
    }
};

#endif

}

#endif // __OPENCV_ARITHM_SIMD_HPP__