modules/hal/src/arithm_simd.hpp  (platform/upstream/opencv.git)
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__
46 #define __OPENCV_HAL_ARITHM_SIMD_HPP__
47
48 namespace cv { namespace hal {
49
50 struct NOP {};
51
52 #if CV_SSE2 || CV_NEON
53 #define IF_SIMD(op) op
54 #else
55 #define IF_SIMD(op) NOP
56 #endif
57
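// NOP is what IF_SIMD substitutes when no vector ISA is enabled, so callers can name
// the functors below unconditionally and simply get the scalar fallback.
//
// Illustrative sketch only (vBinOpSketch is a hypothetical helper, not part of this
// header): a caller is expected to drive the load/store and arithmetic functors in a
// strip-mined loop roughly like this, finishing the remaining tail with scalar code:
//
//     template<typename T, typename VLoadStore, typename VOp>
//     int vBinOpSketch(const T* src1, const T* src2, T* dst, int width)
//     {
//         const int step = (int)(sizeof(typename VLoadStore::reg_type) / sizeof(T));
//         VOp op;
//         int x = 0;
//         for( ; x <= width - step; x += step )
//             VLoadStore::store(dst + x, op(VLoadStore::load(src1 + x),
//                                           VLoadStore::load(src2 + x)));
//         return x; // number of elements processed; caller handles the rest
//     }
//
// e.g. vBinOpSketch<uchar, VLoadStore128<uchar>, VAdd<uchar> >(a, b, c, n)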
58
59 #if CV_SSE2 || CV_NEON
60
61 #define FUNCTOR_TEMPLATE(name)          \
62     template<typename T> struct name {}
63
64 FUNCTOR_TEMPLATE(VLoadStore128);
65 #if CV_SSE2
66 FUNCTOR_TEMPLATE(VLoadStore64);
67 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
68 #if CV_AVX2
69 FUNCTOR_TEMPLATE(VLoadStore256);
70 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
71 #endif
72 #endif
73
74 #endif
75
76 #if CV_AVX2
77
78 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
79     template <>                                                                                  \
80     struct name<template_arg>{                                                                   \
81         typedef register_type reg_type;                                                          \
82         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
83         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
84     }
85
86 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
87     template <>                                                                     \
88     struct name<template_arg>{                                                      \
89         typedef register_type reg_type;                                             \
90         static reg_type load(const template_arg * p) { return load_body (p); }      \
91         static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
92     }
93
94 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
95     template<>                                                                 \
96     struct name<template_arg>                                                  \
97     {                                                                          \
98         VLoadStore256<template_arg>::reg_type operator()(                      \
99                         const VLoadStore256<template_arg>::reg_type & a,       \
100                         const VLoadStore256<template_arg>::reg_type & b) const \
101         {                                                                      \
102             body;                                                              \
103         }                                                                      \
104     }
105
106 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
107     template<>                                                                 \
108     struct name<template_arg>                                                  \
109     {                                                                          \
110         VLoadStore256<template_arg>::reg_type operator()(                      \
111                         const VLoadStore256<template_arg>::reg_type & a,       \
112                         const VLoadStore256<template_arg>::reg_type &  ) const \
113         {                                                                      \
114             body;                                                              \
115         }                                                                      \
116     }
117
118 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
119 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
120 FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
121 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
122 FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
123 FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
124 FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );
125
126 FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
127 FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
128 FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );
129
130 FUNCTOR_TEMPLATE(VAdd);
131 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
132 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
133 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
134 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
135 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
136 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
137 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));
138
139 FUNCTOR_TEMPLATE(VSub);
140 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
141 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
142 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
143 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
144 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
145 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
146 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
147
148 FUNCTOR_TEMPLATE(VMin);
149 FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
150 FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
151 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b));
152 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
153 FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
154 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
155 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
156
157 FUNCTOR_TEMPLATE(VMax);
158 FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
159 FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
160 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
161 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
162 FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
163 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
164 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));
165
166
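// Bit masks for float/double absolute difference: AND-ing with them clears the
// IEEE-754 sign bit of each lane. Doubles are stored little-endian, which is why
// v64f_absmask alternates an all-ones low word with 0x7fffffff in the high word.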
167 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
168                                                            0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
169 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
170                                                            0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
171
172 FUNCTOR_TEMPLATE(VAbsDiff);
173 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
174         return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
175     );
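// For the signed cases below there is no unsigned saturating subtract, so |a - b| is
// built from a difference d and a compare mask m = (b > a): when m is all ones,
// (d ^ m) - m negates d (two's complement), giving the absolute difference.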
176 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
177         __m256i d = _mm256_subs_epi8(a, b);
178         __m256i m = _mm256_cmpgt_epi8(b, a);
179         return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
180     );
181 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
182         return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
183     );
184 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
185         __m256i M = _mm256_max_epi16(a, b);
186         __m256i m = _mm256_min_epi16(a, b);
187         return _mm256_subs_epi16(M, m);
188     );
189 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
190         __m256i d = _mm256_sub_epi32(a, b);
191         __m256i m = _mm256_cmpgt_epi32(b, a);
192         return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
193     );
194 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
195         return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
196     );
197 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
198         return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
199     );
200
201 FUNCTOR_TEMPLATE(VAnd);
202 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
203 FUNCTOR_TEMPLATE(VOr);
204 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
205 FUNCTOR_TEMPLATE(VXor);
206 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
207 FUNCTOR_TEMPLATE(VNot);
208 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
209
210 #elif CV_SSE2
211
212 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
213     template <>                                                                                  \
214     struct name<template_arg>{                                                                   \
215         typedef register_type reg_type;                                                          \
216         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
217         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
218     }
219
220 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
221     template <>                                                                \
222     struct name<template_arg>{                                                 \
223         typedef register_type reg_type;                                        \
224         static reg_type load(const template_arg * p) { return load_body (p); } \
225         static void store(template_arg * p, reg_type v) { store_body (p, v); } \
226     }
227
228 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
229     template<>                                                                 \
230     struct name<template_arg>                                                  \
231     {                                                                          \
232         VLoadStore128<template_arg>::reg_type operator()(                      \
233                         const VLoadStore128<template_arg>::reg_type & a,       \
234                         const VLoadStore128<template_arg>::reg_type & b) const \
235         {                                                                      \
236             body;                                                              \
237         }                                                                      \
238     }
239
240 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
241     template<>                                                                 \
242     struct name<template_arg>                                                  \
243     {                                                                          \
244         VLoadStore128<template_arg>::reg_type operator()(                      \
245                         const VLoadStore128<template_arg>::reg_type & a,       \
246                         const VLoadStore128<template_arg>::reg_type &  ) const \
247         {                                                                      \
248             body;                                                              \
249         }                                                                      \
250     }
251
252 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
253 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
254 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
255 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
256 FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
257 FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
258 FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );
259
260 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
261 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
262 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
263 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
264
265 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
266 FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
267 FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );
268
269 FUNCTOR_TEMPLATE(VAdd);
270 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
271 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
272 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
273 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
274 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
275 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
276 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));
277
278 FUNCTOR_TEMPLATE(VSub);
279 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
280 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
281 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
282 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
283 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
284 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
285 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));
286
287 FUNCTOR_TEMPLATE(VMin);
288 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
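// SSE2 lacks _mm_min/max_epi8, _mm_min/max_epu16 and _mm_min/max_epi32 (they appeared
// with SSE4.1), so those cases are emulated: signed 8/32-bit via a compare mask and the
// blend identity a ^ ((a ^ b) & m), unsigned 16-bit via saturating subtraction/addition.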
289 FUNCTOR_CLOSURE_2arg(VMin, schar,
290         __m128i m = _mm_cmpgt_epi8(a, b);
291         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
292     );
293 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
294 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm_min_epi16(a, b));
295 FUNCTOR_CLOSURE_2arg(VMin,    int,
296         __m128i m = _mm_cmpgt_epi32(a, b);
297         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
298     );
299 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm_min_ps(a, b));
300 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
301
302 FUNCTOR_TEMPLATE(VMax);
303 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
304 FUNCTOR_CLOSURE_2arg(VMax, schar,
305         __m128i m = _mm_cmpgt_epi8(b, a);
306         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
307     );
308 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
309 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm_max_epi16(a, b));
310 FUNCTOR_CLOSURE_2arg(VMax,    int,
311         __m128i m = _mm_cmpgt_epi32(b, a);
312         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
313     );
314 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm_max_ps(a, b));
315 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
316
317
318 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
319 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
320
321 FUNCTOR_TEMPLATE(VAbsDiff);
322 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
323         return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
324     );
325 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
326         __m128i d = _mm_subs_epi8(a, b);
327         __m128i m = _mm_cmpgt_epi8(b, a);
328         return _mm_subs_epi8(_mm_xor_si128(d, m), m);
329     );
330 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
331         return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
332     );
333 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
334         __m128i M = _mm_max_epi16(a, b);
335         __m128i m = _mm_min_epi16(a, b);
336         return _mm_subs_epi16(M, m);
337     );
338 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
339         __m128i d = _mm_sub_epi32(a, b);
340         __m128i m = _mm_cmpgt_epi32(b, a);
341         return _mm_sub_epi32(_mm_xor_si128(d, m), m);
342     );
343 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
344         return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
345     );
346 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
347         return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
348     );
349
350 FUNCTOR_TEMPLATE(VAnd);
351 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
352 FUNCTOR_TEMPLATE(VOr);
353 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
354 FUNCTOR_TEMPLATE(VXor);
355 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
356 FUNCTOR_TEMPLATE(VNot);
357 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
358 #endif
359
360 #if CV_NEON
361
362 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
363     template <>                                                                \
364     struct name<template_arg>{                                                 \
365         typedef register_type reg_type;                                        \
366         static reg_type load(const template_arg * p) { return load_body (p);}; \
367         static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
368     }
369
370 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
371     template<>                                                         \
372     struct name<template_arg>                                          \
373     {                                                                  \
374         VLoadStore128<template_arg>::reg_type operator()(              \
375                         VLoadStore128<template_arg>::reg_type a,       \
376                         VLoadStore128<template_arg>::reg_type b) const \
377         {                                                              \
378             return body;                                               \
379         };                                                             \
380     }
381
382 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
383     template<>                                                         \
384     struct name<template_arg>                                          \
385     {                                                                  \
386         VLoadStore128<template_arg>::reg_type operator()(              \
387                         VLoadStore128<template_arg>::reg_type a,       \
388                         VLoadStore128<template_arg>::reg_type  ) const \
389         {                                                              \
390             return body;                                               \
391         };                                                             \
392     }
393
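// Note: unlike the SSE paths there are no double or "aligned" load/store functors here;
// 32-bit ARM NEON has no double-precision vectors, so double stays on the scalar path.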
394 FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
395 FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
396 FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
397 FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
398 FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
399 FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);
400
401 FUNCTOR_TEMPLATE(VAdd);
402 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
403 FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
404 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
405 FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
406 FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
407 FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));
408
409 FUNCTOR_TEMPLATE(VSub);
410 FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
411 FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
412 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
413 FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
414 FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
415 FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));
416
417 FUNCTOR_TEMPLATE(VMin);
418 FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
419 FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
420 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
421 FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
422 FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
423 FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));
424
425 FUNCTOR_TEMPLATE(VMax);
426 FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
427 FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
428 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
429 FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
430 FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
431 FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));
432
433 FUNCTOR_TEMPLATE(VAbsDiff);
434 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
435 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
436 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
437 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
438 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
439 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
440
441 FUNCTOR_TEMPLATE(VAnd);
442 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
443 FUNCTOR_TEMPLATE(VOr);
444 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
445 FUNCTOR_TEMPLATE(VXor);
446 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
447 FUNCTOR_TEMPLATE(VNot);
448 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
449 #endif
450
451
452 template <typename T>
453 struct Cmp_SIMD
454 {
455     explicit Cmp_SIMD(int)
456     {
457     }
458
459     int operator () (const T *, const T *, uchar *, int) const
460     {
461         return 0;
462     }
463 };
464
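// The Cmp_SIMD<T> specializations below vectorize only the CMP_GT, CMP_LE, CMP_EQ and
// CMP_NE codes (see the commented-out assertions); other codes, and any tail elements,
// are left to the scalar caller. operator() returns the number of elements processed.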
465 #if CV_NEON
466
467 template <>
468 struct Cmp_SIMD<schar>
469 {
470     explicit Cmp_SIMD(int code_) :
471         code(code_)
472     {
473         // CV_Assert(code == CMP_GT || code == CMP_LE ||
474         //           code == CMP_EQ || code == CMP_NE);
475
476         v_mask = vdupq_n_u8(255);
477     }
478
479     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
480     {
481         int x = 0;
482
483         if (code == CMP_GT)
484             for ( ; x <= width - 16; x += 16)
485                 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
486         else if (code == CMP_LE)
487             for ( ; x <= width - 16; x += 16)
488                 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
489         else if (code == CMP_EQ)
490             for ( ; x <= width - 16; x += 16)
491                 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
492         else if (code == CMP_NE)
493             for ( ; x <= width - 16; x += 16)
494                 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
495
496         return x;
497     }
498
499     int code;
500     uint8x16_t v_mask;
501 };
502
503 template <>
504 struct Cmp_SIMD<ushort>
505 {
506     explicit Cmp_SIMD(int code_) :
507         code(code_)
508     {
509         // CV_Assert(code == CMP_GT || code == CMP_LE ||
510         //           code == CMP_EQ || code == CMP_NE);
511
512         v_mask = vdup_n_u8(255);
513     }
514
515     int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
516     {
517         int x = 0;
518
519         if (code == CMP_GT)
520             for ( ; x <= width - 8; x += 8)
521             {
522                 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
523                 vst1_u8(dst + x, vmovn_u16(v_dst));
524             }
525         else if (code == CMP_LE)
526             for ( ; x <= width - 8; x += 8)
527             {
528                 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
529                 vst1_u8(dst + x, vmovn_u16(v_dst));
530             }
531         else if (code == CMP_EQ)
532             for ( ; x <= width - 8; x += 8)
533             {
534                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
535                 vst1_u8(dst + x, vmovn_u16(v_dst));
536             }
537         else if (code == CMP_NE)
538             for ( ; x <= width - 8; x += 8)
539             {
540                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
541                 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
542             }
543
544         return x;
545     }
546
547     int code;
548     uint8x8_t v_mask;
549 };
550
551 template <>
552 struct Cmp_SIMD<int>
553 {
554     explicit Cmp_SIMD(int code_) :
555         code(code_)
556     {
557         // CV_Assert(code == CMP_GT || code == CMP_LE ||
558         //           code == CMP_EQ || code == CMP_NE);
559
560         v_mask = vdup_n_u8(255);
561     }
562
563     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
564     {
565         int x = 0;
566
567         if (code == CMP_GT)
568             for ( ; x <= width - 8; x += 8)
569             {
570                 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
571                 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
572                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
573             }
574         else if (code == CMP_LE)
575             for ( ; x <= width - 8; x += 8)
576             {
577                 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
578                 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
579                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
580             }
581         else if (code == CMP_EQ)
582             for ( ; x <= width - 8; x += 8)
583             {
584                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
585                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
586                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
587             }
588         else if (code == CMP_NE)
589             for ( ; x <= width - 8; x += 8)
590             {
591                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
592                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
593                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
594                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
595             }
596
597         return x;
598     }
599
600     int code;
601     uint8x8_t v_mask;
602 };
603
604 template <>
605 struct Cmp_SIMD<float>
606 {
607     explicit Cmp_SIMD(int code_) :
608         code(code_)
609     {
610         // CV_Assert(code == CMP_GT || code == CMP_LE ||
611         //           code == CMP_EQ || code == CMP_NE);
612
613         v_mask = vdup_n_u8(255);
614     }
615
616     int operator () (const float * src1, const float * src2, uchar * dst, int width) const
617     {
618         int x = 0;
619
620         if (code == CMP_GT)
621             for ( ; x <= width - 8; x += 8)
622             {
623                 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
624                 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
625                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
626             }
627         else if (code == CMP_LE)
628             for ( ; x <= width - 8; x += 8)
629             {
630                 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
631                 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
632                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
633             }
634         else if (code == CMP_EQ)
635             for ( ; x <= width - 8; x += 8)
636             {
637                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
638                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
639                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
640             }
641         else if (code == CMP_NE)
642             for ( ; x <= width - 8; x += 8)
643             {
644                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
645                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
646                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
647                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
648             }
649
650         return x;
651     }
652
653     int code;
654     uint8x8_t v_mask;
655 };
656
657 #elif CV_SSE2
658
659 template <>
660 struct Cmp_SIMD<schar>
661 {
662     explicit Cmp_SIMD(int code_) :
663         code(code_)
664     {
665         // CV_Assert(code == CMP_GT || code == CMP_LE ||
666         //           code == CMP_EQ || code == CMP_NE);
667
668         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
669
670         v_mask = _mm_set1_epi8(-1);
671     }
672
673     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
674     {
675         int x = 0;
676
677         if (!haveSSE)
678             return x;
679
680         if (code == CMP_GT)
681             for ( ; x <= width - 16; x += 16)
682                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
683                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
684         else if (code == CMP_LE)
685             for ( ; x <= width - 16; x += 16)
686             {
687                 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
688                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
689                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
690             }
691         else if (code == CMP_EQ)
692             for ( ; x <= width - 16; x += 16)
693                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
694                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
695         else if (code == CMP_NE)
696             for ( ; x <= width - 16; x += 16)
697             {
698                 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
699                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
700                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
701             }
702
703         return x;
704     }
705
706     int code;
707     __m128i v_mask;
708     bool haveSSE;
709 };
710
711 template <>
712 struct Cmp_SIMD<int>
713 {
714     explicit Cmp_SIMD(int code_) :
715         code(code_)
716     {
717         // CV_Assert(code == CMP_GT || code == CMP_LE ||
718         //           code == CMP_EQ || code == CMP_NE);
719
720         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
721
722         v_mask = _mm_set1_epi32(0xffffffff);
723     }
724
725     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
726     {
727         int x = 0;
728
729         if (!haveSSE)
730             return x;
731
732         if (code == CMP_GT)
733             for ( ; x <= width - 8; x += 8)
734             {
735                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
736                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
737                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
738                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
739
740                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
741             }
742         else if (code == CMP_LE)
743             for ( ; x <= width - 8; x += 8)
744             {
745                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
746                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
747                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
748                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
749
750                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
751             }
752         else if (code == CMP_EQ)
753             for ( ; x <= width - 8; x += 8)
754             {
755                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
756                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
757                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
758                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
759
760                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
761             }
762         else if (code == CMP_NE)
763             for ( ; x <= width - 8; x += 8)
764             {
765                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
766                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
767                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
768                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
769
770                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
771             }
772
773         return x;
774     }
775
776     int code;
777     __m128i v_mask;
778     bool haveSSE;
779 };
780
781 #endif
782
783
784 template <typename T, typename WT>
785 struct Mul_SIMD
786 {
787     int operator() (const T *, const T *, T *, int, WT) const
788     {
789         return 0;
790     }
791 };
792
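// Mul_SIMD<T, WT> computes roughly dst = saturate_cast<T>(src1 * src2 * scale) using the
// wider work type WT (float here) with rounding and saturating narrowing on store. Like
// the other kernels it returns how many elements it handled; the caller does the rest.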
793 #if CV_NEON
794
795 template <>
796 struct Mul_SIMD<uchar, float>
797 {
798     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
799     {
800         int x = 0;
801
802         if( scale == 1.0f )
803             for ( ; x <= width - 8; x += 8)
804             {
805                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
806                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
807
808                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
809                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
810                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
811                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
812
813                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
814                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
815                 vst1_u8(dst + x, vqmovn_u16(v_dst));
816             }
817         else
818         {
819             float32x4_t v_scale = vdupq_n_f32(scale);
820             for ( ; x <= width - 8; x += 8)
821             {
822                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
823                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
824
825                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
826                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
827                 v_dst1 = vmulq_f32(v_dst1, v_scale);
828                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
829                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
830                 v_dst2 = vmulq_f32(v_dst2, v_scale);
831
832                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
833                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
834                 vst1_u8(dst + x, vqmovn_u16(v_dst));
835             }
836         }
837
838         return x;
839     }
840 };
841
842 template <>
843 struct Mul_SIMD<schar, float>
844 {
845     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
846     {
847         int x = 0;
848
849         if( scale == 1.0f )
850             for ( ; x <= width - 8; x += 8)
851             {
852                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
853                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
854
855                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
856                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
857                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
858                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
859
860                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
861                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
862                 vst1_s8(dst + x, vqmovn_s16(v_dst));
863             }
864         else
865         {
866             float32x4_t v_scale = vdupq_n_f32(scale);
867             for ( ; x <= width - 8; x += 8)
868             {
869                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
870                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
871
872                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
873                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
874                 v_dst1 = vmulq_f32(v_dst1, v_scale);
875                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
876                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
877                 v_dst2 = vmulq_f32(v_dst2, v_scale);
878
879                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
880                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
881                 vst1_s8(dst + x, vqmovn_s16(v_dst));
882             }
883         }
884
885         return x;
886     }
887 };
888
889 template <>
890 struct Mul_SIMD<ushort, float>
891 {
892     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
893     {
894         int x = 0;
895
896         if( scale == 1.0f )
897             for ( ; x <= width - 8; x += 8)
898             {
899                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
900
901                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
902                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
903                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
904                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
905
906                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
907                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
908                 vst1q_u16(dst + x, v_dst);
909             }
910         else
911         {
912             float32x4_t v_scale = vdupq_n_f32(scale);
913             for ( ; x <= width - 8; x += 8)
914             {
915                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
916
917                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
918                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
919                 v_dst1 = vmulq_f32(v_dst1, v_scale);
920                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
921                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
922                 v_dst2 = vmulq_f32(v_dst2, v_scale);
923
924                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
925                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
926                 vst1q_u16(dst + x, v_dst);
927             }
928         }
929
930         return x;
931     }
932 };
933
934 template <>
935 struct Mul_SIMD<short, float>
936 {
937     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
938     {
939         int x = 0;
940
941         if( scale == 1.0f )
942             for ( ; x <= width - 8; x += 8)
943             {
944                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
945
946                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
947                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
948                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
949                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
950
951                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
952                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
953                 vst1q_s16(dst + x, v_dst);
954             }
955         else
956         {
957             float32x4_t v_scale = vdupq_n_f32(scale);
958             for ( ; x <= width - 8; x += 8)
959             {
960                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
961
962                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
963                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
964                 v_dst1 = vmulq_f32(v_dst1, v_scale);
965                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
966                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
967                 v_dst2 = vmulq_f32(v_dst2, v_scale);
968
969                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
970                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
971                 vst1q_s16(dst + x, v_dst);
972             }
973         }
974
975         return x;
976     }
977 };
978
979 template <>
980 struct Mul_SIMD<float, float>
981 {
982     int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
983     {
984         int x = 0;
985
986         if( scale == 1.0f )
987             for ( ; x <= width - 8; x += 8)
988             {
989                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
990                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
991                 vst1q_f32(dst + x, v_dst1);
992                 vst1q_f32(dst + x + 4, v_dst2);
993             }
994         else
995         {
996             float32x4_t v_scale = vdupq_n_f32(scale);
997             for ( ; x <= width - 8; x += 8)
998             {
999                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
1000                 v_dst1 = vmulq_f32(v_dst1, v_scale);
1001
1002                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
1003                 v_dst2 = vmulq_f32(v_dst2, v_scale);
1004
1005                 vst1q_f32(dst + x, v_dst1);
1006                 vst1q_f32(dst + x + 4, v_dst2);
1007             }
1008         }
1009
1010         return x;
1011     }
1012 };
1013
1014 #elif CV_SSE2
1015
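// The ushort specialization needs _mm_packus_epi32 (signed 32 -> unsigned 16 saturating
// pack), which is an SSE4.1 instruction, hence the extra guard; plain SSE2 builds fall
// back to the scalar loop for ushort.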
1016 #if CV_SSE4_1
1017
1018 template <>
1019 struct Mul_SIMD<ushort, float>
1020 {
1021     Mul_SIMD()
1022     {
1023         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
1024     }
1025
1026     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
1027     {
1028         int x = 0;
1029
1030         if (!haveSSE)
1031             return x;
1032
1033         __m128i v_zero = _mm_setzero_si128();
1034
1035         if( scale != 1.0f )
1036         {
1037             __m128 v_scale = _mm_set1_ps(scale);
1038             for ( ; x <= width - 8; x += 8)
1039             {
1040                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1041                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1042
1043                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
1044                                            _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
1045                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1046
1047                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
1048                                            _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
1049                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1050
1051                 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1052                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1053             }
1054         }
1055
1056         return x;
1057     }
1058
1059     bool haveSSE;
1060 };
1061
1062 #endif
1063
1064 template <>
1065 struct Mul_SIMD<schar, float>
1066 {
1067     Mul_SIMD()
1068     {
1069         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1070     }
1071
1072     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
1073     {
1074         int x = 0;
1075
1076         if (!haveSSE)
1077             return x;
1078
1079         __m128i v_zero = _mm_setzero_si128();
1080
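        // Sign-extension trick: unpack the 8-bit lanes into the high byte of 16-bit lanes
        // and arithmetic-shift right by 8 (then again for 16 -> 32 bits), which yields
        // properly sign-extended values without SSE4.1's _mm_cvtepi8_epi32.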
1081         if( scale == 1.0f )
1082             for ( ; x <= width - 8; x += 8)
1083             {
1084                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1085                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1086
1087                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1088                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1089
1090                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1091                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1092
1093                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1094                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1095
1096                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1097                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1098             }
1099         else
1100         {
1101             __m128 v_scale = _mm_set1_ps(scale);
1102             for ( ; x <= width - 8; x += 8)
1103             {
1104                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1105                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1106
1107                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1108                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1109
1110                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1111                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1112                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1113
1114                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1115                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1116                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1117
1118                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1119                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1120             }
1121         }
1122
1123         return x;
1124     }
1125
1126     bool haveSSE;
1127 };
1128
1129 template <>
1130 struct Mul_SIMD<short, float>
1131 {
1132     Mul_SIMD()
1133     {
1134         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1135     }
1136
1137     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
1138     {
1139         int x = 0;
1140
1141         if (!haveSSE)
1142             return x;
1143
1144         __m128i v_zero = _mm_setzero_si128();
1145
1146         if( scale != 1.0f )
1147         {
1148             __m128 v_scale = _mm_set1_ps(scale);
1149             for ( ; x <= width - 8; x += 8)
1150             {
1151                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1152                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1153
1154                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1155                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1156                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1157
1158                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1159                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1160                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1161
1162                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1163                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1164             }
1165         }
1166
1167         return x;
1168     }
1169
1170     bool haveSSE;
1171 };
1172
1173 #endif
1174
1175 template <typename T>
1176 struct Div_SIMD
1177 {
1178     int operator() (const T *, const T *, T *, int, double) const
1179     {
1180         return 0;
1181     }
1182 };
1183
1184 template <typename T>
1185 struct Recip_SIMD
1186 {
1187     int operator() (const T *, T *, int, double) const
1188     {
1189         return 0;
1190     }
1191 };
1192
1193
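// The Div_SIMD/Recip_SIMD specializations below use OpenCV's universal intrinsics
// (CV_SIMD128) instead of raw SSE/NEON. Lanes whose divisor is zero are forced to 0
// with v_select, matching the x / 0 == 0 convention of the scalar integer division.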
1194 #if CV_SIMD128
1195
1196 template <>
1197 struct Div_SIMD<uchar>
1198 {
1199     bool haveSIMD;
1200     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1201
1202     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
1203     {
1204         int x = 0;
1205
1206         if (!haveSIMD)
1207             return x;
1208
1209         v_float32x4 v_scale = v_setall_f32((float)scale);
1210         v_uint16x8 v_zero = v_setzero_u16();
1211
1212         for ( ; x <= width - 8; x += 8)
1213         {
1214             v_uint16x8 v_src1 = v_load_expand(src1 + x);
1215             v_uint16x8 v_src2 = v_load_expand(src2 + x);
1216
1217             v_uint32x4 t0, t1, t2, t3;
1218             v_expand(v_src1, t0, t1);
1219             v_expand(v_src2, t2, t3);
1220
1221             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1222             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1223
1224             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1225             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1226
1227             f0 = f0 * v_scale / f2;
1228             f1 = f1 * v_scale / f3;
1229
1230             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1231             v_uint16x8 res = v_pack_u(i0, i1);
1232
1233             res = v_select(v_src2 == v_zero, v_zero, res);
1234             v_pack_store(dst + x, res);
1235         }
1236
1237         return x;
1238     }
1239 };
1240
1241
1242 template <>
1243 struct Div_SIMD<schar>
1244 {
1245     bool haveSIMD;
1246     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1247
1248     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
1249     {
1250         int x = 0;
1251
1252         if (!haveSIMD)
1253             return x;
1254
1255         v_float32x4 v_scale = v_setall_f32((float)scale);
1256         v_int16x8 v_zero = v_setzero_s16();
1257
1258         for ( ; x <= width - 8; x += 8)
1259         {
1260             v_int16x8 v_src1 = v_load_expand(src1 + x);
1261             v_int16x8 v_src2 = v_load_expand(src2 + x);
1262
1263             v_int32x4 t0, t1, t2, t3;
1264             v_expand(v_src1, t0, t1);
1265             v_expand(v_src2, t2, t3);
1266
1267             v_float32x4 f0 = v_cvt_f32(t0);
1268             v_float32x4 f1 = v_cvt_f32(t1);
1269
1270             v_float32x4 f2 = v_cvt_f32(t2);
1271             v_float32x4 f3 = v_cvt_f32(t3);
1272
1273             f0 = f0 * v_scale / f2;
1274             f1 = f1 * v_scale / f3;
1275
1276             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1277             v_int16x8 res = v_pack(i0, i1);
1278
1279             res = v_select(v_src2 == v_zero, v_zero, res);
1280             v_pack_store(dst + x, res);
1281         }
1282
1283         return x;
1284     }
1285 };
1286
1287
1288 template <>
1289 struct Div_SIMD<ushort>
1290 {
1291     bool haveSIMD;
1292     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1293
1294     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
1295     {
1296         int x = 0;
1297
1298         if (!haveSIMD)
1299             return x;
1300
1301         v_float32x4 v_scale = v_setall_f32((float)scale);
1302         v_uint16x8 v_zero = v_setzero_u16();
1303
1304         for ( ; x <= width - 8; x += 8)
1305         {
1306             v_uint16x8 v_src1 = v_load(src1 + x);
1307             v_uint16x8 v_src2 = v_load(src2 + x);
1308
1309             v_uint32x4 t0, t1, t2, t3;
1310             v_expand(v_src1, t0, t1);
1311             v_expand(v_src2, t2, t3);
1312
1313             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1314             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1315
1316             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1317             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1318
1319             f0 = f0 * v_scale / f2;
1320             f1 = f1 * v_scale / f3;
1321
1322             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1323             v_uint16x8 res = v_pack_u(i0, i1);
1324
1325             res = v_select(v_src2 == v_zero, v_zero, res);
1326             v_store(dst + x, res);
1327         }
1328
1329         return x;
1330     }
1331 };
1332
1333 template <>
1334 struct Div_SIMD<short>
1335 {
1336     bool haveSIMD;
1337     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1338
1339     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
1340     {
1341         int x = 0;
1342
1343         if (!haveSIMD)
1344             return x;
1345
1346         v_float32x4 v_scale = v_setall_f32((float)scale);
1347         v_int16x8 v_zero = v_setzero_s16();
1348
1349         for ( ; x <= width - 8; x += 8)
1350         {
1351             v_int16x8 v_src1 = v_load(src1 + x);
1352             v_int16x8 v_src2 = v_load(src2 + x);
1353
1354             v_int32x4 t0, t1, t2, t3;
1355             v_expand(v_src1, t0, t1);
1356             v_expand(v_src2, t2, t3);
1357
1358             v_float32x4 f0 = v_cvt_f32(t0);
1359             v_float32x4 f1 = v_cvt_f32(t1);
1360
1361             v_float32x4 f2 = v_cvt_f32(t2);
1362             v_float32x4 f3 = v_cvt_f32(t3);
1363
1364             f0 = f0 * v_scale / f2;
1365             f1 = f1 * v_scale / f3;
1366
1367             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1368             v_int16x8 res = v_pack(i0, i1);
1369
1370             res = v_select(v_src2 == v_zero, v_zero, res);
1371             v_store(dst + x, res);
1372         }
1373
1374         return x;
1375     }
1376 };
1377
1378 template <>
1379 struct Div_SIMD<int>
1380 {
1381     bool haveSIMD;
1382     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1383
1384     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
1385     {
1386         int x = 0;
1387
1388         if (!haveSIMD)
1389             return x;
1390
1391         v_float32x4 v_scale = v_setall_f32((float)scale);
1392         v_int32x4 v_zero = v_setzero_s32();
1393
1394         for ( ; x <= width - 8; x += 8)
1395         {
1396             v_int32x4 t0 = v_load(src1 + x);
1397             v_int32x4 t1 = v_load(src1 + x + 4);
1398             v_int32x4 t2 = v_load(src2 + x);
1399             v_int32x4 t3 = v_load(src2 + x + 4);
1400
1401             v_float32x4 f0 = v_cvt_f32(t0);
1402             v_float32x4 f1 = v_cvt_f32(t1);
1403             v_float32x4 f2 = v_cvt_f32(t2);
1404             v_float32x4 f3 = v_cvt_f32(t3);
1405
1406             f0 = f0 * v_scale / f2;
1407             f1 = f1 * v_scale / f3;
1408
1409             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1410
1411             res0 = v_select(t2 == v_zero, v_zero, res0);
1412             res1 = v_select(t3 == v_zero, v_zero, res1);
1413             v_store(dst + x, res0);
1414             v_store(dst + x + 4, res1);
1415         }
1416
1417         return x;
1418     }
1419 };
1420
1421
1422 template <>
1423 struct Div_SIMD<float>
1424 {
1425     bool haveSIMD;
1426     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1427
1428     int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
1429     {
1430         int x = 0;
1431
1432         if (!haveSIMD)
1433             return x;
1434
1435         v_float32x4 v_scale = v_setall_f32((float)scale);
1436         v_float32x4 v_zero = v_setzero_f32();
1437
1438         for ( ; x <= width - 8; x += 8)
1439         {
1440             v_float32x4 f0 = v_load(src1 + x);
1441             v_float32x4 f1 = v_load(src1 + x + 4);
1442             v_float32x4 f2 = v_load(src2 + x);
1443             v_float32x4 f3 = v_load(src2 + x + 4);
1444
1445             v_float32x4 res0 = f0 * v_scale / f2;
1446             v_float32x4 res1 = f1 * v_scale / f3;
1447
1448             res0 = v_select(f2 == v_zero, v_zero, res0);
1449             res1 = v_select(f3 == v_zero, v_zero, res1);
1450
1451             v_store(dst + x, res0);
1452             v_store(dst + x + 4, res1);
1453         }
1454
1455         return x;
1456     }
1457 };
1458
1459
1460 ///////////////////////// RECIPROCAL //////////////////////
1461
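// Recip_SIMD computes dst = scale / src2 and, like Div_SIMD, forces dst to 0
// wherever src2 == 0.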
1462 template <>
1463 struct Recip_SIMD<uchar>
1464 {
1465     bool haveSIMD;
1466     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1467
1468     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
1469     {
1470         int x = 0;
1471
1472         if (!haveSIMD)
1473             return x;
1474
1475         v_float32x4 v_scale = v_setall_f32((float)scale);
1476         v_uint16x8 v_zero = v_setzero_u16();
1477
1478         for ( ; x <= width - 8; x += 8)
1479         {
1480             v_uint16x8 v_src2 = v_load_expand(src2 + x);
1481
1482             v_uint32x4 t0, t1;
1483             v_expand(v_src2, t0, t1);
1484
1485             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1486             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1487
1488             f0 = v_scale / f0;
1489             f1 = v_scale / f1;
1490
1491             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1492             v_uint16x8 res = v_pack_u(i0, i1);
1493
1494             res = v_select(v_src2 == v_zero, v_zero, res);
1495             v_pack_store(dst + x, res);
1496         }
1497
1498         return x;
1499     }
1500 };
1501
1502
1503 template <>
1504 struct Recip_SIMD<schar>
1505 {
1506     bool haveSIMD;
1507     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1508
1509     int operator() (const schar * src2, schar * dst, int width, double scale) const
1510     {
1511         int x = 0;
1512
1513         if (!haveSIMD)
1514             return x;
1515
1516         v_float32x4 v_scale = v_setall_f32((float)scale);
1517         v_int16x8 v_zero = v_setzero_s16();
1518
1519         for ( ; x <= width - 8; x += 8)
1520         {
1521             v_int16x8 v_src2 = v_load_expand(src2 + x);
1522
1523             v_int32x4 t0, t1;
1524             v_expand(v_src2, t0, t1);
1525
1526             v_float32x4 f0 = v_cvt_f32(t0);
1527             v_float32x4 f1 = v_cvt_f32(t1);
1528
1529             f0 = v_scale / f0;
1530             f1 = v_scale / f1;
1531
1532             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1533             v_int16x8 res = v_pack(i0, i1);
1534
1535             res = v_select(v_src2 == v_zero, v_zero, res);
1536             v_pack_store(dst + x, res);
1537         }
1538
1539         return x;
1540     }
1541 };
1542
1543
1544 template <>
1545 struct Recip_SIMD<ushort>
1546 {
1547     bool haveSIMD;
1548     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1549
1550     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
1551     {
1552         int x = 0;
1553
1554         if (!haveSIMD)
1555             return x;
1556
1557         v_float32x4 v_scale = v_setall_f32((float)scale);
1558         v_uint16x8 v_zero = v_setzero_u16();
1559
1560         for ( ; x <= width - 8; x += 8)
1561         {
1562             v_uint16x8 v_src2 = v_load(src2 + x);
1563
1564             v_uint32x4 t0, t1;
1565             v_expand(v_src2, t0, t1);
1566
1567             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1568             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1569
1570             f0 = v_scale / f0;
1571             f1 = v_scale / f1;
1572
1573             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1574             v_uint16x8 res = v_pack_u(i0, i1);
1575
1576             res = v_select(v_src2 == v_zero, v_zero, res);
1577             v_store(dst + x, res);
1578         }
1579
1580         return x;
1581     }
1582 };
1583
1584 template <>
1585 struct Recip_SIMD<short>
1586 {
1587     bool haveSIMD;
1588     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1589
1590     int operator() (const short * src2, short * dst, int width, double scale) const
1591     {
1592         int x = 0;
1593
1594         if (!haveSIMD)
1595             return x;
1596
1597         v_float32x4 v_scale = v_setall_f32((float)scale);
1598         v_int16x8 v_zero = v_setzero_s16();
1599
1600         for ( ; x <= width - 8; x += 8)
1601         {
1602             v_int16x8 v_src2 = v_load(src2 + x);
1603
1604             v_int32x4 t0, t1;
1605             v_expand(v_src2, t0, t1);
1606
1607             v_float32x4 f0 = v_cvt_f32(t0);
1608             v_float32x4 f1 = v_cvt_f32(t1);
1609
1610             f0 = v_scale / f0;
1611             f1 = v_scale / f1;
1612
1613             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1614             v_int16x8 res = v_pack(i0, i1);
1615
1616             res = v_select(v_src2 == v_zero, v_zero, res);
1617             v_store(dst + x, res);
1618         }
1619
1620         return x;
1621     }
1622 };
1623
1624 template <>
1625 struct Recip_SIMD<int>
1626 {
1627     bool haveSIMD;
1628     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1629
1630     int operator() (const int * src2, int * dst, int width, double scale) const
1631     {
1632         int x = 0;
1633
1634         if (!haveSIMD)
1635             return x;
1636
1637         v_float32x4 v_scale = v_setall_f32((float)scale);
1638         v_int32x4 v_zero = v_setzero_s32();
1639
1640         for ( ; x <= width - 8; x += 8)
1641         {
1642             v_int32x4 t0 = v_load(src2 + x);
1643             v_int32x4 t1 = v_load(src2 + x + 4);
1644
1645             v_float32x4 f0 = v_cvt_f32(t0);
1646             v_float32x4 f1 = v_cvt_f32(t1);
1647
1648             f0 = v_scale / f0;
1649             f1 = v_scale / f1;
1650
1651             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1652
1653             res0 = v_select(t0 == v_zero, v_zero, res0);
1654             res1 = v_select(t1 == v_zero, v_zero, res1);
1655             v_store(dst + x, res0);
1656             v_store(dst + x + 4, res1);
1657         }
1658
1659         return x;
1660     }
1661 };
1662
1663
1664 template <>
1665 struct Recip_SIMD<float>
1666 {
1667     bool haveSIMD;
1668     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1669
1670     int operator() (const float * src2, float * dst, int width, double scale) const
1671     {
1672         int x = 0;
1673
1674         if (!haveSIMD)
1675             return x;
1676
1677         v_float32x4 v_scale = v_setall_f32((float)scale);
1678         v_float32x4 v_zero = v_setzero_f32();
1679
1680         for ( ; x <= width - 8; x += 8)
1681         {
1682             v_float32x4 f0 = v_load(src2 + x);
1683             v_float32x4 f1 = v_load(src2 + x + 4);
1684
1685             v_float32x4 res0 = v_scale / f0;
1686             v_float32x4 res1 = v_scale / f1;
1687
1688             res0 = v_select(f0 == v_zero, v_zero, res0);
1689             res1 = v_select(f1 == v_zero, v_zero, res1);
1690
1691             v_store(dst + x, res0);
1692             v_store(dst + x + 4, res1);
1693         }
1694
1695         return x;
1696     }
1697 };
1698
1699 #if CV_SIMD128_64F
1700
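// The double specializations are only compiled when the 128-bit SIMD backend
// provides 64-bit floating point vectors (CV_SIMD128_64F), e.g. SSE2.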
1701 template <>
1702 struct Div_SIMD<double>
1703 {
1704     bool haveSIMD;
1705     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1706
1707     int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
1708     {
1709         int x = 0;
1710
1711         if (!haveSIMD)
1712             return x;
1713
1714         v_float64x2 v_scale = v_setall_f64(scale);
1715         v_float64x2 v_zero = v_setzero_f64();
1716
1717         for ( ; x <= width - 4; x += 4)
1718         {
1719             v_float64x2 f0 = v_load(src1 + x);
1720             v_float64x2 f1 = v_load(src1 + x + 2);
1721             v_float64x2 f2 = v_load(src2 + x);
1722             v_float64x2 f3 = v_load(src2 + x + 2);
1723
1724             v_float64x2 res0 = f0 * v_scale / f2;
1725             v_float64x2 res1 = f1 * v_scale / f3;
1726
1727             res0 = v_select(f2 == v_zero, v_zero, res0);
1728             res1 = v_select(f3 == v_zero, v_zero, res1);
1729
1730             v_store(dst + x, res0);
1731             v_store(dst + x + 2, res1);
1732         }
1733
1734         return x;
1735     }
1736 };
1737
1738 template <>
1739 struct Recip_SIMD<double>
1740 {
1741     bool haveSIMD;
1742     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
1743
1744     int operator() (const double * src2, double * dst, int width, double scale) const
1745     {
1746         int x = 0;
1747
1748         if (!haveSIMD)
1749             return x;
1750
1751         v_float64x2 v_scale = v_setall_f64(scale);
1752         v_float64x2 v_zero = v_setzero_f64();
1753
1754         for ( ; x <= width - 4; x += 4)
1755         {
1756             v_float64x2 f0 = v_load(src2 + x);
1757             v_float64x2 f1 = v_load(src2 + x + 2);
1758
1759             v_float64x2 res0 = v_scale / f0;
1760             v_float64x2 res1 = v_scale / f1;
1761
1762             res0 = v_select(f0 == v_zero, v_zero, res0);
1763             res1 = v_select(f1 == v_zero, v_zero, res1);
1764
1765             v_store(dst + x, res0);
1766             v_store(dst + x + 2, res1);
1767         }
1768
1769         return x;
1770     }
1771 };
1772
1773 #endif
1774
1775 #endif
1776
1777
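// AddWeighted: dst = saturate_cast<T>(src1 * alpha + src2 * beta + gamma).
// The generic fallback returns 0 (nothing processed); the SSE2/SSE4.1/NEON
// specializations below return the number of elements handled per call.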
1778 template <typename T, typename WT>
1779 struct AddWeighted_SIMD
1780 {
1781     int operator() (const T *, const T *, T *, int, WT, WT, WT) const
1782     {
1783         return 0;
1784     }
1785 };
1786
1787 #if CV_SSE2
1788
1789 template <>
1790 struct AddWeighted_SIMD<schar, float>
1791 {
1792     AddWeighted_SIMD()
1793     {
1794         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1795     }
1796
1797     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1798     {
1799         int x = 0;
1800
1801         if (!haveSSE2)
1802             return x;
1803
1804         __m128i v_zero = _mm_setzero_si128();
1805         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1806                v_gamma = _mm_set1_ps(gamma);
1807
1808         for( ; x <= width - 8; x += 8 )
1809         {
1810             __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
1811             __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
1812
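            // Sign-extend the 8-bit values to 16 bits: interleave them into the high
            // byte of each 16-bit lane, then arithmetic-shift right by 8.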
1813             __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1814             __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1815
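            // Widen to 32 bits the same way (interleave + arithmetic shift by 16),
            // convert to float and accumulate alpha*src1 + beta*src2 + gamma.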
1816             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
1817             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1818                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
1819
1820             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
1821             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1822                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
1823
1824             __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1825                                               _mm_cvtps_epi32(v_dstf1));
1826
1827             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
1828         }
1829
1830         return x;
1831     }
1832
1833     bool haveSSE2;
1834 };
1835
1836 template <>
1837 struct AddWeighted_SIMD<short, float>
1838 {
1839     AddWeighted_SIMD()
1840     {
1841         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1842     }
1843
1844     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1845     {
1846         int x = 0;
1847
1848         if (!haveSSE2)
1849             return x;
1850
1851         __m128i v_zero = _mm_setzero_si128();
1852         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1853                v_gamma = _mm_set1_ps(gamma);
1854
1855         for( ; x <= width - 8; x += 8 )
1856         {
1857             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1858             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1859
1860             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
1861             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1862                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
1863
1864             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
1865             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1866                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
1867
1868             _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1869                                                                    _mm_cvtps_epi32(v_dstf1)));
1870         }
1871
1872         return x;
1873     }
1874
1875     bool haveSSE2;
1876 };
1877
1878 #if CV_SSE4_1
1879
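// The ushort version needs _mm_packus_epi32 (pack 32-bit integers to 16 bits with
// unsigned saturation), which is only available starting with SSE4.1.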
1880 template <>
1881 struct AddWeighted_SIMD<ushort, float>
1882 {
1883     AddWeighted_SIMD()
1884     {
1885         haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
1886     }
1887
1888     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1889     {
1890         int x = 0;
1891
1892         if (!haveSSE4_1)
1893             return x;
1894
1895         __m128i v_zero = _mm_setzero_si128();
1896         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1897                v_gamma = _mm_set1_ps(gamma);
1898
1899         for( ; x <= width - 8; x += 8 )
1900         {
1901             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1902             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1903
1904             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
1905             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1906                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
1907
1908             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
1909             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1910                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
1911
1912             _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
1913                                                                     _mm_cvtps_epi32(v_dstf1)));
1914         }
1915
1916         return x;
1917     }
1918
1919     bool haveSSE4_1;
1920 };
1921
1922 #endif
1923
1924 #elif CV_NEON
1925
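// NEON versions: widen the inputs to float32x4, apply the scalar weights with
// vmulq_n_f32, add gamma, round (cv_vrndq_*_f32) and narrow back with saturation.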
1926 template <>
1927 struct AddWeighted_SIMD<schar, float>
1928 {
1929     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1930     {
1931         int x = 0;
1932
1933         float32x4_t g = vdupq_n_f32(gamma);
1934
1935         for( ; x <= width - 8; x += 8 )
1936         {
1937             int8x8_t in1 = vld1_s8(src1 + x);
1938             int16x8_t in1_16 = vmovl_s8(in1);
1939             float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
1940             float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
1941
1942             int8x8_t in2 = vld1_s8(src2 + x);
1943             int16x8_t in2_16 = vmovl_s8(in2);
1944             float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
1945             float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
1946
1947             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
1948             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
1949             out_f_l = vaddq_f32(out_f_l, g);
1950             out_f_h = vaddq_f32(out_f_h, g);
1951
1952             int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
1953             int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
1954
1955             int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
1956             int8x8_t out = vqmovn_s16(out_16);
1957
1958             vst1_s8(dst + x, out);
1959         }
1960
1961         return x;
1962     }
1963 };
1964
1965 template <>
1966 struct AddWeighted_SIMD<ushort, float>
1967 {
1968     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1969     {
1970         int x = 0;
1971
1972         float32x4_t g = vdupq_n_f32(gamma);
1973
1974         for( ; x <= width - 8; x += 8 )
1975         {
1976             uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
1977
1978             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
1979             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
1980             uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1981
1982             v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
1983             v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
1984             uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1985
1986             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
1987         }
1988
1989         return x;
1990     }
1991 };
1992
1993 template <>
1994 struct AddWeighted_SIMD<short, float>
1995 {
1996     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1997     {
1998         int x = 0;
1999
2000         float32x4_t g = vdupq_n_f32(gamma);
2001
2002         for( ; x <= width - 8; x += 8 )
2003         {
2004             int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
2005
2006             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
2007             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
2008             int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
2009
2010             v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
2011             v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
2012             int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
2013
2014             vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
2015         }
2016
2017         return x;
2018     }
2019 };
2020
2021 #endif
2022
2023 }}
2024
2025 #endif // __OPENCV_HAL_ARITHM_SIMD_HPP__