/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_ARITHM_SIMD_HPP__
#define __OPENCV_ARITHM_SIMD_HPP__

namespace cv {

struct NOP {};

#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif
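
// The rest of this header follows one pattern per architecture (AVX2, SSE2, NEON):
// FUNCTOR_TEMPLATE declares a primary template, FUNCTOR_LOADSTORE(_CAST) specializes
// it with load/store wrappers around the native register type, and
// FUNCTOR_CLOSURE_1arg/2arg specialize the element-wise functors (VAdd, VSub, ...)
// that the arithmetic kernels in this module instantiate per element type.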
#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name) \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif

#endif // CV_SSE2 || CV_NEON
#if CV_AVX2

#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template <>                                                                                   \
    struct name<template_arg>{                                                                    \
        typedef register_type reg_type;                                                           \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore256<template_arg>::reg_type operator()(              \
                const VLoadStore256<template_arg>::reg_type & a,       \
                const VLoadStore256<template_arg>::reg_type & b) const \
        {                                                              \
            body;                                                      \
        }                                                              \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore256<template_arg>::reg_type operator()(              \
                const VLoadStore256<template_arg>::reg_type & a,       \
                const VLoadStore256<template_arg>::reg_type & ) const  \
        {                                                              \
            body;                                                      \
        }                                                              \
    }
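
// For illustration (an expansion sketch, not generated code): with the macros above,
// FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps(a, b)) produces roughly
//
//     template <> struct VAdd<float>
//     {
//         VLoadStore256<float>::reg_type operator()(
//                 const VLoadStore256<float>::reg_type & a,
//                 const VLoadStore256<float>::reg_type & b) const
//         { return _mm256_add_ps(a, b); }
//     };
//
// so a kernel can write op(load(p1), load(p2)) uniformly for every element type.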
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); // unsigned min; epi16 would mis-order values above 0x7fff
FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));

static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
                                                           0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
                                                           0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
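
// ANDing with these masks clears the IEEE sign bit: 0x7fffffff per float lane, and
// { 0xffffffff, 0x7fffffff } per double lane (the high 32-bit word of a little-endian
// double holds its sign bit). VAbsDiff uses them below to compute |a - b|.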
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m256i d = _mm256_subs_epi8(a, b);
        __m256i m = _mm256_cmpgt_epi8(b, a);
        return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m256i M = _mm256_max_epi16(a, b);
        __m256i m = _mm256_min_epi16(a, b);
        return _mm256_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m256i d = _mm256_sub_epi32(a, b);
        __m256i m = _mm256_cmpgt_epi32(b, a);
        return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
    );
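
// Notes on the integer absdiff idioms: for unsigned types the saturating subtraction
// clamps at zero, so subs(a, b) + subs(b, a) yields |a - b| with one addend always 0.
// For signed types, m = cmpgt(b, a) is all ones exactly where a - b is negative, and
// (d ^ m) - m is two's-complement negation under that mask, i.e. |a - b|.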
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));

#elif CV_SSE2
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template <>                                                                                   \
    struct name<template_arg>{                                                                    \
        typedef register_type reg_type;                                                           \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                const VLoadStore128<template_arg>::reg_type & a,       \
                const VLoadStore128<template_arg>::reg_type & b) const \
        {                                                              \
            body;                                                      \
        }                                                              \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                 \
    template <>                                                        \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                const VLoadStore128<template_arg>::reg_type & a,       \
                const VLoadStore128<template_arg>::reg_type & ) const  \
        {                                                              \
            body;                                                      \
        }                                                              \
    }
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);

FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
        __m128i m = _mm_cmpgt_epi8(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int,
        __m128i m = _mm_cmpgt_epi32(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int,
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
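
// SSE2 only provides min/max directly for epu8 and epi16; the other lane types are
// emulated: a ^ ((a ^ b) & (a > b)) selects b where a > b (min; swap the comparison
// operands for max), subs_epu16(a, subs_epu16(a, b)) is the unsigned 16-bit min, and
// adds_epu16(subs_epu16(a, b), b) is the unsigned 16-bit max, all branch-free.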
static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
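
// These mirror the 32-byte-aligned AVX2 masks above at 16-byte alignment; only one of
// the #if CV_AVX2 / #elif CV_SSE2 branches is ever compiled, so the names cannot collide.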
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m128i d = _mm_subs_epi8(a, b);
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_subs_epi8(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m128i M = _mm_max_epi16(a, b);
        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm_and_ps(_mm_sub_ps(a, b), *(const __m128*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a, b), *(const __m128d*)v64f_absmask);
    );
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));

#endif

#if CV_NEON
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }     \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }     \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)         \
    template <>                                                \
    struct name<template_arg>                                  \
    {                                                          \
        VLoadStore128<template_arg>::reg_type operator()(      \
                VLoadStore128<template_arg>::reg_type a,       \
                VLoadStore128<template_arg>::reg_type b) const \
        {                                                      \
            return body;                                       \
        }                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)         \
    template <>                                                \
    struct name<template_arg>                                  \
    {                                                          \
        VLoadStore128<template_arg>::reg_type operator()(      \
                VLoadStore128<template_arg>::reg_type a,       \
                VLoadStore128<template_arg>::reg_type ) const  \
        {                                                      \
            return body;                                       \
        }                                                      \
    }
FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a));

#endif
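
// A minimal sketch of how a caller consumes this layer (illustrative only; the real
// kernels in this module add alignment handling, strides and multi-row loops):
//
//     template<typename T, class Op>
//     static void binOpSketch(const T * src1, const T * src2, T * dst, int len)
//     {
//         typedef VLoadStore128<T> VLS;
//         Op op;
//         const int step = 16 / (int)sizeof(T);  // elements per 128-bit register
//         int x = 0;
//         for( ; x <= len - step; x += step )
//             VLS::store(dst + x, op(VLS::load(src1 + x), VLS::load(src2 + x)));
//         // the remaining len - x elements are handled by scalar code
//     }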
template <typename T>
struct Cmp_SIMD
{
    explicit Cmp_SIMD(int)
    {
    }

    int operator () (const T *, const T *, uchar *, int) const
    {
        return 0;
    }
};
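
// Every Cmp_SIMD/Mul_SIMD/Div_SIMD/Recip_SIMD/AddWeighted_SIMD functor below follows
// the same contract: operator() returns the number of elements it processed (0 when
// the required instruction set is unavailable), and the caller finishes the tail,
// or the whole row, with scalar code.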
#if CV_NEON

template <>
struct Cmp_SIMD<schar>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdupq_n_u8(255);
    }

    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_LE)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_EQ)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
        else if (code == CMP_NE)
            for ( ; x <= width - 16; x += 16)
                vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));

        return x;
    }

    int code;
    uint8x16_t v_mask;
};
template <>
struct Cmp_SIMD<ushort>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, vmovn_u16(v_dst));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
                vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};
template <>
struct Cmp_SIMD<int>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};
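
// vcgtq/vcleq/vceqq produce full-width lane masks (all ones or all zeros); the
// vmovn_u32 -> vcombine_u16 -> vmovn_u16 chain narrows two 32-bit masks per iteration
// down to the 8-bit 0xFF/0x00 values the uchar destination expects.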
template <>
struct Cmp_SIMD<float>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        v_mask = vdup_n_u8(255);
    }

    int operator () (const float * src1, const float * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
            }

        return x;
    }

    int code;
    uint8x8_t v_mask;
};

#elif CV_SSE2
template <>
struct Cmp_SIMD<schar>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        haveSSE = checkHardwareSupport(CV_CPU_SSE2);

        v_mask = _mm_set1_epi8(-1);
    }

    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        if (code == CMP_GT)
            for ( ; x <= width - 16; x += 16)
                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
        else if (code == CMP_LE)
            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 16; x += 16)
                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
        else if (code == CMP_NE)
            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
            }

        return x;
    }

    int code;
    __m128i v_mask;
    bool haveSSE;
};
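
// SSE2 has no direct "less-or-equal" or "not-equal" integer comparison, so CMP_LE and
// CMP_NE are computed as the complement of CMP_GT and CMP_EQ: xoring the mask bytes
// with v_mask (all ones) flips every lane.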
template <>
struct Cmp_SIMD<int>
{
    explicit Cmp_SIMD(int code_) :
        code(code_)
    {
        // CV_Assert(code == CMP_GT || code == CMP_LE ||
        //           code == CMP_EQ || code == CMP_NE);

        haveSSE = checkHardwareSupport(CV_CPU_SSE2);

        v_mask = _mm_set1_epi32(0xffffffff);
    }

    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        if (code == CMP_GT)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
            }
        else if (code == CMP_LE)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
            }
        else if (code == CMP_EQ)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
            }
        else if (code == CMP_NE)
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
            }

        return x;
    }

    int code;
    __m128i v_mask;
    bool haveSSE;
};

#endif
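
// _mm_packs_epi32/_mm_packs_epi16 use signed saturation, which maps the 32-bit mask
// values 0 and -1 to the byte values 0x00 and 0xFF unchanged; the v_mask operand in
// the second pack merely fills the upper half, which _mm_storel_epi64 discards.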
template <typename T, typename WT>
struct Mul_SIMD
{
    int operator() (const T *, const T *, T *, int, WT) const
    {
        return 0;
    }
};
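
// WT is the intermediate "working type" (float for the specializations below):
// sources are widened, multiplied and optionally scaled in floating point, then
// rounded and saturated back to T.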
#if CV_NEON

template <>
struct Mul_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1_u8(dst + x, vqmovn_u16(v_dst));
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1_u8(dst + x, vqmovn_u16(v_dst));
            }
        }

        return x;
    }
};
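
// cv_vrndq_u32_f32/cv_vrndq_s32_f32 (round-to-nearest float-to-int helpers defined
// elsewhere in the core module) feed vqmovn's saturating narrow, reproducing the
// saturate_cast<> behaviour of the scalar path.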
template <>
struct Mul_SIMD<schar, float>
{
    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1_s8(dst + x, vqmovn_s16(v_dst));
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1_s8(dst + x, vqmovn_s16(v_dst));
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1q_u16(dst + x, v_dst);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
                vst1q_u16(dst + x, v_dst);
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<short, float>
{
    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1q_s16(dst + x, v_dst);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
                v_dst1 = vmulq_f32(v_dst1, v_scale);
                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
                vst1q_s16(dst + x, v_dst);
            }
        }

        return x;
    }
};
template <>
struct Mul_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                v_dst1 = vmulq_f32(v_dst1, v_scale);

                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        }

        return x;
    }
};

#elif CV_SSE2

#if CV_SSE4_1
template <>
struct Mul_SIMD<ushort, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale != 1.0f )
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
                                           _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
                                           _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
            }
        }

        return x;
    }

    bool haveSSE;
};

#endif
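
// _mm_packus_epi32 (unsigned saturating pack) only exists from SSE4.1 on, hence the
// runtime CV_CPU_SSE4_1 check; the scale == 1 case simply falls through to scalar code.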
template <>
struct Mul_SIMD<schar, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));

                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
            }
        else
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));

                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
            }
        }

        return x;
    }

    bool haveSSE;
};
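
// Pre-SSE4.1 sign-extension idiom used above: unpacking with zero places the source
// bytes in the high half of each wider lane (_mm_unpacklo_epi8(v_zero, v_src)), and an
// arithmetic right shift brings them back down with the sign preserved, first 8->16
// bit and then 16->32 bit.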
template <>
struct Mul_SIMD<short, float>
{
    Mul_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();

        if( scale != 1.0f )
        {
            __m128 v_scale = _mm_set1_ps(scale);
            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));

                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
                v_dst1 = _mm_mul_ps(v_dst1, v_scale);

                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
                v_dst2 = _mm_mul_ps(v_dst2, v_scale);

                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
            }
        }

        return x;
    }

    bool haveSSE;
};

#endif
template <typename T>
struct Div_SIMD
{
    int operator() (const T *, const T *, T *, int, double) const
    {
        return 0;
    }
};

template <typename T>
struct Recip_SIMD
{
    int operator() (const T *, T *, int, double) const
    {
        return 0;
    }
};
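
// The divide/reciprocal specializations below are written with OpenCV's universal
// intrinsics (v_float32x4, v_load_expand, v_select, ...), so a single implementation
// serves both SSE2 and NEON; haveSIMD gates them at run time.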
#if CV_SIMD128

template <>
struct Div_SIMD<uchar>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src1 = v_load_expand(src1 + x);
            v_uint16x8 v_src2 = v_load_expand(src2 + x);

            v_uint32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
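
// OpenCV defines x / 0 == 0 for these element types; v_select substitutes zero
// wherever the divisor lane is zero, keeping the loop branch-free.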
template <>
struct Div_SIMD<schar>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src1 = v_load_expand(src1 + x);
            v_int16x8 v_src2 = v_load_expand(src2 + x);

            v_int32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<ushort>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src1 = v_load(src1 + x);
            v_uint16x8 v_src2 = v_load(src2 + x);

            v_uint32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<short>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src1 = v_load(src1 + x);
            v_int16x8 v_src2 = v_load(src2 + x);

            v_int32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Div_SIMD<int>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int32x4 v_zero = v_setzero_s32();

        for ( ; x <= width - 8; x += 8)
        {
            v_int32x4 t0 = v_load(src1 + x);
            v_int32x4 t1 = v_load(src1 + x + 4);
            v_int32x4 t2 = v_load(src2 + x);
            v_int32x4 t3 = v_load(src2 + x + 4);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);
            v_float32x4 f2 = v_cvt_f32(t2);
            v_float32x4 f3 = v_cvt_f32(t3);

            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;

            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);

            res0 = v_select(t2 == v_zero, v_zero, res0);
            res1 = v_select(t3 == v_zero, v_zero, res1);
            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
template <>
struct Div_SIMD<float>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_float32x4 v_zero = v_setzero_f32();

        for ( ; x <= width - 8; x += 8)
        {
            v_float32x4 f0 = v_load(src1 + x);
            v_float32x4 f1 = v_load(src1 + x + 4);
            v_float32x4 f2 = v_load(src2 + x);
            v_float32x4 f3 = v_load(src2 + x + 4);

            v_float32x4 res0 = f0 * v_scale / f2;
            v_float32x4 res1 = f1 * v_scale / f3;

            res0 = v_select(f2 == v_zero, v_zero, res0);
            res1 = v_select(f3 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
///////////////////////// RECIPROCAL //////////////////////

template <>
struct Recip_SIMD<uchar>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const uchar * src2, uchar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src2 = v_load_expand(src2 + x);

            v_uint32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<schar>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const schar * src2, schar * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src2 = v_load_expand(src2 + x);

            v_int32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<ushort>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const ushort * src2, ushort * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8 v_zero = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            v_uint16x8 v_src2 = v_load(src2 + x);

            v_uint32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_uint16x8 res = v_pack_u(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<short>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const short * src2, short * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int16x8 v_zero = v_setzero_s16();

        for ( ; x <= width - 8; x += 8)
        {
            v_int16x8 v_src2 = v_load(src2 + x);

            v_int32x4 t0, t1;
            v_expand(v_src2, t0, t1);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
            v_int16x8 res = v_pack(i0, i1);

            res = v_select(v_src2 == v_zero, v_zero, res);
            v_store(dst + x, res);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<int>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const int * src2, int * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_int32x4 v_zero = v_setzero_s32();

        for ( ; x <= width - 8; x += 8)
        {
            v_int32x4 t0 = v_load(src2 + x);
            v_int32x4 t1 = v_load(src2 + x + 4);

            v_float32x4 f0 = v_cvt_f32(t0);
            v_float32x4 f1 = v_cvt_f32(t1);

            f0 = v_scale / f0;
            f1 = v_scale / f1;

            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);

            res0 = v_select(t0 == v_zero, v_zero, res0);
            res1 = v_select(t1 == v_zero, v_zero, res1);
            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<float>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const float * src2, float * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_float32x4 v_zero = v_setzero_f32();

        for ( ; x <= width - 8; x += 8)
        {
            v_float32x4 f0 = v_load(src2 + x);
            v_float32x4 f1 = v_load(src2 + x + 4);

            v_float32x4 res0 = v_scale / f0;
            v_float32x4 res1 = v_scale / f1;

            res0 = v_select(f0 == v_zero, v_zero, res0);
            res1 = v_select(f1 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 4, res1);
        }

        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct Div_SIMD<double>
{
    bool haveSIMD;
    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float64x2 v_scale = v_setall_f64(scale);
        v_float64x2 v_zero = v_setzero_f64();

        for ( ; x <= width - 4; x += 4)
        {
            v_float64x2 f0 = v_load(src1 + x);
            v_float64x2 f1 = v_load(src1 + x + 2);
            v_float64x2 f2 = v_load(src2 + x);
            v_float64x2 f3 = v_load(src2 + x + 2);

            v_float64x2 res0 = f0 * v_scale / f2;
            v_float64x2 res1 = f1 * v_scale / f3;

            // zero the result where the divisor (f2/f3, not the numerator) is zero
            res0 = v_select(f2 == v_zero, v_zero, res0);
            res1 = v_select(f3 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 2, res1);
        }

        return x;
    }
};
template <>
struct Recip_SIMD<double>
{
    bool haveSIMD;
    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }

    int operator() (const double * src2, double * dst, int width, double scale) const
    {
        int x = 0;

        if (!haveSIMD)
            return x;

        v_float64x2 v_scale = v_setall_f64(scale);
        v_float64x2 v_zero = v_setzero_f64();

        for ( ; x <= width - 4; x += 4)
        {
            v_float64x2 f0 = v_load(src2 + x);
            v_float64x2 f1 = v_load(src2 + x + 2);

            v_float64x2 res0 = v_scale / f0;
            v_float64x2 res1 = v_scale / f1;

            res0 = v_select(f0 == v_zero, v_zero, res0);
            res1 = v_select(f1 == v_zero, v_zero, res1);

            v_store(dst + x, res0);
            v_store(dst + x + 2, res1);
        }

        return x;
    }
};

#endif // CV_SIMD128_64F

#endif // CV_SIMD128
template <typename T, typename WT>
struct AddWeighted_SIMD
{
    int operator() (const T *, const T *, T *, int, WT, WT, WT) const
    {
        return 0;
    }
};
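
// AddWeighted computes dst = saturate(src1 * alpha + src2 * beta + gamma), again in
// float working precision.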
#if CV_SSE2

template <>
struct AddWeighted_SIMD<schar, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));

            __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
            __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));

            __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                              _mm_cvtps_epi32(v_dstf1));

            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
        }

        return x;
    }

    bool haveSSE2;
};
template <>
struct AddWeighted_SIMD<short, float>
{
    AddWeighted_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                   _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE2;
};

#if CV_SSE4_1
template <>
struct AddWeighted_SIMD<ushort, float>
{
    AddWeighted_SIMD()
    {
        haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        if (!haveSSE4_1)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
               v_gamma = _mm_set1_ps(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));

            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));

            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
                                                                    _mm_cvtps_epi32(v_dstf1)));
        }

        return x;
    }

    bool haveSSE4_1;
};

#endif

#elif CV_NEON
template <>
struct AddWeighted_SIMD<schar, float>
{
    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int8x8_t in1 = vld1_s8(src1 + x);
            int16x8_t in1_16 = vmovl_s8(in1);
            float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
            float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));

            int8x8_t in2 = vld1_s8(src2 + x);
            int16x8_t in2_16 = vmovl_s8(in2);
            float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
            float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));

            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
            out_f_l = vaddq_f32(out_f_l, g);
            out_f_h = vaddq_f32(out_f_h, g);

            int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
            int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));

            int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
            int8x8_t out = vqmovn_s16(out_16);

            vst1_s8(dst + x, out);
        }

        return x;
    }
};
template <>
struct AddWeighted_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
            uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
            uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
        }

        return x;
    }
};
template <>
struct AddWeighted_SIMD<short, float>
{
    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
    {
        int x = 0;

        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);

            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
            int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
            v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
            int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));

            vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
        }

        return x;
    }
};

#endif

}

#endif // __OPENCV_ARITHM_SIMD_HPP__