// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP
#include <algorithm>
#include "opencv2/core/utility.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 1
/**
 * todo: support half precision for POWER9
 *       convert instructions: xvcvhpsp, xvcvsphp
**/
namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
///////// Types ////////////
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };
    vec_uchar16 val;

    explicit v_uint8x16(const vec_uchar16& v) : val(v) {}
    v_uint8x16() : val(vec_uchar16_z) {}
    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v)) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
    uchar get0() const { return vec_extract(val, 0); }
};
struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };
    vec_char16 val;

    explicit v_int8x16(const vec_char16& v) : val(v) {}
    v_int8x16() : val(vec_char16_z) {}
    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v)) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
    schar get0() const { return vec_extract(val, 0); }
};
struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };
    vec_ushort8 val;

    explicit v_uint16x8(const vec_ushort8& v) : val(v) {}
    v_uint16x8() : val(vec_ushort8_z) {}
    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v)) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
    ushort get0() const { return vec_extract(val, 0); }
};
struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };
    vec_short8 val;

    explicit v_int16x8(const vec_short8& v) : val(v) {}
    v_int16x8() : val(vec_short8_z) {}
    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v)) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
    short get0() const { return vec_extract(val, 0); }
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };
    vec_uint4 val;

    explicit v_uint32x4(const vec_uint4& v) : val(v) {}
    v_uint32x4() : val(vec_uint4_z) {}
    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v)) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3)) {}
    unsigned get0() const { return vec_extract(val, 0); }
};
struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };
    vec_int4 val;

    explicit v_int32x4(const vec_int4& v) : val(v) {}
    v_int32x4() : val(vec_int4_z) {}
    v_int32x4(vec_bint4 v) : val(vec_int4_c(v)) {}
    v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3)) {}
    int get0() const { return vec_extract(val, 0); }
};
struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };
    vec_float4 val;

    explicit v_float32x4(const vec_float4& v) : val(v) {}
    v_float32x4() : val(vec_float4_z) {}
    v_float32x4(vec_bint4 v) : val(vec_float4_c(v)) {}
    v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3)) {}
    float get0() const { return vec_extract(val, 0); }
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };
    vec_udword2 val;

    explicit v_uint64x2(const vec_udword2& v) : val(v) {}
    v_uint64x2() : val(vec_udword2_z) {}
    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v)) {}
    v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1)) {}
    uint64 get0() const { return vec_extract(val, 0); }
};
struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };
    vec_dword2 val;

    explicit v_int64x2(const vec_dword2& v) : val(v) {}
    v_int64x2() : val(vec_dword2_z) {}
    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v)) {}
    v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1)) {}
    int64 get0() const { return vec_extract(val, 0); }
};
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };
    vec_double2 val;

    explicit v_float64x2(const vec_double2& v) : val(v) {}
    v_float64x2() : val(vec_double2_z) {}
    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v)) {}
    v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1)) {}
    double get0() const { return vec_extract(val, 0); }
};
//////////////// Load and store operations ///////////////
/*
 * clang-5 aborts while parsing "vec_xxx_c" when it appears inside a function
 * template defined by a preprocessor macro; if vec_xxx_c is defined as a C++
 * cast, clang-5 accepts it.
*/
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast)                       \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(); }                              \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v)); }        \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
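// Illustrative usage of the initializers above (a sketch, not part of the
// original header; variable names are hypothetical):
//
//     v_uint8x16 zeros = v_setzero_u8();              // all 16 lanes = 0
//     v_int32x4  fours = v_setall_s32(4);             // all 4 lanes = 4
//     v_uint32x4 bits  = v_reinterpret_as_u32(fours); // same 128 bits, lanes viewed as uint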
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a)  \
inline _Tpvec v_load(const _Tp* ptr)                                      \
{ return _Tpvec(ld(0, ptr)); }                                            \
inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr))                  \
{ return _Tpvec(ld_a(0, ptr)); }                                          \
inline _Tpvec v_load_low(const _Tp* ptr)                                  \
{ return _Tpvec(vec_ld_l8(ptr)); }                                        \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)             \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); }        \
inline void v_store(_Tp* ptr, const _Tpvec& a)                            \
{ st(a.val, 0, ptr); }                                                    \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)        \
{ st_a(a.val, 0, ptr); }                                                  \
inline void v_store_low(_Tp* ptr, const _Tpvec& a)                        \
{ vec_st_l8(a.val, ptr); }                                                \
inline void v_store_high(_Tp* ptr, const _Tpvec& a)                       \
{ vec_st_h8(a.val, ptr); }
#define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)

OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
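// Illustrative usage of the load/store wrappers (a sketch, not part of the
// original header; `src`, `dst` and `n` are hypothetical, n a multiple of 4):
//
//     for (int i = 0; i < n; i += 4)
//     {
//         v_float32x4 v = v_load(src + i); // unaligned 128-bit load
//         v_store(dst + i, v);             // unaligned 128-bit store
//     }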
//////////////// Value reordering ///////////////
#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec)                          \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b)        \
{ vec_ld_deinterleave(ptr, a.val, b.val); }                                  \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a,                   \
                                _Tpvec& b, _Tpvec& c)                        \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); }                           \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b,        \
                                _Tpvec& c, _Tpvec& d)                        \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); }                    \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)   \
{ vec_st_interleave(a.val, b.val, ptr); }                                    \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a,                    \
                               const _Tpvec& b, const _Tpvec& c)             \
{ vec_st_interleave(a.val, b.val, c.val, ptr); }                             \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
                               const _Tpvec& c, const _Tpvec& d)             \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
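// Illustrative usage: splitting packed RGB pixels into planar channels
// (a sketch, not part of the original header; `rgb` is a hypothetical
// interleaved buffer of at least 48 uchars):
//
//     v_uint8x16 r, g, b;
//     v_load_deinterleave(rgb, r, g, b); // r = {R0..R15}, g = {G0..G15}, b = {B0..B15}
//     v_store_interleave(rgb, r, g, b);  // pack the channels back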
/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)  \
{                                                                \
    b0.val = fh(a.val);                                          \
    b1.val = fl(a.val);                                          \
}                                                                \
inline _Tpwvec v_load_expand(const _Tp* ptr)                     \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }

inline v_int32x4 v_load_expand_q(const schar* ptr)
{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
/* pack */
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b)                  \
{                                                                           \
    return _Tpvec(pkfnc(a.val, b.val));                                     \
}                                                                           \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a)                    \
{                                                                           \
    vec_st_l8(pkfnc(a.val, a.val), ptr);                                    \
}                                                                           \
template<int n>                                                             \
inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b)             \
{                                                                           \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                         \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
}                                                                           \
template<int n>                                                             \
inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a)               \
{                                                                           \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                         \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr);           \
}
OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
                         vec_sr, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packs, vec_adds, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
                         vec_sr, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
                         vec_sra, vec_packs, vec_add, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
                         vec_sr, vec_pack, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
                         vec_sra, vec_pack, vec_add, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packsu, vec_adds, pack_u)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
                         vec_sra, vec_packsu, vec_add, pack_u)
// The following variant is not implemented on other platforms:
//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
//                         vec_sra, vec_packsu, vec_add, pack_u)
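// Illustrative usage: v_rshr_pack<n> narrows with rounding, i.e. each output
// lane is saturate_cast<uchar>((x + (1 << (n-1))) >> n). A sketch, not part
// of the original header; `a` and `b` are hypothetical v_uint16x8 values:
//
//     v_uint8x16 packed = v_rshr_pack<4>(a, b); // 16 uchar lanes from 2x8 ushort lanes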
template<typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
{
    b0.val = vec_mergeh(a0.val, a1.val);
    b1.val = vec_mergel(a0.val, a1.val);
}

template<typename _Tpvec>
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesql(a.val, b.val)); }

template<typename _Tpvec>
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesqh(a.val, b.val)); }

template<typename _Tpvec>
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    c.val = vec_mergesqh(a.val, b.val);
    d.val = vec_mergesql(a.val, b.val);
}
////////// Arithmetic, bitwise and comparison operations /////////

/* Element-wise binary and unary operations */
/** Arithmetic **/
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin)       \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }                         \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)   \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
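// Illustrative note: the 8/16-bit types map +/- to the saturating
// vec_adds/vec_subs, so lanes clamp instead of wrapping (a sketch with
// hypothetical values, not part of the original header):
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 c = a + b; // every lane is 255 (saturated), not 44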
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
{
    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
/** Non-saturating arithmetic **/
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)   \
template<typename _Tpvec>                            \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm)       \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }            \
inline _Tpvec operator >> (const _Tpvec& a, int imm)       \
{ return _Tpvec(shr(a.val, splfunc(imm))); }               \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a)     \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }            \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a)     \
{ return _Tpvec(shr(a.val, splfunc(imm))); }

OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
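// Illustrative usage of the shift operators (a sketch, not part of the
// original header; `x` is hypothetical):
//
//     v_uint16x8 x  = v_setall_u16(0x0F00);
//     v_uint16x8 lo = x >> 8;      // runtime shift: every lane = 0x000F
//     v_uint16x8 hi = v_shl<4>(x); // compile-time shift: every lane = 0xF000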
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec)   \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or)  \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a)     \
{ return _Tpvec(vec_not(a.val)); }

OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
/** Bitwise select **/
#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast)                             \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }

OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
/** Comparison **/
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec)               \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); }                  \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); }                  \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)  \
{ return _Tpvec(vec_cmplt(a.val, b.val)); }                  \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b)  \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); }                  \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); }                  \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
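// Illustrative usage: comparisons return all-ones/all-zeros lane masks, which
// feed v_select for branchless clamping (a sketch, not part of the original
// header; `v` is a hypothetical v_float32x4):
//
//     v_float32x4 thresh = v_setall_f32(1.0f);
//     v_float32x4 mask   = v > thresh;                // per-lane 0xFFFFFFFF or 0
//     v_float32x4 capped = v_select(mask, thresh, v); // min(v, 1.0f) per lane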
/** Min/Max **/
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
/** Rotate (element-wise shift of the whole register) **/
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast)                    \
template<int imm>                                                            \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a)                             \
{                                                                            \
    const int wd = imm * sizeof(typename _Tpvec::lane_type);                 \
    if (wd > 15)                                                             \
        return _Tpvec();                                                     \
    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}

#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast)     \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return a;
#ifdef __IBMCPP__
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
#else
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
#endif
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return b;
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
template<int imm>                                                 \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{                                                                 \
    if (imm == 1)                                                 \
        return _Tpvec(vec_permi(rg1.val, rg2.val, 2));            \
    return imm ? b : a;                                           \
}

#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)  \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{ return v_rotate_right<s>(a, b); }
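// Illustrative usage: the two-register rotate acts like a lane-wise sliding
// window over the pair (a, b), which is how unaligned neighborhoods are built
// in filters (a sketch, not part of the original header; `a` and `b` are
// hypothetical v_uint8x16 values):
//
//     v_uint8x16 shifted = v_rotate_right<1>(a, b); // {a1..a15, b0}
//     v_uint8x16 same    = v_extract<0>(a, b);      // == a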
////////// Reduce and mask /////////
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a)          \
{                                                             \
    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0);      \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a)     \
{                                                        \
    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));  \
    rs = func(rs, vec_sld(rs, rs, 4));                   \
    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));
    return v_float32x4(vec_mergeh(ac, bd));
}
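// Illustrative usage: horizontal reduction of one vector, e.g. accumulating
// the tail of a dot product (a sketch, not part of the original header;
// `acc` is a hypothetical v_float32x4 accumulator):
//
//     float total = v_reduce_sum(acc); // acc[0] + acc[1] + acc[2] + acc[3]
//     float peak  = v_reduce_max(acc); // max over the 4 lanes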
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
{
    vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
    sv = vec_sl(sv, slm);
    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
    static const vec_uint4 slm4 = {0, 0, 8, 8};
    sv4 = vec_sl(sv4, slm4);
    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
{
    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
    sv = vec_sl(sv, slm);
    vec_int4 svi = vec_int4_z;
    svi = vec_sums(vec_sum4s(sv, svi), svi);
    return vec_extract(svi, 3);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_int32x4& a)
{
    static const vec_uint4 slm = {0, 1, 2, 3};
    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
    sv = vec_sl(sv, slm);
    sv = vec_sums(sv, vec_int4_z);
    return vec_extract(sv, 3);
}
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_int64x2& a)
{
    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
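// Illustrative usage: v_signmask packs the lane sign bits into an int, lane 0
// in bit 0, so an all-true 16-lane comparison yields 0xFFFF (a sketch, not
// part of the original header; `a` and `b` are hypothetical v_uint8x16):
//
//     if (v_signmask(a == b) == 0xFFFF) { /* all 16 lanes equal */ }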
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
////////// Other math /////////

/** Some frequent operations **/
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(vec_sqrt(x.val)); }
inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(vec_sqrt(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{ return v_float32x4(vec_rsqrt(x.val)); }
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{ return v_float64x2(vec_rsqrt(x.val)); }
#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec)                                  \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); }           \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }                           \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)   \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }

OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
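// Illustrative usage: v_fma/v_muladd map to a single vec_madd, handy for
// polynomial evaluation (a sketch, not part of the original header; `x` and
// the coefficients c0..c2 are hypothetical):
//
//     // y = (c2*x + c1)*x + c0, two fused multiply-adds per 4 lanes
//     v_float32x4 y = v_fma(v_fma(v_setall_f32(c2), x, v_setall_f32(c1)),
//                           x, v_setall_f32(c0));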
// TODO: exp, log, sin, cos
/** Absolute values **/
inline v_uint8x16 v_abs(const v_int8x16& x)
{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }

inline v_uint16x8 v_abs(const v_int16x8& x)
{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }

inline v_uint32x4 v_abs(const v_int32x4& x)
{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(vec_abs(x.val)); }

inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(vec_abs(x.val)); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)

#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b)                      \
{ return _Tpvec2(cast(intrin(a.val, b.val))); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)
////////// Conversions /////////

/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }
/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
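// Illustrative usage of the conversions (a sketch, not part of the original
// header; `f` is a hypothetical v_float32x4). The double<->float paths work
// on the low two lanes only:
//
//     v_int32x4   i = v_round(f);   // nearest integer per lane
//     v_float64x2 d = v_cvt_f64(f); // widens lanes 0..1 to double
//     v_float32x4 g = v_cvt_f32(d); // back to float; upper two lanes zeroed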
856 ////////////// Lookup table access ////////////////////
858 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
860 int CV_DECL_ALIGNED(32) idx[4];
861 v_store_aligned(idx, idxvec);
862 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
865 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
867 int CV_DECL_ALIGNED(32) idx[4];
868 v_store_aligned(idx, idxvec);
869 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
872 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
874 int CV_DECL_ALIGNED(32) idx[4];
875 v_store_aligned(idx, idxvec);
876 return v_float64x2(tab[idx[0]], tab[idx[1]]);
879 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
881 int CV_DECL_ALIGNED(32) idx[4];
882 v_store_aligned(idx, idxvec);
883 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
884 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
887 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
889 int CV_DECL_ALIGNED(32) idx[4];
890 v_store_aligned(idx, idxvec);
891 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
892 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
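// Illustrative usage: gather four table entries selected by an index vector
// (a sketch, not part of the original header; `table` is a hypothetical float
// array and the indices must be in range):
//
//     v_int32x4   idx  = v_int32x4(0, 5, 2, 7);
//     v_float32x4 vals = v_lut(table, idx); // {table[0], table[5], table[2], table[7]}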
inline void v_cleanup() {}

/** Reinterpret **/
/** it's up there with the load and store operations **/
////////// Matrix operations /////////

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2)              \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,         \
                           const _Tpvec& a2, const _Tpvec& a3,         \
                           _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
{                                                                      \
    _Tpvec2 a02 = vec_mergeh(a0.val, a2.val);                          \
    _Tpvec2 a13 = vec_mergeh(a1.val, a3.val);                          \
    b0.val = vec_mergeh(a02, a13);                                     \
    b1.val = vec_mergel(a02, a13);                                     \
    a02 = vec_mergel(a0.val, a2.val);                                  \
    a13 = vec_mergel(a1.val, a3.val);                                  \
    b2.val = vec_mergeh(a02, a13);                                     \
    b3.val = vec_mergel(a02, a13);                                     \
}
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
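// Illustrative usage: in-register 4x4 transpose, e.g. turning four row
// vectors of a matrix into its columns (a sketch, not part of the original
// header; r0..r3 are hypothetical v_float32x4 rows):
//
//     v_float32x4 c0, c1, c2, c3;
//     v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // c_i holds column i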
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_VSX) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

} // namespace cv
#endif // OPENCV_HAL_VSX_HPP