// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP

#include "opencv2/core/utility.hpp"

#define CV_SIMD128_64F 1
/*
 * TODO: support half precision for POWER9,
 *       using the convert instructions xvcvhpsp and xvcvsphp.
 */
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

///////// Types ////////////
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };
    vec_uchar16 val;
    explicit v_uint8x16(const vec_uchar16& v) : val(v) {}
    v_uint8x16() : val(vec_uchar16_z) {}
    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v)) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
    uchar get0() const
    { return vec_extract(val, 0); }
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };
    vec_char16 val;
    explicit v_int8x16(const vec_char16& v) : val(v) {}
    v_int8x16() : val(vec_char16_z) {}
    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v)) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
    schar get0() const
    { return vec_extract(val, 0); }
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };
    vec_ushort8 val;
    explicit v_uint16x8(const vec_ushort8& v) : val(v) {}
    v_uint16x8() : val(vec_ushort8_z) {}
    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v)) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
    ushort get0() const
    { return vec_extract(val, 0); }
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };
    vec_short8 val;
    explicit v_int16x8(const vec_short8& v) : val(v) {}
    v_int16x8() : val(vec_short8_z) {}
    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v)) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
    short get0() const
    { return vec_extract(val, 0); }
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };
    vec_uint4 val;
    explicit v_uint32x4(const vec_uint4& v) : val(v) {}
    v_uint32x4() : val(vec_uint4_z) {}
    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v)) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3)) {}
    unsigned get0() const
    { return vec_extract(val, 0); }
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };
    vec_int4 val;
    explicit v_int32x4(const vec_int4& v) : val(v) {}
    v_int32x4() : val(vec_int4_z) {}
    v_int32x4(vec_bint4 v) : val(vec_int4_c(v)) {}
    v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3)) {}
    int get0() const
    { return vec_extract(val, 0); }
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };
    vec_float4 val;
    explicit v_float32x4(const vec_float4& v) : val(v) {}
    v_float32x4() : val(vec_float4_z) {}
    v_float32x4(vec_bint4 v) : val(vec_float4_c(v)) {}
    v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3)) {}
    float get0() const
    { return vec_extract(val, 0); }
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };
    vec_udword2 val;
    explicit v_uint64x2(const vec_udword2& v) : val(v) {}
    v_uint64x2() : val(vec_udword2_z) {}
    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v)) {}
    v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1)) {}
    uint64 get0() const
    { return vec_extract(val, 0); }
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };
    vec_dword2 val;
    explicit v_int64x2(const vec_dword2& v) : val(v) {}
    v_int64x2() : val(vec_dword2_z) {}
    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v)) {}
    v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1)) {}
    int64 get0() const
    { return vec_extract(val, 0); }
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };
    vec_double2 val;
    explicit v_float64x2(const vec_double2& v) : val(v) {}
    v_float64x2() : val(vec_double2_z) {}
    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v)) {}
    v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1)) {}
    double get0() const
    { return vec_extract(val, 0); }
};
//////////////// Load and store operations ///////////////

/*
 * clang-5 aborts while parsing "vec_xxx_c" when it appears inside a
 * function template that is defined through a preprocessor macro;
 * if vec_xxx_c is defined as a C++ cast, clang-5 accepts it.
 */
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }

OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
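
// Usage sketch (illustrative, based on the initializers declared above):
//     v_float32x4 ones = v_setall_f32(1.0f);          // broadcast a scalar to all 4 lanes
//     v_int32x4   zero = v_setzero_s32();             // all lanes = 0
//     v_float32x4 bits = v_reinterpret_as_f32(zero);  // bit-level reinterpretation, no conversion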
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(ld(0, ptr)); } \
inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
{ return _Tpvec(ld_a(0, ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vec_ld_l8(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vec_st_h8(a.val, ptr); }

#define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)

OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)

OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
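
// Usage sketch (illustrative, based on the load/store helpers above):
//     float src[4] = {1.f, 2.f, 3.f, 4.f}, dst[4];
//     v_float32x4 a = v_load(src);      // unaligned 128-bit load
//     v_store(dst, a);                  // unaligned 128-bit store
//     v_store_low(dst, a);              // store only the two lowest lanes (8 bytes)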
//////////////// Value reordering ///////////////

#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
{ vec_ld_deinterleave(ptr, a.val, b.val); } \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
                                _Tpvec& b, _Tpvec& c) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
                                _Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
                               const _Tpvec& b, const _Tpvec& c, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               const _Tpvec& c, const _Tpvec& d, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }

OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
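
// Usage sketch (illustrative, based on the interleave helpers above):
//     // split 16 packed BGR pixels (48 bytes) into three planes and back
//     uchar bgr[16*3];
//     v_uint8x16 vb, vg, vr;
//     v_load_deinterleave(bgr, vb, vg, vr);
//     v_store_interleave(bgr, vb, vg, vr);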
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = fh(a.val); \
    b1.val = fl(a.val); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }

inline v_int32x4 v_load_expand_q(const schar* ptr)
{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
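
// Usage sketch (illustrative, based on the expand helpers above):
//     uchar pix[16];
//     v_uint8x16 u8 = v_load(pix);
//     v_uint16x8 lo, hi;
//     v_expand(u8, lo, hi);                   // widen 16 x u8 into 2 x (8 x u16)
//     v_uint32x4 q = v_load_expand_q(pix);    // first 4 bytes widened to u32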
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(pkfnc(a.val, b.val)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    vec_st_l8(pkfnc(a.val, a.val), ptr); \
} \
template<int n> \
inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
} \
template<int n> \
inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
}
OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
                         vec_sr, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packs, vec_adds, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
                         vec_sr, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
                         vec_sra, vec_packs, vec_add, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
                         vec_sr, vec_pack, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
                         vec_sra, vec_pack, vec_add, pack)

OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packsu, vec_adds, pack_u)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
                         vec_sra, vec_packsu, vec_add, pack_u)
// Following variant is not implemented on other platforms:
//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
//                         vec_sra, vec_packsu, vec_add, pack_u)
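
// Usage sketch (illustrative, based on the pack helpers above):
//     v_int16x8 w0 = v_setall_s16(300), w1 = v_setall_s16(-5);
//     v_uint8x16 u = v_pack_u(w0, w1);        // saturating s16 -> u8 (300 -> 255, -5 -> 0)
//     v_int8x16  s = v_rshr_pack<4>(w0, w1);  // rounding right shift by 4, then saturating pack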
template <typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
{
    b0.val = vec_mergeh(a0.val, a1.val);
    b1.val = vec_mergel(a0.val, a1.val);
}

template <typename _Tpvec>
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesql(a.val, b.val)); }

template <typename _Tpvec>
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesqh(a.val, b.val)); }

template <typename _Tpvec>
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    c.val = vec_mergesqh(a.val, b.val);
    d.val = vec_mergesql(a.val, b.val);
}
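
// Usage sketch (illustrative, based on the reordering helpers above):
//     v_float32x4 a = v_setall_f32(1.f), b = v_setall_f32(2.f), lo, hi;
//     v_zip(a, b, lo, hi);                   // lo = {a0,b0,a1,b1}, hi = {a2,b2,a3,b3}
//     v_float32x4 c = v_combine_low(a, b);   // {a0,a1,b0,b1}
//     v_float32x4 d = v_combine_high(a, b);  // {a2,a3,b2,b3}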
////////// Arithmetic, bitwise and comparison operations /////////

/* Element-wise binary and unary operations */
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
{
    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}

/** Non-saturating arithmetic **/
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
template<typename _Tpvec> \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
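
// Usage sketch (illustrative, contrasting the operators above with the wrap variants):
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 sat  = a + b;             // 8/16-bit operators saturate: lanes become 255
//     v_uint8x16 wrap = v_add_wrap(a, b);  // wrapping add: lanes become 44 (300 mod 256)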
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(shr(a.val, splfunc(imm))); }

OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
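
// Usage sketch (illustrative, based on the shift operators above):
//     v_int32x4 x = v_setall_s32(-8);
//     v_int32x4 h = x >> 1;        // arithmetic (algebraic) shift on signed lanes: -4
//     v_int32x4 d = v_shl<2>(x);   // compile-time shift amount: -32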
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(vec_not(a.val)); }

OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)

/** Bitwise select **/
#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }

OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)

#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }

OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)

OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
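
// Usage sketch (illustrative, combining the comparison, select and min/max helpers above):
//     v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f);
//     v_float32x4 mask = a < b;                  // per-lane mask: all bits set where true
//     v_float32x4 sel  = v_select(mask, a, b);   // picks a where the mask is set, else b
//     v_float32x4 mn = v_min(a, b), mx = v_max(a, b);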
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
{ \
    const int wd = imm * sizeof(typename _Tpvec::lane_type); \
    if (wd > 15) \
        return _Tpvec(); \
    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}

#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)

OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return a;
#ifdef __IBMCPP__
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
#else
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
#endif
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return b;
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}

#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
    if (imm == 1) \
        return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
    return imm ? b : a; \
}

#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)

OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)

template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{ return v_rotate_right<s>(a, b); }
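
// Usage sketch (illustrative, based on the rotate/extract helpers above):
//     v_int32x4 a = v_int32x4(0, 1, 2, 3), b = v_int32x4(4, 5, 6, 7);
//     v_int32x4 r1 = v_rotate_right<1>(a);      // {1, 2, 3, 0} (zero fill)
//     v_int32x4 r2 = v_rotate_right<1>(a, b);   // {1, 2, 3, 4} (pulls lanes in from b)
//     v_int32x4 e  = v_extract<2>(a, b);        // {2, 3, 4, 5}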
////////// Reduce and mask /////////

inline short v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}

#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)

#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
    rs = func(rs, vec_sld(rs, rs, 4)); \
    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));
    return v_float32x4(vec_mergeh(ac, bd));
}
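
// Usage sketch (illustrative, based on the reduction helpers above):
//     v_float32x4 v = v_float32x4(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(v);   // 10.f
//     float m = v_reduce_max(v);   // 4.f
//     // v_reduce_sum4(a, b, c, d) returns {sum(a), sum(b), sum(c), sum(d)} in one vector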
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }

inline int v_signmask(const v_uint8x16& a)
{
    vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
    sv = vec_sl(sv, slm);
    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
    static const vec_uint4 slm4 = {0, 0, 8, 8};
    sv4 = vec_sl(sv4, slm4);
    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_int16x8& a)
{
    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
    sv = vec_sl(sv, slm);
    vec_int4 svi = vec_int4_z;
    svi = vec_sums(vec_sum4s(sv, svi), svi);
    return vec_extract(svi, 3);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_int32x4& a)
{
    static const vec_uint4 slm = {0, 1, 2, 3};
    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
    sv = vec_sl(sv, slm);
    sv = vec_sums(sv, vec_int4_z);
    return vec_extract(sv, 3);
}
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }

inline int v_signmask(const v_int64x2& a)
{
    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }

template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }

template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
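
// Usage sketch (illustrative, based on the mask helpers above):
//     v_float32x4 a = v_setall_f32(1.f), t = v_setall_f32(0.5f);
//     v_float32x4 mask = a > t;
//     int bits = v_signmask(mask);     // one bit per lane, taken from the lane MSB
//     bool all = v_check_all(mask);    // true if every lane passed the comparison
//     bool any = v_check_any(mask);    // true if at least one lane passed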
////////// Other math /////////

/** Some frequent operations **/
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(vec_sqrt(x.val)); }
inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(vec_sqrt(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{ return v_float32x4(vec_rsqrt(x.val)); }
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{ return v_float64x2(vec_rsqrt(x.val)); }

#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }

OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
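
// Usage sketch (illustrative, based on the multiply-add helpers above):
//     v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
//     v_float32x4 r = v_fma(a, b, c);       // a * b + c, fused on VSX (vec_madd)
//     v_float32x4 m = v_magnitude(a, b);    // sqrt(a*a + b*b) per lane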
// TODO: exp, log, sin, cos

/** Absolute values **/
inline v_uint8x16 v_abs(const v_int8x16& x)
{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }

inline v_uint16x8 v_abs(const v_int16x8& x)
{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }

inline v_uint32x4 v_abs(const v_int32x4& x)
{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(vec_abs(x.val)); }

inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(vec_abs(x.val)); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)

#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec2(cast(intrin(a.val, b.val))); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)

////////// Conversions /////////

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
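
// Usage sketch (illustrative, based on the conversion helpers above):
//     v_float32x4 f = v_float32x4(1.3f, -1.7f, 0.f, 0.f);
//     v_int32x4 r = v_round(f);     // nearest integer: {1, -2, 0, 0}
//     v_int32x4 t = v_trunc(f);     // toward zero:     {1, -1, 0, 0}
//     v_float64x2 d = v_cvt_f64(r); // low two lanes converted to double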
////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
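
// Usage sketch (illustrative, based on the lookup helpers above):
//     static const float tab[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
//     v_int32x4 idx = v_int32x4(0, 2, 4, 6);
//     v_float32x4 g = v_lut(tab, idx);      // gather: {tab[0], tab[2], tab[4], tab[6]}
//     v_float32x4 x, y;
//     v_lut_deinterleave(tab, idx, x, y);   // x = tab[idx[i]], y = tab[idx[i]+1]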
inline void v_cleanup() {}

/** Reinterpret **/
/** it's defined up above, with the load and store operations **/
////////// Matrix operations /////////

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
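
// Usage sketch (illustrative, based on the dot-product and matrix helpers above):
//     v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//     v_int32x4 dp = v_dotprod(a, b);   // per pair of lanes: a0*b0 + a1*b1 -> each lane = 24
//     // v_matmul(v, m0, m1, m2, m3) computes v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3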
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
{ \
    _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \
    _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \
    b0.val = vec_mergeh(a02, a13); \
    b1.val = vec_mergel(a02, a13); \
    a02 = vec_mergel(a0.val, a2.val); \
    a13 = vec_mergel(a1.val, a3.val); \
    b2.val = vec_mergeh(a02, a13); \
    b3.val = vec_mergel(a02, a13); \
}
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_VSX) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_VSX_HPP