vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
-_Tpvec v_rshift_round_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
-void v_rshift_round_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
vst1_##suffix(ptr, a1); \
}
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
-OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u32, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
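// A scalar reference for the renamed v_rshr_pack family (an illustrative
// sketch, not part of the HAL): each wide lane is shifted right by n with
// rounding, i.e. (x + 2^(n-1)) >> n, then saturated into the narrow type.
// saturate_cast stands in for the usual OpenCV helper of that name.
template<int n, typename Tw, typename Tn>
inline Tn rshr_pack_ref(Tw x)
{
    Tw rounded = (Tw)((x + ((Tw)1 << (n - 1))) >> n); // rounding right shift
    return saturate_cast<Tn>(rounded);                // narrow with saturation
}
// e.g. rshr_pack_ref<4, int, short>(100) == (100 + 8) >> 4 == 6.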
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
-template<int n> inline _Tpvec lshift(const _Tpvec& a) \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
-template<int n> inline _Tpvec rshift(const _Tpvec& a) \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
-template<int n> inline _Tpvec rshift_round(const _Tpvec& a) \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
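// Usage sketch for the renamed shift API (names as introduced above): the
// template forms take the count as a compile-time argument and map to the
// immediate-form NEON shifts, while the operators accept a runtime count.
//   v_uint16x8 x = v_setall_u16(248);
//   v_uint16x8 a = v_shl<4>(x);   // 3968          (vshlq_n_u16)
//   v_uint16x8 b = v_shr<4>(x);   // 15, truncated (vshrq_n_u16)
//   v_uint16x8 c = v_rshr<4>(x);  // 16, rounded: (248 + 8) >> 4 (vrshrq_n_u16)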
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
template<int n> inline
-v_uint8x16 v_rshift_round_pack(const v_uint16x8& a, const v_uint16x8& b)
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                   _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}
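// Why the n > 0 assumption matters (editor's sketch): delta = 1 << (n-1)
// implements round-half-up, (x + 2^(n-1)) >> n, which is only well-formed for
// n >= 1. It also means the logically shifted values have a zero top bit, so
// _mm_packus_epi16, which interprets its operands as signed, still saturates
// them correctly.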
template<int n> inline
-void v_rshift_round_pack_store(uchar* ptr, const v_uint16x8& a)
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
template<int n> inline
-v_uint8x16 v_rshift_round_pack_u(const v_int16x8& a, const v_int16x8& b)
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                   _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}
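// Note on the _u variants: they take signed inputs but produce an unsigned
// result. The arithmetic shift _mm_srai_epi16 keeps negative lanes negative,
// and _mm_packus_epi16 then clamps them to zero.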
template<int n> inline
-void v_rshift_round_pack_u_store(uchar* ptr, const v_int16x8& a)
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
template<int n> inline
-v_int8x16 v_rshift_round_pack(const v_int16x8& a, const v_int16x8& b)
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}
template<int n> inline
-void v_rshift_round_pack_store(schar* ptr, const v_int16x8& a)
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
template<int n> inline
-v_uint16x8 v_rshift_round_pack(const v_uint32x4& a, const v_uint32x4& b)
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}
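// Rationale for the delta32/32768 bias (sketch): SSE2 has no unsigned
// 32 -> 16 pack, so the values are biased down by 32768 into the signed range
// that _mm_packs_epi32 saturates to, and the bias is added back in the 16-bit
// domain afterwards (as a subtraction of -32768).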
template<int n> inline
-void v_rshift_round_pack_store(ushort* ptr, const v_uint32x4& a)
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
_mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)));
}
template<int n> inline
-void v_rshift_round_pack_u_store(ushort* ptr, const v_int32x4& a)
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
_mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)));
}
template<int n> inline
-v_int16x8 v_rshift_round_pack(const v_int32x4& a, const v_int32x4& b)
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
__m128i delta = _mm_set1_epi32(1 << (n-1));
return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}
template<int n> inline
-void v_rshift_round_pack_store(short* ptr, const v_int32x4& a)
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
__m128i delta = _mm_set1_epi32(1 << (n-1));
__m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}
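// Typical use (illustrative sketch): normalizing 32-bit accumulator sums by a
// power of two with rounding while narrowing, e.g. dividing by 16:
//   v_int32x4 sum(100, 200, 300, 40000);
//   short out[4];
//   v_rshr_pack_store<4>(out, sum);  // out[i] = saturate((sum[i] + 8) >> 4)
//                                    // -> {6, 13, 19, 2500}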
template<int n> inline
-v_uint32x4 v_rshift_round_pack(const v_uint64x2& a, const v_uint64x2& b)
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
uint64 delta = (uint64)1 << (n-1);
v_uint64x2 delta2(delta, delta);
- __m128 a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
- __m128 b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
+ __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
+ __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 b0 a0_hi b0_hi
__m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1 b1 a1_hi b1_hi
return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
}
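// Sketch of the unpack sequence above: after the rounding shift, each 64-bit
// lane holds its 32-bit result in the low half. The first two unpacks
// interleave a and b ({a0, b0, ...} and {a1, b1, ...}), and the final epi32
// unpack reorders those into the packed {a0, a1, b0, b1} layout.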
template<int n> inline
-void v_rshift_round_pack_store(unsigned* ptr, const v_uint64x2& a)
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
uint64 delta = (uint64)1 << (n-1);
v_uint64x2 delta2(delta, delta);
__m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
_mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 0, 2, 0))); // gather the low 32 bits of each lane
}
template<int n> inline
-v_int32x4 v_rshift_round_pack(const v_int64x2& a, const v_int64x2& b)
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
int64 delta = (int64)1 << (n-1);
v_int64x2 delta2(delta, delta);
}
template<int n> inline
-void v_rshift_round_pack_store(int* ptr, const v_int64x2& a)
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
int64 delta = (int64)1 << (n-1);
v_int64x2 delta2(delta, delta);
return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
-inline _Tpuvec v_lshift(const _Tpuvec& a) \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
-inline _Tpsvec v_lshift(const _Tpsvec& a) \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
-inline _Tpuvec v_rshift(const _Tpuvec& a) \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
-inline _Tpsvec v_rshift(const _Tpsvec& a) \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
return _Tpsvec(srai(a.val, imm)); \
}
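// Usage sketch (names as renamed above): the template forms encode the count
// as an immediate; signed vectors shift right arithmetically via the srai
// helper, unsigned ones logically via _mm_srli.
//   v_int32x4 x = v_setall_s32(-64);
//   v_int32x4 s = v_shr<3>(x);   // -8, arithmetic shift keeps the sign
//   v_uint32x4 y = v_setall_u32(64);
//   v_uint32x4 u = v_shr<3>(y);  // 8, logical shift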