b = v_uint64x4(b0);
}
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
- b = v_uint8x32(b0);
- g = v_uint8x32(g0);
- r = v_uint8x32(r0);
+ a = v_uint8x32(b0);
+ b = v_uint8x32(g0);
+ c = v_uint8x32(r0);
}
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
- b = v_uint16x16(b0);
- g = v_uint16x16(g0);
- r = v_uint16x16(r0);
+ a = v_uint16x16(b0);
+ b = v_uint16x16(g0);
+ c = v_uint16x16(r0);
}
-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
g0 = _mm256_shuffle_epi32(g0, 0xb1);
r0 = _mm256_shuffle_epi32(r0, 0xc6);
- b = v_uint32x8(b0);
- g = v_uint32x8(g0);
- r = v_uint32x8(r0);
+ a = v_uint32x8(b0);
+ b = v_uint32x8(g0);
+ c = v_uint32x8(r0);
}
-inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
__m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
- b = v_uint64x4(b0);
- g = v_uint64x4(g0);
- r = v_uint64x4(r0);
+ a = v_uint64x4(b0);
+ b = v_uint64x4(g0);
+ c = v_uint64x4(r0);
}
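For reference, a minimal usage sketch of the renamed three-channel deinterleave. This is not part of the diff: the include path and the CV_SIMD256 guard reflect the standard universal-intrinsics setup, and the buffer name and the wrapper function are hypothetical. Since the arguments are positional, callers are unaffected by the b/g/r to a/b/c rename.

#include <opencv2/core/hal/intrin.hpp>  // OpenCV universal intrinsics

#if CV_SIMD256  // AVX2 path: 256-bit registers, v_uint8x32 is available
// 'bgr' is a hypothetical buffer of 32 packed 3-channel pixels (96 bytes).
void demo_deinterleave3(const unsigned char* bgr)
{
    using namespace cv;
    v_uint8x32 c0, c1, c2;
    // The call is positional, so the parameter rename does not affect callers:
    // c0 receives channel 0, c1 channel 1, c2 channel 2.
    v_load_deinterleave(bgr, c0, c1, c2);
}
#endif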
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- b = v_uint8x32(b0);
- g = v_uint8x32(g0);
- r = v_uint8x32(r0);
- a = v_uint8x32(a0);
+ a = v_uint8x32(b0);
+ b = v_uint8x32(g0);
+ c = v_uint8x32(r0);
+ d = v_uint8x32(a0);
}
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- b = v_uint16x16(b0);
- g = v_uint16x16(g0);
- r = v_uint16x16(r0);
- a = v_uint16x16(a0);
+ a = v_uint16x16(b0);
+ b = v_uint16x16(g0);
+ c = v_uint16x16(r0);
+ d = v_uint16x16(a0);
}
-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
{
__m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- b = v_uint32x8(b0);
- g = v_uint32x8(g0);
- r = v_uint32x8(r0);
- a = v_uint32x8(a0);
+ a = v_uint32x8(b0);
+ b = v_uint32x8(g0);
+ c = v_uint32x8(r0);
+ d = v_uint32x8(a0);
}
-inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
{
__m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i r0 = _mm256_unpacklo_epi64(h02, h13);
__m256i a0 = _mm256_unpackhi_epi64(h02, h13);
- b = v_uint64x4(b0);
- g = v_uint64x4(g0);
- r = v_uint64x4(r0);
- a = v_uint64x4(a0);
+ a = v_uint64x4(b0);
+ b = v_uint64x4(g0);
+ c = v_uint64x4(r0);
+ d = v_uint64x4(a0);
}
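Likewise, a hedged sketch of the four-channel overload under the same assumptions (AVX2 enabled, hypothetical buffer and wrapper names); the fourth output simply picks up the last channel of each pixel.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD256
// 'bgra' is a hypothetical buffer of 32 packed 4-channel pixels (128 bytes).
void demo_deinterleave4(const unsigned char* bgra)
{
    using namespace cv;
    v_uint8x32 c0, c1, c2, c3;
    // a, b, c, d map to channels 0..3 of the interleaved source, in order.
    v_load_deinterleave(bgra, c0, c1, c2, c3);
}
#endif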
///////////////////////////// store interleave /////////////////////////////////////
}
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
const __m256i sh_b = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
- __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
- __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
- __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
+ __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
+ __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+ __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
}
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
const __m256i sh_b = _mm256_setr_epi8(
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
- __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
- __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
- __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
+ __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
+ __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+ __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
}
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
- __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
- __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
+ __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
+ __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
+ __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
__m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
__m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
}
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
- __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
- __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
+ __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
+ __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
+ __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
__m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
__m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
}
}
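And a sketch of the store side, again outside the diff and under the same assumptions, showing the optional hal::StoreMode argument declared in the signatures above; the round trip through the hypothetical src/dst buffers is illustrative only.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD256
// Round trip through hypothetical 'src'/'dst' buffers of 32 packed 3-channel pixels.
void demo_interleave3(const unsigned char* src, unsigned char* dst)
{
    using namespace cv;
    v_uint8x32 c0, c1, c2;
    v_load_deinterleave(src, c0, c1, c2);
    // hal::STORE_UNALIGNED is the default mode; it is spelled out here only for clarity.
    v_store_interleave(dst, c0, c1, c2, hal::STORE_UNALIGNED);
}
#endif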
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
- const v_uint8x32& r, const v_uint8x32& a,
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
+ const v_uint8x32& c, const v_uint8x32& d,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
- __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
- __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
- __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
+ __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
+ __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
+ __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
+ __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
__m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
}
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
- const v_uint16x16& r, const v_uint16x16& a,
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
+ const v_uint16x16& c, const v_uint16x16& d,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
- __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
- __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
- __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
+ __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
+ __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
+ __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
+ __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
__m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
}
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
- const v_uint32x8& r, const v_uint32x8& a,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
+ const v_uint32x8& c, const v_uint32x8& d,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
- __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
- __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
- __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
+ __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
+ __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
+ __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
+ __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
__m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
}
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
- const v_uint64x4& r, const v_uint64x4& a,
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
+ const v_uint64x4& c, const v_uint64x4& d,
hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
- __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
- __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
- __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
+ __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
+ __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
+ __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
+ __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
__m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
__m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);