+// De-interleave 16 BGR pixels of 16-bit elements (48 ushorts) from ptr into
+// separate b/g/r vectors using AVX2.  Strategy: regroup 128-bit lanes so the
+// byte-blend masks can pick out every third 16-bit element per channel, then
+// repair the within-lane ordering with a final pshufb.
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
+{
+ // Three unaligned 256-bit loads cover the 48 interleaved ushorts.
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+ // Combine the 128-bit halves of bgr0/bgr2: low halves together, high halves
+ // together (immediate = dst lane selectors: 0+2*16 -> lows, 1+3*16 -> highs).
+ __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+ __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+ // Byte masks (-1 selects from the second blend operand) marking the 16-bit
+ // positions that belong to each source in the stride-3 channel pattern.
+ static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+ 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+ static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+ -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+ // Two-level blends gather all 16 elements of one channel into one register,
+ // but still in scrambled within-lane order.
+ __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+ __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+ __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+ // Per-channel pshufb control bytes (pairs, since elements are 16-bit) that
+ // restore ascending element order inside each 128-bit lane.
+ static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+ 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+ 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+ static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+ 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ b0 = _mm256_shuffle_epi8(b0, sh_b);
+ g0 = _mm256_shuffle_epi8(g0, sh_g);
+ r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+ b = v_uint16x16(b0);
+ g = v_uint16x16(g0);
+ r = v_uint16x16(r0);
+}
+
+// De-interleave 8 BGR pixels of 32-bit elements (24 unsigned) from ptr into
+// separate b/g/r vectors using AVX2.  Same lane-regroup + stride-3 blend idea
+// as the 16-bit variant, but with dword blends (immediate masks) and a final
+// 32-bit shuffle to restore in-lane order.
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
+{
+ // Three unaligned 256-bit loads cover the 24 interleaved dwords.
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+ // Pair up the 128-bit halves of bgr0/bgr2 (lows together, highs together).
+ __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+ __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+ // Immediate dword masks 0x24 (0b00100100) and 0x92 (0b10010010) pick every
+ // third 32-bit element, phase-shifted, so each result holds one channel.
+ __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
+ __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
+ __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
+
+ // Per-channel in-lane dword permutations that restore ascending order.
+ b0 = _mm256_shuffle_epi32(b0, 0x6c);
+ g0 = _mm256_shuffle_epi32(g0, 0xb1);
+ r0 = _mm256_shuffle_epi32(r0, 0xc6);
+
+ b = v_uint32x8(b0);
+ g = v_uint32x8(g0);
+ r = v_uint32x8(r0);
+}
+
+// De-interleave 4 BGR pixels of 64-bit elements (12 uint64) from ptr into
+// separate b/g/r vectors using AVX2.  With only 4 qwords per register the
+// whole permutation is done with half-register blends, one cross-lane
+// permute, and unpack/align operations.
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
+{
+ // Three unaligned 256-bit loads cover the 12 interleaved qwords.
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+ // 0xf0 keeps the low 128 bits of the first operand and takes the high 128
+ // bits from the second, splicing adjacent loads into pixel-aligned halves.
+ __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
+ __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
+ // Splice bgr2-low with bgr0-high, then reverse the qwords (0x1b = 0,1,2,3
+ // -> 3,2,1,0) so subsequent unpacks see the elements in the needed slots.
+ __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
+ // Gather each channel: low-qword unpack for b, byte-align for g (shift by
+ // one qword = 8 bytes), high-qword unpack for r.
+ __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
+ __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
+ __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
+
+ b = v_uint64x4(b0);
+ g = v_uint64x4(g0);
+ r = v_uint64x4(r0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
+{
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+ __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
+ static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+
+ __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+ __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+ __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+ __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+ __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+ __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+ __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+ __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+ __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+ __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+ __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+ __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+ __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+ __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+ __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+ __m256i a0 = _mm256_unpackhi_epi32(phl, phh);