return _mm_cvtsi32_si64 (data);
}
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+ return _mm_cvtsi32_si128 (data);
+}
+
static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+ return _mm_shufflelo_epi16 (
+ unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
static force_inline __m64
pix_multiply_1x64 (__m64 data,
__m64 alpha)
mask_x0101);
}
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+ __m128i alpha)
+{
+ return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+ mask_0080),
+ mask_0101);
+}
+
static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
__m64* alpha_dst,
return _mm_adds_pu8 (t1, t2);
}
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+ __m128i* alpha_dst,
+ __m128i* dst,
+ __m128i* alpha_src)
+{
+ __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+ __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+ return _mm_adds_epu8 (t1, t2);
+}
+
static force_inline __m64
negate_1x64 (__m64 data)
{
return _mm_xor_si64 (data, mask_x00ff);
}
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+ return _mm_xor_si128 (data, mask_00ff);
+}
+
static force_inline __m64
invert_colors_1x64 (__m64 data)
{
return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+ return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
*dst);
}
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+ return over_1x128 (pix_multiply_1x128 (*src, *mask),
+ pix_multiply_1x128 (*alpha, *mask),
+ *dst);
+}
+
static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
dst);
}
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+ __m128i alpha = expand_alpha_1x128 (src);
+
+ return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+ _mm_or_si128 (alpha, mask_alpha)),
+ alpha,
+ dst);
+}
+
static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+ return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
*
* 00RR00GG00BB
return _mm_srli_pi16 (p, 8);
}
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+ __m128i m = _mm_cvtsi32_si128 (pixel);
+
+ m = unpack_565_to_8888 (m);
+
+ return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
/* ----------------------------------------------------------------------------
* Compose Core transformations
*/
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
uint8_t a;
- __m64 ms;
+ __m128i xmms;
a = src >> 24;
}
else if (src)
{
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (
- over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+ xmms = unpack_32_1x128 (src);
+ return pack_1x128_32 (
+ over_1x128 (xmms, expand_alpha_1x128 (xmms),
+ unpack_32_1x128 (dst)));
}
return dst;
if (pm)
{
- __m64 ms, mm;
+ __m128i ms, mm;
- mm = unpack_32_1x64 (*pm);
- mm = expand_alpha_1x64 (mm);
+ mm = unpack_32_1x128 (*pm);
+ mm = expand_alpha_1x128 (mm);
- ms = unpack_32_1x64 (s);
- ms = pix_multiply_1x64 (ms, mm);
+ ms = unpack_32_1x128 (s);
+ ms = pix_multiply_1x128 (ms, mm);
- s = pack_1x64_32 (ms);
+ s = pack_1x128_32 (ms);
}
return s;
}
else if (maska != 0xff)
{
- return pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (dst),
- expand_alpha_1x64 (unpack_32_1x64 (src))));
+ return pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (dst),
+ expand_alpha_1x128 (unpack_32_1x128 (src))));
}
return dst;
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
if (pm)
pm++;
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
ps++;
if (pm)
pm++;
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
ps++;
if (pm)
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
ps++;
if (pm)
core_combine_atop_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
- __m64 da = expand_alpha_1x64 (d);
+ __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+ __m128i da = expand_alpha_1x128 (d);
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
static force_inline void
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
static force_inline void
core_combine_xor_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
- __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+ __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
static force_inline void
ps++;
if (pm)
pm++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
w--;
}
d = *pd;
ps++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
if (pm)
pm++;
}
core_combine_saturate_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 ms = unpack_32_1x64 (src);
- __m64 md = unpack_32_1x64 (dst);
+ __m128i ms = unpack_32_1x128 (src);
+ __m128i md = unpack_32_1x128 (dst);
uint32_t sa = src >> 24;
uint32_t da = ~dst >> 24;
if (sa > da)
{
- ms = pix_multiply_1x64 (
- ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+ ms = pix_multiply_1x128 (
+ ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
}
- return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+ return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
static force_inline void
{
s = *ps++;
m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
w--;
}
{
s = *ps++;
m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
w--;
}
}
uint32_t mask,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expand_alpha_1x64 (s);
- __m64 unpk_mask = unpack_32_1x64 (mask);
- __m64 unpk_dst = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i expAlpha = expand_alpha_1x128 (s);
+ __m128i unpk_mask = unpack_32_1x128 (mask);
+ __m128i unpk_dst = unpack_32_1x128 (dst);
- return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+ return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
static force_inline void
uint32_t mask,
uint32_t dst)
{
- __m64 d = unpack_32_1x64 (dst);
+ __m128i d = unpack_32_1x128 (dst);
- return pack_1x64_32 (
- over_1x64 (d, expand_alpha_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (src),
- unpack_32_1x64 (mask))));
+ return pack_1x128_32 (
+ over_1x128 (d, expand_alpha_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (src),
+ unpack_32_1x128 (mask))));
}
static force_inline void
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
}
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
}
}
uint32_t mask,
uint32_t dst)
{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = expand_alpha_1x64 (d);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = expand_alpha_1x128 (d);
- s = pix_multiply_1x64 (s, m);
- m = negate_1x64 (pix_multiply_1x64 (m, sa));
+ s = pix_multiply_1x128 (s, m);
+ m = negate_1x128 (pix_multiply_1x128 (m, sa));
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
static force_inline void
uint32_t mask,
uint32_t dst)
{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
- __m64 sa = expand_alpha_1x64 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i sa = expand_alpha_1x128 (s);
- s = pix_multiply_1x64 (s, m);
- m = pix_multiply_1x64 (m, sa);
+ s = pix_multiply_1x128 (s, m);
+ m = pix_multiply_1x128 (m, sa);
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
static force_inline void
uint32_t mask,
uint32_t dst)
{
- __m64 a = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i a = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
- a, expand_alpha_1x64 (s)));
- __m64 dest = pix_multiply_1x64 (s, a);
- __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+ __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+ a, expand_alpha_1x128 (s)));
+ __m128i dest = pix_multiply_1x128 (s, a);
+ __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
- return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d,
&alpha_dst,
&dest,
&alpha_src));
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
}
while (w && (unsigned long)dst & 15)
{
d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
while (w)
{
d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
d = *dst;
*dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
+ pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ expand565_16_1x128 (d))));
w--;
}
{
d = *dst;
*dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
+ pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+ expand565_16_1x128 (d))));
}
}
__m128i xmm_dst;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
srca = src >> 24;
xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
}
pd++;
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
}
pd++;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
if (m)
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
if (m)
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (
- in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ *pd = pack_1x128_32 (
+ in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
}
pd++;
{
uint32_t d = *dst;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmm_mask);
- __m64 alpha_dst = unpack_32_1x64 (d);
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
}
dst++;
w--;
{
uint32_t d = *dst;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &mask, &dest));
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
}
dst++;
uint32_t s = (*src++) | 0xff000000;
uint32_t d = *dst;
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
w--;
}
uint32_t s = (*src++) | 0xff000000;
uint32_t d = *dst;
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
w--;
}
static force_inline uint16_t
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
- __m64 ms;
+ __m128i ms;
- ms = unpack_32_1x64 (src);
+ ms = unpack_32_1x128 (src);
return pack_565_32_16 (
- pack_1x64_32 (
- over_1x64 (
- ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+ pack_1x128_32 (
+ over_1x128 (
+ ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
}
static void
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
xmm_def = create_mask_2x32_128 (src, src);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
if (m)
{
d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
if (m)
{
d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
if (m)
{
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
}
else
{
if (m)
{
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_src, expand_pixel_8_1x128 (m)));
}
else
{
int dst_stride, mask_stride;
int32_t w;
uint32_t m;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
__m128i xmm_src, xmm_alpha;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
if (m)
{
d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
if (m)
{
d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
int32_t w;
uint32_t opaque, zero;
- __m64 ms;
+ __m128i ms;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
s = *src++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
*dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
w--;
}
s = *src++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
*dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
w--;
}
}
s = *src++;
d = *dst;
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
s = *src++;
d = *dst;
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
if (m)
{
d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
if (m)
{
d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (xmm_alpha,
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
}
{
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
{
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
}
s = (uint32_t) *src++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
s = (uint32_t) *src++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
}
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
while (w && ((unsigned long)dst & 15))
{
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
w--;
dst++;
while (w)
{
- *dst = (uint8_t)_mm_cvtsi64_si32 (
- _mm_adds_pu8 (
- _mm_movepi64_pi64 (xmm_src),
- _mm_cvtsi32_si64 (*dst)));
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
w--;
dst++;
uint32_t m;
int src_stride, mask_stride, dst_stride;
int32_t w;
- __m64 ms;
+ __m128i ms;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
s = 0xff000000 | *src++;
m = (uint32_t) *mask++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
if (m != 0xff)
{
- __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- __m64 md = unpack_32_1x64 (d);
+ __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ __m128i md = unpack_32_1x128 (d);
- ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+ ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
}
- *dst++ = pack_1x64_32 (ms);
+ *dst++ = pack_1x128_32 (ms);
w--;
}
}
else
{
- __m64 ma, md, ms;
+ __m128i ma, md, ms;
d = *dst;
- ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- md = unpack_32_1x64 (d);
- ms = unpack_32_1x64 (s);
+ ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ md = unpack_32_1x128 (d);
+ ms = unpack_32_1x128 (s);
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
}
}
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
while (w && (unsigned long)dst & 15)
{
- __m64 vd;
+ __m128i vd;
- vd = unpack_32_1x64 (*dst);
+ vd = unpack_32_1x128 (*dst);
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
w--;
dst++;
}
while (w)
{
- __m64 vd;
+ __m128i vd;
- vd = unpack_32_1x64 (*dst);
+ vd = unpack_32_1x128 (*dst);
- *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
- _mm_movepi64_pi64 (xmm_src)));
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
w--;
dst++;
}
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
{
uint32_t d = *dst;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmm_mask);
- __m64 alpha_dst = unpack_32_1x64 (d);
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
}
dst++;
w--;
{
uint32_t d = *dst;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &mask, &dest));
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
}
dst++;