Allows us to tune how we store data from the vector registers.
Signed-off-by: Matt Turner <mattst88@gmail.com>
return _mm_packs_pu16 (lo, hi);
}
return _mm_packs_pu16 (lo, hi);
}
-static force_inline uint32_t
-store8888 (__m64 v)
+static force_inline void
+store8888 (uint32_t *dest, __m64 v)
- return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
+ v = pack8888 (v, _mm_setzero_si64());
+ *dest = _mm_cvtsi64_si32 (v);
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
m = expand_alpha (m);
s = pix_multiply (s, m);
m = expand_alpha (m);
s = pix_multiply (s, m);
__m64 s, sa;
s = load8888 (ssrc);
sa = expand_alpha (s);
__m64 s, sa;
s = load8888 (ssrc);
sa = expand_alpha (s);
- *dest = store8888 (over (s, sa, load8888 (*dest)));
+ store8888 (dest, over (s, sa, load8888 (*dest)));
d = load8888 (*dest);
da = expand_alpha (d);
d = load8888 (*dest);
da = expand_alpha (d);
- *dest = store8888 (over (d, da, load8888 (s)));
+ store8888 (dest, over (d, da, load8888 (s)));
a = expand_alpha (a);
x = pix_multiply (x, a);
a = expand_alpha (a);
x = pix_multiply (x, a);
a = load8888 (combine (src, mask));
a = expand_alpha (a);
x = pix_multiply (x, a);
a = load8888 (combine (src, mask));
a = expand_alpha (a);
x = pix_multiply (x, a);
a = expand_alpha (a);
a = negate (a);
x = pix_multiply (x, a);
a = expand_alpha (a);
a = negate (a);
x = pix_multiply (x, a);
a = negate (a);
x = pix_multiply (x, a);
a = negate (a);
x = pix_multiply (x, a);
sia = negate (sia);
da = expand_alpha (d);
s = pix_add_mul (s, da, d, sia);
sia = negate (sia);
da = expand_alpha (d);
s = pix_add_mul (s, da, d, sia);
dia = expand_alpha (d);
dia = negate (dia);
s = pix_add_mul (s, dia, d, sa);
dia = expand_alpha (d);
dia = negate (dia);
s = pix_add_mul (s, dia, d, sa);
sia = negate (sia);
dia = negate (dia);
s = pix_add_mul (s, dia, d, sia);
sia = negate (sia);
dia = negate (dia);
s = pix_add_mul (s, dia, d, sia);
s = load8888 (combine (src, mask));
d = load8888 (*dest);
s = pix_add (s, d);
s = load8888 (combine (src, mask));
d = load8888 (*dest);
s = pix_add (s, d);
- *dest = store8888 (md);
+ store8888 (dest, md);
__m64 s = load8888 (*src);
s = pix_multiply (s, a);
__m64 s = load8888 (*src);
s = pix_multiply (s, a);
__m64 d = load8888 (*dest);
__m64 sa = expand_alpha (s);
__m64 d = load8888 (*dest);
__m64 sa = expand_alpha (s);
- *dest = store8888 (in_over (s, sa, a, d));
+ store8888 (dest, in_over (s, sa, a, d));
__m64 d = load8888 (*dest);
__m64 da = expand_alpha (d);
__m64 d = load8888 (*dest);
__m64 da = expand_alpha (d);
- *dest = store8888 (over (d, da, in (s, a)));
+ store8888 (dest, over (d, da, in (s, a)));
s = pix_multiply (s, a);
s = pix_multiply (s, da);
s = pix_multiply (s, a);
s = pix_multiply (s, da);
a = pix_multiply (a, sa);
d = pix_multiply (d, a);
a = pix_multiply (a, sa);
d = pix_multiply (d, a);
da = negate (da);
s = pix_multiply (s, a);
s = pix_multiply (s, da);
da = negate (da);
s = pix_multiply (s, a);
s = pix_multiply (s, da);
a = pix_multiply (a, sa);
a = negate (a);
d = pix_multiply (d, a);
a = pix_multiply (a, sa);
a = negate (a);
d = pix_multiply (d, a);
a = pix_multiply (a, sa);
a = negate (a);
d = pix_add_mul (d, a, s, da);
a = pix_multiply (a, sa);
a = negate (a);
d = pix_add_mul (d, a, s, da);
a = pix_multiply (a, sa);
da = negate (da);
d = pix_add_mul (d, a, s, da);
a = pix_multiply (a, sa);
da = negate (da);
d = pix_add_mul (d, a, s, da);
da = negate (da);
a = negate (a);
d = pix_add_mul (d, a, s, da);
da = negate (da);
a = negate (a);
d = pix_add_mul (d, a, s, da);
s = pix_multiply (s, a);
d = pix_add (s, d);
s = pix_multiply (s, a);
d = pix_add (s, d);
while (w && (unsigned long)dst & 7)
{
while (w && (unsigned long)dst & 7)
{
- *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+ store8888 (dst, over (vsrc, vsrca, load8888 (*dst)));
- *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+ store8888 (dst, over (vsrc, vsrca, load8888 (*dst)));
{
__m64 vdest = load8888 (*q);
vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
{
__m64 vdest = load8888 (*q);
vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
- *q = store8888 (vdest);
+ store8888 (q, vdest);
{
__m64 vdest = load8888 (*q);
vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
{
__m64 vdest = load8888 (*q);
vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
- *q = store8888 (vdest);
+ store8888 (q, vdest);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
- *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+ store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
- *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+ store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
__m64 s = load8888 (*src | 0xff000000);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src | 0xff000000);
__m64 d = load8888 (*dst);
- *dst = store8888 (in_over (s, srca, vmask, d));
+ store8888 (dst, in_over (s, srca, vmask, d));
__m64 s = load8888 (*src | 0xff000000);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src | 0xff000000);
__m64 d = load8888 (*dst);
- *dst = store8888 (in_over (s, srca, vmask, d));
+ store8888 (dst, in_over (s, srca, vmask, d));
__m64 ms, sa;
ms = load8888 (s);
sa = expand_alpha (ms);
__m64 ms, sa;
ms = load8888 (s);
sa = expand_alpha (ms);
- *dst = store8888 (over (ms, sa, load8888 (*dst)));
+ store8888 (dst, over (ms, sa, load8888 (*dst)));
expand_alpha_rev (to_m64 (m)),
load8888 (*dst));
expand_alpha_rev (to_m64 (m)),
load8888 (*dst));
- *dst = store8888 (vdest);
+ store8888 (dst, vdest);
vdest = in_over (
vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
vdest = in_over (
vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
- *dst = store8888 (vdest);
+ store8888 (dst, vdest);
{
__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
{
__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
- *dst = store8888 (vdest);
+ store8888 (dst, vdest);
__m64 vdest = load8888 (*dst);
vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
__m64 vdest = load8888 (*dst);
vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
- *dst = store8888 (vdest);
+ store8888 (dst, vdest);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
- *dst = store8888 (over_rev_non_pre (s, d));
+ store8888 (dst, over_rev_non_pre (s, d));
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
- *dst = store8888 (over_rev_non_pre (s, d));
+ store8888 (dst, over_rev_non_pre (s, d));
vmask = load8888 (ldl_u((uint32_t *)mask));
vdest = load8888 (*(uint32_t *)dst);
vmask = load8888 (ldl_u((uint32_t *)mask));
vdest = load8888 (*(uint32_t *)dst);
- *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
+ store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
uint32_t *s = (uint32_t *)src;
uint32_t *d = (uint32_t *)dst;
uint32_t *s = (uint32_t *)src;
uint32_t *d = (uint32_t *)dst;
- *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d)));
+ store8888 (d, in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d)));
vmask = load8888 (ldl_u((uint32_t *)mask));
vdest = load8888 (*(uint32_t *)dst);
vmask = load8888 (ldl_u((uint32_t *)mask));
vdest = load8888 (*(uint32_t *)dst);
- *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
+ store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
__m64 vm = expand_alpha_rev (to_m64 (m));
__m64 vdest = in_over (s, sa, vm, load8888 (*dst));
__m64 vm = expand_alpha_rev (to_m64 (m));
__m64 vdest = in_over (s, sa, vm, load8888 (*dst));
- *dst = store8888 (vdest);
+ store8888 (dst, vdest);