static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
- __m64 t0, t1, alpha = _mm_setzero_si64 ();;
+ __m64 t0, t1, alpha = _mm_setzero_si64 ();
__m64 r = _mm_and_si64 (vin, MC (expand_565_r));
__m64 g = _mm_and_si64 (vin, MC (expand_565_g));
__m64 b = _mm_and_si64 (vin, MC (expand_565_b));
{
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
+ __m64 vsrc0, vsrc1, vsrc2, vsrc3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
- __m64 vsrc0 = load8888 ((src + 0));
- __m64 vsrc1 = load8888 ((src + 1));
- __m64 vsrc2 = load8888 ((src + 2));
- __m64 vsrc3 = load8888 ((src + 3));
+ vsrc0 = load8888 ((src + 0));
+ vsrc1 = load8888 ((src + 1));
+ vsrc2 = load8888 ((src + 2));
+ vsrc3 = load8888 ((src + 3));
v0 = over (vsrc0, expand_alpha (vsrc0), v0);
v1 = over (vsrc1, expand_alpha (vsrc1), v1);
{
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
+ __m64 vm0, vm1, vm2, vm3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
- __m64 vm0 = to_m64 (m0);
+ vm0 = to_m64 (m0);
v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
- __m64 vm1 = to_m64 (m1);
+ vm1 = to_m64 (m1);
v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
- __m64 vm2 = to_m64 (m2);
+ vm2 = to_m64 (m2);
v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
- __m64 vm3 = to_m64 (m3);
+ vm3 = to_m64 (m3);
v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
/* fetch 2x2 pixel block into 2 mmx registers */ \
__m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
__m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
- vx += unit_x; \
/* vertical interpolation */ \
__m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
__m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
__m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
__m64 hi = _mm_add_pi16 (t_hi, b_hi); \
__m64 lo = _mm_add_pi16 (t_lo, b_lo); \
+ vx += unit_x; \
if (BILINEAR_INTERPOLATION_BITS < 8) \
{ \
/* calculate horizontal weights */ \
__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
_mm_srli_pi16 (mm_x, \
16 - BILINEAR_INTERPOLATION_BITS))); \
- mm_x = _mm_add_pi16 (mm_x, mm_ux); \
/* horizontal interpolation */ \
__m64 p = _mm_unpacklo_pi16 (lo, hi); \
__m64 q = _mm_unpackhi_pi16 (lo, hi); \
16 - BILINEAR_INTERPOLATION_BITS)); \
__m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
16 - BILINEAR_INTERPOLATION_BITS); \
- mm_x = _mm_add_pi16 (mm_x, mm_ux); \
/* horizontal interpolation */ \
__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
_mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
} \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
/* shift and pack the result */ \
hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \