From: Matt Turner Date: Wed, 27 Jun 2012 17:00:36 +0000 (-0400) Subject: mmx: optimize bilinear function when using 7-bit precision X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9aa8e3a26071739d160496ef9f6126f296c500eb;p=platform%2Fupstream%2Fpixman.git mmx: optimize bilinear function when using 7-bit precision Loongson: image firefox-fishtank 1037.738 1040.218 0.19% 3/3 image firefox-fishtank 1056.611 1057.581 0.20% 3/3 ARM/iwMMXt: image firefox-fishtank 1487.282 1492.640 0.17% 3/3 image firefox-fishtank 1363.913 1364.366 0.11% 3/3 --- diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 0c79f3a..5441d6b 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -3522,11 +3522,14 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, } #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) +#define BMSK (BSHIFT - 1) #define BILINEAR_DECLARE_VARIABLES \ const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ + const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ + const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ const __m64 mm_zero = _mm_setzero_si64 (); \ __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) @@ -3544,21 +3547,37 @@ do { \ __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ - /* calculate horizontal weights */ \ - __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ + if (BILINEAR_INTERPOLATION_BITS < 8) \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ + _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS))); \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ + /* horizontal interpolation */ \ + __m64 p = _mm_unpacklo_pi16 (lo, hi); \ + __m64 q = _mm_unpackhi_pi16 (lo, hi); \ + lo = _mm_madd_pi16 (p, mm_wh); \ + hi = _mm_madd_pi16 (q, mm_wh); \ + } \ + else \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ 16 - BILINEAR_INTERPOLATION_BITS)); \ - __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ + __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ 16 - BILINEAR_INTERPOLATION_BITS); \ - mm_x = _mm_add_pi16 (mm_x, mm_ux); \ - /* horizontal interpolation */ \ - __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ - __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ - __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ - __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ - lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ - _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ - hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ - _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ + /* horizontal interpolation */ \ + __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ + __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ + __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ + __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ + lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ + hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ + } \ /* shift and pack the result */ \ hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \