From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Mon, 25 Jun 2012 04:24:27 +0000 (+0300)
Subject: sse2: faster bilinear scaling (use _mm_loadl_epi64)
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ff5d041b88c667141b891909acd3085c3ed54994;p=platform%2Fupstream%2Fpixman.git

sse2: faster bilinear scaling (use _mm_loadl_epi64)

Using _mm_loadl_epi64() to load two pixels at once (pairs of top
and bottom pixels) is faster than loading each pixel separately
and combining them with _mm_set_epi32().

=== cairo-perf-trace ===

before: image             firefox-fishtank   66.912   66.931   0.13%    3/3
after:  image             firefox-fishtank   57.584   58.349   0.74%    3/3

=== lowlevel-blt-bench ===

before: src_8888_8888 =  L1: 181.10  L2: 179.14  M:178.08 ( 11.02%)  HT:153.22  VT:133.45  R:142.24  RT: 95.32
after:  src_8888_8888 =  L1: 228.68  L2: 225.75  M:223.98 ( 14.23%)  HT:185.32  VT:155.06  R:162.73  RT:102.52

This improvement was suggested by Matt Turner on irc.
---

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 0604254..ef82a18 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5377,17 +5377,16 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
 do {										\
     __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
-    /* fetch 2x2 pixel block into sse2 register */				\
-    uint32_t tl = src_top [pixman_fixed_to_int (vx)];				\
-    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];			\
-    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];			\
-    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];			\
-    a = _mm_set_epi32 (tr, tl, br, bl);						\
+    /* fetch 2x2 pixel block into sse2 registers */				\
+    __m128i tltr = _mm_loadl_epi64 (						\
+			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
+    __m128i blbr = _mm_loadl_epi64 (						\
+			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
     vx += unit_x;								\
     /* vertical interpolation */						\
-    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
+    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
 					xmm_wt),				\
-		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
+		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
 					xmm_wb));				\
     /* calculate horizontal weights */						\
     xmm_wh = _mm_add_epi16 (xmm_addc,						\