From: Liu Xinyun
Date: Sat, 25 Sep 2010 06:56:38 +0000 (+0800)
Subject: Remove cache prefetch code.
X-Git-Tag: 1.0_branch~455
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ba69989374fe9cbe5151c5aac7b824da0806f94a;p=profile%2Fivi%2Fpixman.git

Remove cache prefetch code.

Cache prefetching decreases performance, especially on Atom, so remove
this code. The experimental results are below.

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom N270 @ 1.6GHz
OS: MeeGo (32-bit)

Speedups
========
image-rgba              poppler-0   17125.68 (17279.58 0.92%) ->  14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba            ocitysmap-0    9008.25 ( 9040.41 7.50%) ->   8277.94 ( 8343.09 5.44%):  1.09x speedup
image-rgba    xfce4-terminal-a1-0   18020.76 (18230.68 0.97%) ->  16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba   gnome-terminal-vim-0   25081.38 (25133.38 0.24%) ->  23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba    firefox-talos-gfx-0   57916.97 (57973.20 0.11%) ->  54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba firefox-planet-gnome-0  102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba   swfdec-giant-steps-0   12376.24 (12616.84 1.02%) ->  11705.30 (11825.20 1.06%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU T9600 @ 2.80GHz
OS: Ubuntu 10.04 (64-bit)

Speedups
========
image-rgba            ocitysmap-0    2671.46 ( 2691.82 8.55%) ->   2296.20 ( 2307.26 5.77%):  1.16x speedup
image-rgba   swfdec-giant-steps-0    1614.55 ( 1615.18 1.68%) ->   1532.84 ( 1538.52 0.72%):  1.05x speedup

Signed-off-by: Liu Xinyun
Signed-off-by: Chen Miaobo
---
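Note below the fold: to sanity-check the effect on other hardware, a
standalone loop like the sketch below is usually enough to show whether
_mm_prefetch helps or hurts a streaming composite. This is only an
illustration, not the benchmark that produced the numbers above: the
buffer size, iteration count, and function names are arbitrary
assumptions, and only the 64-byte lookahead mirrors the removed
cache_prefetch_next helper.

/* prefetch_bench.c -- minimal sketch, not the harness used above.
 * Times one streaming SSE2 pass over two large buffers, with and
 * without _mm_prefetch.
 *
 * Build: gcc -std=gnu99 -O2 -msse2 prefetch_bench.c -lrt -o prefetch_bench
 * (-lrt is needed for clock_gettime on older glibc)
 */
#include <emmintrin.h>   /* SSE2 integer ops */
#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define N    (1 << 22)   /* 4M pixels, well beyond L2 on both test CPUs */
#define ITER 64

static double
run_pass (uint32_t *dst, const uint32_t *src, int use_prefetch)
{
    struct timespec t0, t1;

    clock_gettime (CLOCK_MONOTONIC, &t0);

    for (int i = 0; i < ITER; i++)
    {
        for (int j = 0; j < N; j += 4)
        {
            if (use_prefetch)
            {
                /* one 64-byte cache line ahead, as the removed
                 * cache_prefetch_next helper did; prefetch is only a
                 * hint and cannot fault, so running past the end of
                 * the buffer on the last iteration is harmless */
                _mm_prefetch ((const char *)(src + j) + 64, _MM_HINT_T0);
                _mm_prefetch ((const char *)(dst + j) + 64, _MM_HINT_T0);
            }

            /* stand-in for a combiner: saturating add of 4 pixels */
            __m128i s = _mm_loadu_si128 ((const __m128i *)(src + j));
            __m128i d = _mm_load_si128 ((const __m128i *)(dst + j));
            _mm_store_si128 ((__m128i *)(dst + j), _mm_adds_epu8 (s, d));
        }
    }

    clock_gettime (CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
    uint32_t *src, *dst;

    /* 16-byte alignment so the aligned dst load/store are legal */
    if (posix_memalign ((void **)&src, 16, N * sizeof (uint32_t)) ||
        posix_memalign ((void **)&dst, 16, N * sizeof (uint32_t)))
        return 1;

    memset (src, 0x55, N * sizeof (uint32_t));
    memset (dst, 0x11, N * sizeof (uint32_t));

    printf ("without prefetch: %.3f s\n", run_pass (dst, src, 0));
    printf ("with prefetch:    %.3f s\n", run_pass (dst, src, 1));

    free (src);
    free (dst);
    return 0;
}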
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }

-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
          */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
          */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
                                  const uint32_t* pm,
                                  int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
                          const uint32_t* pm,
                          int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
         xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i s;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         s = combine4 ((__m128i*)ps, (__m128i*)pm);

         save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst = load_128_aligned ((__m128i*)pd);
         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
         dst_line += dst_stride;
         mask_line += mask_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w && (unsigned long)pd & 15)
         {
             m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)pd);
-            cache_prefetch_next ((__m128i*)pm);
-
             xmm_mask = load_128_unaligned ((__m128i*)pm);

             pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
         dst_line += dst_stride;
         mask_line += mask_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w && (unsigned long)pd & 15)
         {
             m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)pd);
-            cache_prefetch_next ((__m128i*)pm);
-
             xmm_mask = load_128_unaligned ((__m128i*)pm);

             pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             *dst++ = *src++ | 0xff000000;
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 16)
         {
             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;

-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src = _mm_or_si128 (
                 load_128_unaligned ((__m128i*)src), mask_ff000000);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         /* It's a 8 pixel loop */
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* I'm loading unaligned because I'm not sure
              * about the address alignment.
              */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             m = *((uint32_t*)mask);

             if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);

     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
         byte_line += stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 1 && ((unsigned long)d & 1))
         {
             *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 128)
         {
-            cache_prefetch (((__m128i*)d) + 12);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,

         if (w >= 64)
         {
-            cache_prefetch (((__m128i*)d) + 8);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         if (w >= 32)
         {
             save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 16;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             m = *((uint32_t*)mask);

             if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*) dst);
             unpack_565_128_4x128 (xmm_dst,
                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* First round */
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_src_hi = load_128_unaligned ((__m128i*)src);

             opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
         mask_line += mask_stride;
         dst_line += dst_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
             mask++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* First round */
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
         dst_line += dst_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
         dst_line += dst_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             save_128_aligned (
                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
         int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         dst_bytes += dst_stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 2 && ((unsigned long)d & 3))
         {
             *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 64)
         {
             __m128i xmm0, xmm1, xmm2, xmm3;

-            /* 128 bytes ahead */
-            cache_prefetch (((__m128i*)s) + 8);
-            cache_prefetch (((__m128i*)d) + 8);
-
             xmm0 = load_128_unaligned ((__m128i*)(s));
             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 16)
         {
             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             s += 16;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;

             if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
             __m128i tmp_lo, tmp_hi;

-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)(dst + 4));
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);

             if (!is_transparent (xmm_mask))
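What the patch leaves behind is the same three-phase structure every one
of these fast paths already had, minus the hint calls. As a
self-contained reference, here is that shape modelled on the context
lines of sse2_composite_src_x888_8888 above. The function name is made
up for illustration; the real function also has a 16-pixel inner loop
and per-scanline stride handling, both elided here.

/* Simplified sketch of a post-patch SSE2 fast path: scalar head until
 * dst reaches a 16-byte boundary, a wide SSE2 body with aligned dst
 * accesses, then a scalar tail.  The x888 -> 8888 "force alpha to 0xff"
 * operation stands in for the real combine math.
 */
#include <emmintrin.h>
#include <stdint.h>

static void
src_x888_8888_shape (uint32_t *dst, const uint32_t *src, int w)
{
    const __m128i mask_ff000000 = _mm_set1_epi32 ((int)0xff000000);

    /* head: one pixel at a time until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
        *dst++ = *src++ | 0xff000000;
        w--;
    }

    /* body: 4 pixels per iteration; dst is now aligned, while src is
     * loaded unaligned because its alignment is unknown */
    while (w >= 4)
    {
        __m128i s = _mm_loadu_si128 ((const __m128i *)src);
        _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (s, mask_ff000000));
        dst += 4;
        src += 4;
        w -= 4;
    }

    /* tail: remaining 1-3 pixels */
    while (w)
    {
        *dst++ = *src++ | 0xff000000;
        w--;
    }
}

int
main (void)
{
    uint32_t src[64], dst[64];

    for (int i = 0; i < 64; i++)
        src[i] = (uint32_t)i;

    /* the head loop absorbs whatever alignment the stack gives dst */
    src_x888_8888_shape (dst, src, 64);
    return dst[0] == 0xff000000 ? 0 : 1;
}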