From: Liu Xinyun
Date: Tue, 21 Sep 2010 16:15:10 +0000 (+0800)
Subject: add enable-cache-prefetch option
X-Git-Tag: 1.0_branch~459
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=19ea0e16b958e5abe491365c203293ab372f3586;p=profile%2Fivi%2Fpixman.git

add enable-cache-prefetch option

OK, here is the patch that removes all of the cache prefetch code. Please review it. Thanks.

On Tue, Sep 21, 2010 at 11:36:30PM +0800, Soeren Sandmann wrote:
> Liu Xinyun writes:
>
> > This patch adds a new configuration option, enable-cache-prefetch,
> > which defaults to yes.
> >
> > Here is a link to a discussion of the cache issue:
> > http://lists.freedesktop.org/archives/pixman/2010-June/000218.html
> >
> > When it is disabled on an Atom CPU (configured with
> > --enable-cache-prefetch=no), there is a small performance gain.
> > Here is the patch.
>
> I think the cache prefetch code should just be deleted outright. No
> benchmarks that I'm aware of show it to be an improvement.
>
> Thanks,
> Soren
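For readers coming to the thread cold, here is the pattern in question, reduced
to a self-contained sketch. The helper names and the 64-bytes-ahead stride match
the code removed in the patch below; the x888-to-8888 copy loop is a simplified
stand-in for the dozens of pixman combiners that carried these hints, not
pixman's actual implementation:

    /* prefetch-sketch.c -- what the cache-prefetch pattern looked like.
     * Build with: gcc -O2 -msse2 -c prefetch-sketch.c */
    #include <emmintrin.h>  /* SSE2: __m128i, loads/stores, _mm_prefetch */
    #include <stdint.h>

    static inline void
    cache_prefetch (__m128i *addr)
    {
        /* ask for the cache line holding addr, into all levels (T0 hint) */
        _mm_prefetch ((char const *) addr, _MM_HINT_T0);
    }

    static inline void
    cache_prefetch_next (__m128i *addr)
    {
        /* one 64-byte cache line ahead: 4 x 16-byte vectors */
        _mm_prefetch ((char const *) (addr + 4), _MM_HINT_T0);
    }

    /* Force the alpha byte of w x888 pixels to 0xff, 4 pixels at a time. */
    static void
    src_x888_8888 (uint32_t *dst, const uint32_t *src, int w)
    {
        const __m128i mask_ff000000 = _mm_set1_epi32 ((int) 0xff000000);

        cache_prefetch ((__m128i *) src);   /* prime the first lines */
        cache_prefetch ((__m128i *) dst);

        while (w && ((uintptr_t) dst & 15)) /* align dst on 16 bytes */
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }

        while (w >= 4)
        {
            cache_prefetch_next ((__m128i *) src);  /* hint the next line */
            cache_prefetch_next ((__m128i *) dst);

            _mm_store_si128 ((__m128i *) dst,
                             _mm_or_si128 (_mm_loadu_si128 ((const __m128i *) src),
                                           mask_ff000000));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w--)                          /* trailing pixels */
            *dst++ = *src++ | 0xff000000;
    }

The hints cost extra instructions on every iteration of every blend loop, and
since these loops walk memory strictly sequentially, a hardware prefetcher will
typically have the next line in flight anyway -- which is consistent with the
observation above that no benchmark shows the hints helping.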
From bca2192ef524bcae4eea84d0ffed9e8c4855675f Mon Sep 17 00:00:00 2001
From: Liu Xinyun
Date: Wed, 22 Sep 2010 00:11:56 +0800
Subject: [PATCH] remove cache prefetch

---
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }

-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
          */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
          */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
                                  const uint32_t* pm,
                                  int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
                          const uint32_t* pm,
                          int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
         xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i s;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         s = combine4 ((__m128i*)ps, (__m128i*)pm);

         save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst = load_128_aligned ((__m128i*)pd);
         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
         w--;
     }

-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_dst = load_128_aligned ((__m128i*)dst);

         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 8)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_dst = load_128_aligned ((__m128i*)dst);

         unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     dst_line += dst_stride;
     mask_line += mask_stride;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_mask = load_128_unaligned ((__m128i*)pm);

         pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     dst_line += dst_stride;
     mask_line += mask_stride;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_mask = load_128_unaligned ((__m128i*)pm);

         pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-    cache_prefetch ((__m128i*)src);
-
     while (w && (unsigned long)dst & 15)
     {
         uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-    cache_prefetch ((__m128i*)src);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-        cache_prefetch_next ((__m128i*)src);
-
         xmm_src = load_128_unaligned ((__m128i*)src);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-
     while (w && (unsigned long)dst & 15)
     {
         *dst++ = *src++ | 0xff000000;
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-
     while (w >= 16)
     {
         __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)src);
-
         xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
         xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
         xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-    cache_prefetch ((__m128i*)src);
-
     while (w && (unsigned long)dst & 15)
     {
         uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-    cache_prefetch ((__m128i*)src);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-        cache_prefetch_next ((__m128i*)src);
-
         xmm_src = _mm_or_si128 (
             load_128_unaligned ((__m128i*)src), mask_ff000000);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     /* It's a 8 pixel loop */
     while (w >= 8)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)src);
-        cache_prefetch_next ((__m128i*)dst);
-
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
          */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     mask_line += mask_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && (unsigned long)dst & 15)
     {
         uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
         dst++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         m = *((uint32_t*)mask);

         if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);

     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
         byte_line += stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 1 && ((unsigned long)d & 1))
         {
             *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 128)
         {
-            cache_prefetch (((__m128i*)d) + 12);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,

         if (w >= 64)
         {
-            cache_prefetch (((__m128i*)d) + 8);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         if (w >= 32)
         {
             save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 16;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
     mask_line += mask_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && (unsigned long)dst & 15)
     {
         uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
         dst++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         m = *((uint32_t*)mask);

         if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     mask_line += mask_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && (unsigned long)dst & 15)
     {
         m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
         dst++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 8)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_dst = load_128_aligned ((__m128i*) dst);
         unpack_565_128_4x128 (xmm_dst,
                               &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && (unsigned long)dst & 15)
     {
         s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 8)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)src);
-        cache_prefetch_next ((__m128i*)dst);
-
         /* First round */
         xmm_src = load_128_unaligned ((__m128i*)src);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && (unsigned long)dst & 15)
     {
         s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)src);
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)src);

         opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     mask_line += mask_stride;
     dst_line += dst_stride;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
         mask++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 8)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         /* First round */
         xmm_mask = load_128_unaligned ((__m128i*)mask);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
     mask_line += mask_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 16)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_mask = load_128_unaligned ((__m128i*)mask);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
     dst_line += dst_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 16)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_dst = load_128_aligned ((__m128i*)dst);

         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
     src_line += src_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)src);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 16)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)src);
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_src = load_128_unaligned ((__m128i*)src);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
     mask_line += mask_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)mask);
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 16)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)mask);
-        cache_prefetch_next ((__m128i*)dst);
-
         xmm_mask = load_128_unaligned ((__m128i*)mask);
         xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
     dst_line += dst_stride;
     w = width;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-
     while (w && ((unsigned long)dst & 15))
     {
         *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
         dst++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 16)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)dst);
-
         save_128_aligned (
             (__m128i*)dst,
             _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
         int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         dst_bytes += dst_stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 2 && ((unsigned long)d & 3))
         {
             *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 64)
         {
             __m128i xmm0, xmm1, xmm2, xmm3;

-            /* 128 bytes ahead */
-            cache_prefetch (((__m128i*)s) + 8);
-            cache_prefetch (((__m128i*)d) + 8);
-
             xmm0 = load_128_unaligned ((__m128i*)(s));
             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 16)
         {
             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             w -= 16;
             d += 16;
             s += 16;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;

             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;

             if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
         dst++;
     }

-    cache_prefetch ((__m128i*)dst);
-
     while (w >= 4)
     {
         __m128i tmp_lo, tmp_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)(dst + 4));
-
         xmm_dst = load_128_aligned ((__m128i*)dst);

         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);

             if (!is_transparent (xmm_mask))