From: Liu Xinyun
Date: Sat, 25 Sep 2010 06:56:38 +0000 (+0800)
Subject: Remove cache prefetch code.
X-Git-Tag: 1.0_branch~455
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ba69989374fe9cbe5151c5aac7b824da0806f94a;p=profile%2Fivi%2Fpixman.git

Remove cache prefetch code.

Cache prefetching decreases performance, especially on Atom, so remove
this code. The experimental results are below.

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom N270 @ 1.6GHz
OS: MeeGo (32-bit)

Speedups
========
image-rgba              poppler-0   17125.68 (17279.58 0.92%) ->  14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba            ocitysmap-0    9008.25 ( 9040.41 7.50%) ->   8277.94 ( 8343.09 5.44%):  1.09x speedup
image-rgba    xfce4-terminal-a1-0   18020.76 (18230.68 0.97%) ->  16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba   gnome-terminal-vim-0   25081.38 (25133.38 0.24%) ->  23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba    firefox-talos-gfx-0   57916.97 (57973.20 0.11%) ->  54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba firefox-planet-gnome-0  102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba   swfdec-giant-steps-0   12376.24 (12616.84 1.02%) ->  11705.30 (11825.20 1.06%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU T9600 @ 2.80GHz
OS: Ubuntu 10.04 (64-bit)

Speedups
========
image-rgba            ocitysmap-0    2671.46 ( 2691.82 8.55%) ->   2296.20 ( 2307.26 5.77%):  1.16x speedup
image-rgba   swfdec-giant-steps-0    1614.55 ( 1615.18 1.68%) ->   1532.84 ( 1538.52 0.72%):  1.05x speedup

Signed-off-by: Liu Xinyun
Signed-off-by: Chen Miaobo
---
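Note below the fold: to sanity-check the effect on other hardware, a
standalone loop like the sketch below is usually enough to show whether
_mm_prefetch helps or hurts a streaming composite. This is only an
illustration, not the benchmark that produced the numbers above: the
buffer size, iteration count, and function names are arbitrary
assumptions, and only the 64-byte lookahead mirrors the removed
cache_prefetch_next helper.

/* prefetch_bench.c -- minimal sketch, not the harness used above.
 * Times one streaming SSE2 pass over two large buffers, with and
 * without _mm_prefetch.
 *
 * Build: gcc -std=gnu99 -O2 -msse2 prefetch_bench.c -lrt -o prefetch_bench
 * (-lrt is needed for clock_gettime on older glibc)
 */
#include <emmintrin.h>   /* SSE2 integer ops */
#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define N    (1 << 22)   /* 4M pixels, well beyond L2 on both test CPUs */
#define ITER 64

static double
run_pass (uint32_t *dst, const uint32_t *src, int use_prefetch)
{
    struct timespec t0, t1;

    clock_gettime (CLOCK_MONOTONIC, &t0);

    for (int i = 0; i < ITER; i++)
    {
        for (int j = 0; j < N; j += 4)
        {
            if (use_prefetch)
            {
                /* one 64-byte cache line ahead, as the removed
                 * cache_prefetch_next helper did; prefetch is only a
                 * hint and cannot fault, so running past the end of
                 * the buffer on the last iteration is harmless */
                _mm_prefetch ((const char *)(src + j) + 64, _MM_HINT_T0);
                _mm_prefetch ((const char *)(dst + j) + 64, _MM_HINT_T0);
            }

            /* stand-in for a combiner: saturating add of 4 pixels */
            __m128i s = _mm_loadu_si128 ((const __m128i *)(src + j));
            __m128i d = _mm_load_si128 ((const __m128i *)(dst + j));
            _mm_store_si128 ((__m128i *)(dst + j), _mm_adds_epu8 (s, d));
        }
    }

    clock_gettime (CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
    uint32_t *src, *dst;

    /* 16-byte alignment so the aligned dst load/store are legal */
    if (posix_memalign ((void **)&src, 16, N * sizeof (uint32_t)) ||
        posix_memalign ((void **)&dst, 16, N * sizeof (uint32_t)))
        return 1;

    memset (src, 0x55, N * sizeof (uint32_t));
    memset (dst, 0x11, N * sizeof (uint32_t));

    printf ("without prefetch: %.3f s\n", run_pass (dst, src, 0));
    printf ("with prefetch:    %.3f s\n", run_pass (dst, src, 1));

    free (src);
    free (dst);
    return 0;
}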
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }

-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
          */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure
          * about the address alignment.
          */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
                                  const uint32_t* pm,
                                  int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
                          const uint32_t* pm,
                          int w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i xmm_src_lo, xmm_src_hi;
         __m128i xmm_dst_lo, xmm_dst_hi;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
         s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
         xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
         __m128i s;

-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         s = combine4 ((__m128i*)ps, (__m128i*)pm);

         save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
         pm++;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         xmm_dst = load_128_aligned ((__m128i*)pd);
         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
         s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
         w--;
     }

-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
         dst_line += dst_stride;
         mask_line += mask_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w && (unsigned long)pd & 15)
         {
             m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)pd);
-            cache_prefetch_next ((__m128i*)pm);
-
             xmm_mask = load_128_unaligned ((__m128i*)pm);

             pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
         dst_line += dst_stride;
         mask_line += mask_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w && (unsigned long)pd & 15)
         {
             m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)pd);
-            cache_prefetch_next ((__m128i*)pm);
-
             xmm_mask = load_128_unaligned ((__m128i*)pm);

             pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             *dst++ = *src++ | 0xff000000;
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 16)
         {
             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;

-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
             xmm_src = _mm_or_si128 (
                 load_128_unaligned ((__m128i*)src), mask_ff000000);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         /* It's a 8 pixel loop */
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* I'm loading unaligned because I'm not sure
              * about the address alignment.
              */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             m = *((uint32_t*)mask);

             if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);

     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
         byte_line += stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 1 && ((unsigned long)d & 1))
         {
             *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 128)
         {
-            cache_prefetch (((__m128i*)d) + 12);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,

         if (w >= 64)
         {
-            cache_prefetch (((__m128i*)d) + 8);
-
             save_128_aligned ((__m128i*)(d), xmm_def);
             save_128_aligned ((__m128i*)(d + 16), xmm_def);
             save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         if (w >= 32)
         {
             save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
             w -= 16;
         }

-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             m = *((uint32_t*)mask);

             if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*) dst);
             unpack_565_128_4x128 (xmm_dst,
                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* First round */
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && (unsigned long)dst & 15)
         {
             s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_src_hi = load_128_unaligned ((__m128i*)src);

             opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
         mask_line += mask_stride;
         dst_line += dst_stride;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
             mask++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 8)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             /* First round */
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
         dst_line += dst_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
         src_line += src_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_src = load_128_unaligned ((__m128i*)src);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
         mask_line += mask_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);
             xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
         dst_line += dst_stride;
         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w && ((unsigned long)dst & 15))
         {
             *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
             dst++;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 16)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-
             save_128_aligned (
                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
         dst = dst_line;
         src = src_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         src_line += src_stride;
         w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         return FALSE;
     }

-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
         int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
         dst_bytes += dst_stride;
         w = byte_width;

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 2 && ((unsigned long)d & 3))
         {
             *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
             d += 4;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 64)
         {
             __m128i xmm0, xmm1, xmm2, xmm3;

-            /* 128 bytes ahead */
-            cache_prefetch (((__m128i*)s) + 8);
-            cache_prefetch (((__m128i*)d) + 8);
-
             xmm0 = load_128_unaligned ((__m128i*)(s));
             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             w -= 64;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 16)
         {
             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
             s += 16;
         }

-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
         while (w >= 4)
         {
             *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;

             if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
         dst = dst_line;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-
         dst_line += dst_stride;
         w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
             dst++;
         }

-        cache_prefetch ((__m128i*)dst);
-
         while (w >= 4)
         {
             __m128i tmp_lo, tmp_hi;

-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)(dst + 4));
-
             xmm_dst = load_128_aligned ((__m128i*)dst);

             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,

         w = width;

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }

-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             xmm_mask = load_128_unaligned ((__m128i*)mask);

             if (!is_transparent (xmm_mask))
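What the patch leaves behind is the same three-phase structure every one
of these fast paths already had, minus the hint calls. As a
self-contained reference, here is that shape modelled on the context
lines of sse2_composite_src_x888_8888 above. The function name is made
up for illustration; the real function also has a 16-pixel inner loop
and per-scanline stride handling, both elided here.

/* Simplified sketch of a post-patch SSE2 fast path: scalar head until
 * dst reaches a 16-byte boundary, a wide SSE2 body with aligned dst
 * accesses, then a scalar tail.  The x888 -> 8888 "force alpha to 0xff"
 * operation stands in for the real combine math.
 */
#include <emmintrin.h>
#include <stdint.h>

static void
src_x888_8888_shape (uint32_t *dst, const uint32_t *src, int w)
{
    const __m128i mask_ff000000 = _mm_set1_epi32 ((int)0xff000000);

    /* head: one pixel at a time until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
        *dst++ = *src++ | 0xff000000;
        w--;
    }

    /* body: 4 pixels per iteration; dst is now aligned, while src is
     * loaded unaligned because its alignment is unknown */
    while (w >= 4)
    {
        __m128i s = _mm_loadu_si128 ((const __m128i *)src);
        _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (s, mask_ff000000));
        dst += 4;
        src += 4;
        w -= 4;
    }

    /* tail: remaining 1-3 pixels */
    while (w)
    {
        *dst++ = *src++ | 0xff000000;
        w--;
    }
}

int
main (void)
{
    uint32_t src[64], dst[64];

    for (int i = 0; i < 64; i++)
        src[i] = (uint32_t)i;

    /* the head loop absorbs whatever alignment the stack gives dst */
    src_x888_8888_shape (dst, src, 64);
    return dst[0] == 0xff000000 ? 0 : 1;
}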