[PATCH] Remove cache prefetch code. Performance decreases with cache prefetching, especially on Atom, so remove this code. The experiment results follow.

Liu Xinyun <xinyun.liu@intel.com>
Fri Sep 24 23:56:38 PDT 2010
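
For context, the helpers being removed were thin force_inline wrappers
around the SSE prefetch intrinsic. Since sizeof (__m128i) is 16 bytes,
"addr + 4" hints exactly one 64-byte cache line ahead; quoted from the
code removed below:

static force_inline void
cache_prefetch (__m128i* addr)
{
    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
}

static force_inline void
cache_prefetch_next (__m128i* addr)
{
    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
}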


old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom N270 @ 1.6GHz
OS: MeeGo (32-bit)
Speedups
========
image-rgba                    poppler-0    17125.68 (17279.58 0.92%) -> 14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba                  ocitysmap-0    9008.25 (9040.41 7.50%) -> 8277.94 (8343.09 5.44%):  1.09x speedup
image-rgba          xfce4-terminal-a1-0    18020.76 (18230.68 0.97%) -> 16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba         gnome-terminal-vim-0    25081.38 (25133.38 0.24%) -> 23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba          firefox-talos-gfx-0    57916.97 (57973.20 0.11%) -> 54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba       firefox-planet-gnome-0    102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba         swfdec-giant-steps-0    12376.24 (12616.84 1.02%) -> 11705.30 (11825.20 1.06%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU T9600 @ 2.80GHz
OS: Ubuntu 10.04 (64-bit)
Speedups
========
image-rgba                  ocitysmap-0    2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%):  1.16x speedup
image-rgba         swfdec-giant-steps-0    1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%):  1.05x speedup

Signed-off-by: Liu Xinyun <xinyun.liu@intel.com>
Signed-off-by: Chen Miaobo <miaobo.chen@intel.com>
---
 pixman/pixman-sse2.c |  659 --------------------------------------------------
 1 files changed, 0 insertions(+), 659 deletions(-)
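
For anyone who wants to reproduce the effect outside of pixman, here is
a minimal, illustrative micro-benchmark sketch (not part of this patch;
the buffer size, trip count, and plain copy kernel are arbitrary
choices). It times a 4-pixel-per-iteration copy loop with and without
the same T0 hint, issued 64 bytes ahead as in the removed
cache_prefetch_next:

/* Illustrative micro-benchmark, NOT part of this patch: compares a
 * simple 4-pixel copy loop with and without the T0 prefetch hint.
 * Build with e.g. "gcc -O2 -msse2 prefetch-bench.c". */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1 << 20)  /* pixels per pass; 4 MiB of uint32_t */

static void
copy_loop (uint32_t *dst, const uint32_t *src, int w, int prefetch)
{
    while (w >= 4)
    {
        if (prefetch)
        {
            /* 16 uint32_t = 64 bytes ahead, the same distance as the
             * removed cache_prefetch_next (addr + 4 on an __m128i *) */
            _mm_prefetch ((const char *)(src + 16), _MM_HINT_T0);
            _mm_prefetch ((const char *)(dst + 16), _MM_HINT_T0);
        }

        _mm_storeu_si128 ((__m128i *)dst,
                          _mm_loadu_si128 ((const __m128i *)src));
        dst += 4;
        src += 4;
        w -= 4;
    }
}

int
main (void)
{
    uint32_t *src = calloc (N, sizeof *src);
    uint32_t *dst = calloc (N, sizeof *dst);
    int p, i;

    for (p = 0; p <= 1; p++)
    {
        clock_t t0 = clock ();

        for (i = 0; i < 200; i++)
            copy_loop (dst, src, N, p);

        printf ("prefetch=%d: %.3f s\n",
                p, (double) (clock () - t0) / CLOCKS_PER_SEC);
    }

    free (src);
    free (dst);
    return 0;
}

A plausible explanation for the regression is that Atom, an in-order
core, pays full issue cost for every prefetch instruction while its
hardware prefetcher already covers these sequential access patterns, so
the explicit hints only add overhead.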

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure about
 	 * the address alignment.
 	 */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure
 	 * about the address alignment.
 	 */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                  const uint32_t* pm,
                                  int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t*       pd,
                          const uint32_t* pm,
                          int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
 	xmm_dst = load_128_aligned ((__m128i*) pd);
 
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i s;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
 	save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst = load_128_aligned  ((__m128i*)pd);
 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    *dst++ = *src++ | 0xff000000;
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 16)
 	{
 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
 	    
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = _mm_or_si128 (
 		load_128_unaligned ((__m128i*)src), mask_ff000000);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	/* It's a 8 pixel loop */
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* I'm loading unaligned because I'm not sure
 	     * about the address alignment.
 	     */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);
 
     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	byte_line += stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 1 && ((unsigned long)d & 1))
 	{
 	    *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 128)
 	{
-	    cache_prefetch (((__m128i*)d) + 12);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
 
 	if (w >= 64)
 	{
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	if (w >= 32)
 	{
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 16;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*) dst);
 	    unpack_565_128_4x128 (xmm_dst,
 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
 	    opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	dst_line += dst_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    mask++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    save_128_aligned (
 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
 
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
 	int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	dst_bytes += dst_stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 2 && ((unsigned long)d & 3))
 	{
 	    *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 64)
 	{
 	    __m128i xmm0, xmm1, xmm2, xmm3;
 
-	    /* 128 bytes ahead */
-	    cache_prefetch (((__m128i*)s) + 8);
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    xmm0 = load_128_unaligned ((__m128i*)(s));
 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 16)
 	{
 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    s += 16;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
 
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;
 
 	    if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
 	    __m128i tmp_lo, tmp_hi;
 
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)(dst + 4));
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 
 	    if (!is_transparent (xmm_mask))
-- 
1.7.0.4

