[PATCH] Remove cache prefetch code. The performance is decreased with cache prefetch, especially for ATOM. So remove these code. Following is the experiment.
Liu Xinyun
xinyun.liu at intel.com
Fri Sep 24 23:56:38 PDT 2010
old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch
CPU: Intel Atom N270 at 1.6GHz
OS: MeeGo (32 bits)
Speedups
========
image-rgba poppler-0 17125.68 (17279.58 0.92%) -> 14765.36 (15926.49 3.54%): 1.16x speedup
image-rgba ocitysmap-0 9008.25 (9040.41 7.50%) -> 8277.94 (8343.09 5.44%): 1.09x speedup
image-rgba xfce4-terminal-a1-0 18020.76 (18230.68 0.97%) -> 16703.77 (16712.42 1.22%): 1.08x speedup
image-rgba gnome-terminal-vim-0 25081.38 (25133.38 0.24%) -> 23407.47 (23652.98 0.54%): 1.07x speedup
image-rgba firefox-talos-gfx-0 57916.97 (57973.20 0.11%) -> 54556.64 (54624.55 0.39%): 1.06x speedup
image-rgba firefox-planet-gnome-0 102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%): 1.06x speedup
image-rgba swfdec-giant-steps-0 12376.24 (12616.84 1.02%) -> 11705.30 (11825.20 1.06%): 1.06x speedup
CPU: Intel Core(TM)2 Duo CPU T9600 at 2.80GHz
OS: Ubuntu 10.04 (64bits)
Speedups
========
image-rgba ocitysmap-0 2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%): 1.16x speedup
image-rgba swfdec-giant-steps-0 1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%): 1.05x speedup
Signed-off-by: Liu Xinyun <xinyun.liu at intel.com>
Signed-off-by: Chen Miaobo <miaobo.chen at intel.com>
---
pixman/pixman-sse2.c | 659 --------------------------------------------------
1 files changed, 0 insertions(+), 659 deletions(-)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
- _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
- _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
- if (addr)
- cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
- if (addr)
- cache_prefetch_next (addr);
-}
-
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure about
* the address alignment.
*/
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
const uint32_t* ps = src;
const uint32_t* pm = mask;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i s;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
s = combine4 ((__m128i*)ps, (__m128i*)pm);
save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
uint32_t pack_cmp;
__m128i xmm_src, xmm_dst;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
__m128i xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
w--;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
*dst++ = *src++ | 0xff000000;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w >= 16)
{
__m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
-
xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
/* It's a 8 pixel loop */
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
return FALSE;
}
- cache_prefetch ((__m128i*)byte_line);
xmm_def = create_mask_2x32_128 (data, data);
while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
byte_line += stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 1 && ((unsigned long)d & 1))
{
*(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 128)
{
- cache_prefetch (((__m128i*)d) + 12);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
if (w >= 64)
{
- cache_prefetch (((__m128i*)d) + 8);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)d);
-
if (w >= 32)
{
save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 16;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src_hi = load_128_unaligned ((__m128i*)src);
opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask_line += mask_stride;
dst_line += dst_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
*dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
save_128_aligned (
(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
return FALSE;
}
- cache_prefetch ((__m128i*)src_bytes);
- cache_prefetch ((__m128i*)dst_bytes);
-
while (height--)
{
int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
dst_bytes += dst_stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 2 && ((unsigned long)d & 3))
{
*(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 64)
{
__m128i xmm0, xmm1, xmm2, xmm3;
- /* 128 bytes ahead */
- cache_prefetch (((__m128i*)s) + 8);
- cache_prefetch (((__m128i*)d) + 8);
-
xmm0 = load_128_unaligned ((__m128i*)(s));
xmm1 = load_128_unaligned ((__m128i*)(s + 16));
xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 16)
{
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
s += 16;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w && (unsigned long)dst & 15)
{
s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)mask);
-
m = *(uint32_t*) mask;
xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
m = *(uint32_t *) mask;
if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
dst++;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
__m128i tmp_lo, tmp_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)(dst + 4));
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
if (!is_transparent (xmm_mask))
--
1.7.0.4
--d6Gm4EdcadzBjdND--
More information about the Pixman
mailing list