[Pixman] [PATCH] ARM: NEON: added forgotten cache preload for over_n_8888/over_n_0565

Siarhei Siamashka siarhei.siamashka at gmail.com
Thu Sep 30 06:15:45 PDT 2010


From: Siarhei Siamashka <siarhei.siamashka at nokia.com>

Prefetching provides up to 40-50% better performance when working
with large images and/or when there are many L2 cache misses.
Benchmark results on ARM Cortex-A8 @ 720MHz:

== before ==

    over_n_8888 =  L1: 225.83  L2: 181.02  M: 55.57 ( 41.41%)
                   HT: 38.96   VT: 36.92   R: 32.84  RT: 14.15 ( 123Kops/s)

    over_n_0565 =  L1: 153.91  L2: 149.69  M: 83.17 ( 30.95%)
                   HT: 50.41   VT: 49.15   R: 40.56  RT: 15.45 ( 131Kops/s)

== after ==

    over_n_8888 =  L1: 222.39  L2: 170.95  M: 76.86 ( 57.27%)
                   HT: 58.80   VT: 53.03   R: 45.51  RT: 14.13 ( 124Kops/s)

    over_n_0565 =  L1: 151.87  L2: 149.54  M:125.63 ( 46.80%)
                   HT: 67.85   VT: 57.54   R: 50.21  RT: 15.32 ( 130Kops/s)


---
 pixman/pixman-arm-neon-asm.S |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9f6568f..8ebe089 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -388,6 +388,7 @@ generate_composite_function \
     vld1.16     {d4, d5}, [DST_R, :128]!
     vst1.16     {d28, d29}, [DST_W, :128]!
     pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
 .endm
 
 .macro pixman_composite_over_n_0565_init
@@ -710,6 +711,7 @@ generate_composite_function_single_scanline \
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
     pixman_composite_over_8888_8888_process_pixblock_head
+    cache_preload 8, 8
 .endm
 
 .macro pixman_composite_over_n_8888_init
-- 
1.7.2.2



More information about the Pixman mailing list