[Pixman] [PATCH 12/12] MIPS: enable prefetch for store only for CPU with 32 byte cache line

Nemanja Lukic nemanja.lukic at rt-rk.com
Sun Sep 8 15:52:51 PDT 2013


---
 pixman/pixman-mips-common.h    |   31 +++++++++--
 pixman/pixman-mips-dspr1-asm.S |   59 +++++++++++++++++++++-
 pixman/pixman-mips-dspr1.c     |   15 ++++--
 pixman/pixman-mips-dspr2.c     |    6 +--
 pixman/pixman-mips.c           |   31 +++++++++++-
 pixman/pixman-mips32r2-asm.S   |  110 ++++++++++++++++++++++++++++++++++++++--
 pixman/pixman-mips32r2.c       |   25 +++++++--
 7 files changed, 249 insertions(+), 28 deletions(-)

diff --git a/pixman/pixman-mips-common.h b/pixman/pixman-mips-common.h
index 05ff7ad..a141226 100644
--- a/pixman/pixman-mips-common.h
+++ b/pixman/pixman-mips-common.h
@@ -39,16 +39,37 @@
 #define SKIP_ZERO_MASK 2
 #define DO_FAST_MEMCPY 3
 
-void
+#if defined(USE_MIPS_DSPR2) || defined(USE_MIPS_DSPR1) || \
+    defined(USE_MIPS32R2)
+extern int allow_prefetch; /* defined in pixman-mips.c; nonzero selects the *_pref_* routines */
+#endif
+/* Copy routine pointer; assigned in pixman-mips.c and pixman-mips32r2.c. */
+void* /* NOTE(review): no 'extern' - tentative definition in every includer, relies on common symbols (-fno-common breaks the link) */
+(*pixman_fast_memcpy_mips) (void *dst, void *src, uint32_t n_bytes);
+void*
 pixman_fast_memcpy_mips32r2 (void *dst, void *src, uint32_t n_bytes);
+/* 32-bit fill: dispatch pointer followed by its two assembly variants. */
+void /* NOTE(review): pointer variable defined in a header, same caveat as above */
+(*pixman_fill_buff32_mips32r2) (void *dst, uint32_t n_bytes, uint32_t value);
 void
-pixman_fill_buff32_mips32r2 (void *dst, uint32_t n_bytes, uint32_t value);
+pixman_fill_buff32_pref_mips32r2 (void *dst, uint32_t n_bytes, uint32_t value);
 void
-pixman_fill_buff16_mips32r2 (void *dst, uint32_t n_bytes, uint16_t value);
+pixman_fill_buff32_no_pref_mips32r2 (void *dst, uint32_t n_bytes, uint32_t value);
+/* 16-bit fill: dispatch pointer followed by its two assembly variants. */
+void /* NOTE(review): pointer variable defined in a header, same caveat as above */
+(*pixman_fill_buff16_mips32r2) (void *dst, uint32_t n_bytes, uint16_t value);
+void
+pixman_fill_buff16_pref_mips32r2 (void *dst, uint32_t n_bytes, uint16_t value);
+void
+pixman_fill_buff16_no_pref_mips32r2 (void *dst, uint32_t n_bytes, uint16_t value);
 
 #ifdef USE_MIPS_DSPR1
 void
-pixman_fill_buff16_mips_dspr1 (void *dst, uint32_t n_bytes, uint16_t value);
+(*pixman_fill_buff16_mips_dspr1) (void *dst, uint32_t n_bytes, uint16_t value); /* assigned in _pixman_implementation_create_mips_dspr1 */
+void
+pixman_fill_buff16_pref_mips_dspr1 (void *dst, uint32_t n_bytes, uint16_t value);
+void
+pixman_fill_buff16_no_pref_mips_dspr1 (void *dst, uint32_t n_bytes, uint16_t value);
 #endif
 
 /****************************************************************/
@@ -85,7 +106,7 @@ mips_composite_##name (pixman_implementation_t *imp,             \
       src_line += src_stride;                                    \
                                                                  \
       if (flags == DO_FAST_MEMCPY)                               \
-        pixman_fast_memcpy_mips32r2 (dst, src, width * bpp);     \
+        pixman_fast_memcpy_mips (dst, src, width * bpp);         \
       else                                                       \
         pixman_composite_##name##_asm##suffix (dst, src, width); \
     }                                                            \
diff --git a/pixman/pixman-mips-dspr1-asm.S b/pixman/pixman-mips-dspr1-asm.S
index a4b9ebc..91fae9a 100644
--- a/pixman/pixman-mips-dspr1-asm.S
+++ b/pixman/pixman-mips-dspr1-asm.S
@@ -32,7 +32,62 @@
 #include "pixman-private.h"
 #include "pixman-mips-dspr1-asm.h"
 
-LEAF_MIPS_DSPR1(pixman_fill_buff16)
+LEAF_MIPS_DSPR1(pixman_fill_buff16_no_pref)
+/* Fill with a 16-bit value using only sh/sw stores (no pref instruction).
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f          /* zero-length fill: return */
+     andi    t1, a0, 0x0002  /* (delay slot) t1 = bit 1 of dest address */
+    beqz     t1, 0f          /* check if address is 4-byte aligned */
+     nop
+    sh       a2, 0(a0)       /* one leading halfword so bit 1 of a0 becomes clear */
+    addiu    a0, a0, 2
+    addiu    a1, a1, -2
+0:
+    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
+    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */
+    beqz     t1, 2f          /* fewer than 32 bytes left: halfword tail only */
+     nop
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f         /* final 32-byte chunk is written at 11: */
+     addiu   a1, a1, -32     /* (delay slot) account for this chunk */
+    sw       a2, 0(a0)       /* eight word stores = 32 bytes */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32      /* (delay slot) advance dest */
+11:
+    sw       a2, 0(a0)       /* same 32-byte store sequence, then fall into the tail */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f          /* tail loop: stop when no bytes remain */
+     addiu   a1, a1, -2      /* (delay slot) one halfword consumed */
+    sh       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 2       /* (delay slot) advance dest */
+3:
+    jr       ra
+     nop
+
+END_MIPS_DSPR1(pixman_fill_buff16_no_pref)
+
+LEAF_MIPS_DSPR1(pixman_fill_buff16_pref)
 /*
  * a0 - *dest
  * a1 - count (bytes)
@@ -86,7 +141,7 @@ LEAF_MIPS_DSPR1(pixman_fill_buff16)
     jr       ra
      nop
 
-END_MIPS_DSPR1(pixman_fill_buff16)
+END_MIPS_DSPR1(pixman_fill_buff16_pref)
 
 LEAF_MIPS_DSPR1(pixman_composite_add_8888_8888_asm)
 /*
diff --git a/pixman/pixman-mips-dspr1.c b/pixman/pixman-mips-dspr1.c
index 638d993..8b04fe6 100644
--- a/pixman/pixman-mips-dspr1.c
+++ b/pixman/pixman-mips-dspr1.c
@@ -152,11 +152,7 @@ mips_dspr1_blt (pixman_implementation_t *imp,
         uint8_t *dst = dst_bytes;
         src_bytes += src_stride;
         dst_bytes += dst_stride;
-#ifdef USE_MIPS32R2
-        pixman_fast_memcpy_mips32r2 (dst, src, byte_width);
-#else
-        memcpy (dst, src, byte_width);
-#endif
+        pixman_fast_memcpy_mips (dst, src, byte_width);
     }
 
     return TRUE;
@@ -175,6 +171,15 @@ _pixman_implementation_create_mips_dspr1 (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
         _pixman_implementation_create (fallback, mips_dspr1_fast_paths);
 
+    if (allow_prefetch) /* global flag written by _pixman_mips_get_implementations */
+    {
+        pixman_fill_buff16_mips_dspr1 = pixman_fill_buff16_pref_mips_dspr1;
+    }
+    else
+    {
+        pixman_fill_buff16_mips_dspr1 = pixman_fill_buff16_no_pref_mips_dspr1; /* variant built from plain stores only */
+    }
+
     imp->blt = mips_dspr1_blt;
     imp->fill = mips_dspr1_fill;
 
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index a9773b7..a1551ca 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -292,11 +292,7 @@ mips_dspr2_blt (pixman_implementation_t *imp,
         uint8_t *dst = dst_bytes;
         src_bytes += src_stride;
         dst_bytes += dst_stride;
-#ifdef USE_MIPS32R2
-        pixman_fast_memcpy_mips32r2 (dst, src, byte_width);
-#else
-        memcpy (dst, src, byte_width);
-#endif
+        pixman_fast_memcpy_mips (dst, src, byte_width);
     }
 
     return TRUE;
diff --git a/pixman/pixman-mips.c b/pixman/pixman-mips.c
index 53c1023..7557ce3 100644
--- a/pixman/pixman-mips.c
+++ b/pixman/pixman-mips.c
@@ -24,10 +24,16 @@
 #endif
 
 #include "pixman-private.h"
+#include "pixman-mips-common.h"
 #include <string.h>
 #include <stdlib.h>
 #include <unistd.h>
 
+#if defined(USE_MIPS_DSPR2) || defined(USE_MIPS_DSPR1) || \
+    defined(USE_MIPS32R2)
+int allow_prefetch;
+#endif
+
 #ifdef USE_MIPS_DSPR2
 static const char *mips_dspr2_cores[] = {"MIPS 74K", NULL};
 #endif
@@ -137,16 +143,37 @@ _pixman_mips_get_implementations (pixman_implementation_t *imp)
     if (long_bit != 32)
         return imp;
 
+#if defined(USE_MIPS_DSPR2) || defined(USE_MIPS_DSPR1) || \
+    defined(USE_MIPS32R2)
+    allow_prefetch = 0; /* default off; set to 1 only by the cache line check further down */
+    pixman_fast_memcpy_mips = (void*)memcpy; /* default copy routine; replaced in pixman-mips32r2.c when prefetch is allowed */
+#endif
+
 #ifdef USE_MIPS32R2
     if (!_pixman_disabled ("mips32r2"))
     {
         int already_compiling_everything_for_mips32r2 = 0;
+        pixman_bool_t temp = FALSE;
 #if defined(__mips__) && (__mips_isa_rev >= 2)
         already_compiling_everything_for_mips32r2 = 1;
 #endif
-        if (already_compiling_everything_for_mips32r2 ||
-            have_feature (mips32r2_cores, " mips32r2"))
+        temp = have_feature (mips32r2_cores, " mips32r2");
+        if (already_compiling_everything_for_mips32r2 || temp)
         {
+            if (temp)
+            {
+                int cache_line_size; /* written by the rdhwr below */
+                /* RDHWR $1 (SYNCI_Step per the MIPS32 ISA) is treated as the cache line size; push/pop keeps the arch override local. */
+                __asm__ volatile (
+                    ".set   push          \n\t"
+                    ".set   arch=mips32r2 \n\t"
+                    "rdhwr  %0, $1        \n\t"
+                    ".set   pop           \n\t"
+                    : "=r" (cache_line_size));
+
+                if (cache_line_size == 32) /* prefetch variants are enabled only for 32-byte lines */
+                    allow_prefetch = 1;
+            }
             imp = _pixman_implementation_create_mips32r2 (imp);
         }
     }
diff --git a/pixman/pixman-mips32r2-asm.S b/pixman/pixman-mips32r2-asm.S
index 75ff9e2..38048a0 100644
--- a/pixman/pixman-mips32r2-asm.S
+++ b/pixman/pixman-mips32r2-asm.S
@@ -381,7 +381,61 @@ $ua_smallCopy_loop:
 
 END_MIPS32R2(pixman_fast_memcpy)
 
-LEAF_MIPS32R2(pixman_fill_buff16)
+LEAF_MIPS32R2(pixman_fill_buff16_no_pref)
+/* Fill with a 16-bit value using only sh/sw stores (no pref instruction).
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f          /* zero-length fill: return */
+     andi    t1, a0, 0x0002  /* (delay slot) t1 = bit 1 of dest address */
+    beqz     t1, 0f          /* check if address is 4-byte aligned */
+     nop
+    sh       a2, 0(a0)       /* one leading halfword so bit 1 of a0 becomes clear */
+    addiu    a0, a0, 2
+    addiu    a1, a1, -2
+0:
+    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
+    beqz     t1, 2f          /* fewer than 32 bytes left: halfword tail only */
+     ins     a2, a2, 16, 16  /* (delay slot) copy low halfword of a2 into its upper halfword */
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f         /* final 32-byte chunk is written at 11: */
+     addiu   a1, a1, -32     /* (delay slot) account for this chunk */
+    sw       a2, 0(a0)       /* eight word stores = 32 bytes */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32      /* (delay slot) advance dest */
+11:
+    sw       a2, 0(a0)       /* same 32-byte store sequence, then fall into the tail */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f          /* tail loop: stop when no bytes remain */
+     addiu   a1, a1, -2      /* (delay slot) one halfword consumed */
+    sh       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 2       /* (delay slot) advance dest */
+3:
+    jr       ra
+     nop
+
+END_MIPS32R2(pixman_fill_buff16_no_pref)
+
+LEAF_MIPS32R2(pixman_fill_buff16_pref)
 /*
  * a0 - *dest
  * a1 - count (bytes)
@@ -434,9 +488,57 @@ LEAF_MIPS32R2(pixman_fill_buff16)
     jr       ra
      nop
 
-END_MIPS32R2(pixman_fill_buff16)
+END_MIPS32R2(pixman_fill_buff16_pref)
+
+LEAF_MIPS32R2(pixman_fill_buff32_no_pref)
+/* Fill with a 32-bit value using only sw stores (no pref instruction).
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f    /* zero-length fill: return */
+     nop
+    srl      t1, a1, 5 /* t1 how many multiples of 32 bytes */
+    beqz     t1, 2f    /* fewer than 32 bytes: word tail only */
+     nop
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f         /* final 32-byte chunk is written at 11: */
+     addiu   a1, a1, -32     /* (delay slot) account for this chunk */
+    sw       a2, 0(a0)       /* eight word stores = 32 bytes */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32      /* (delay slot) advance dest */
+11:
+    sw       a2, 0(a0)       /* same 32-byte store sequence, then fall into the tail */
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f          /* tail loop: stop when no bytes remain */
+     addiu   a1, a1, -4      /* (delay slot) one word consumed */
+    sw       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 4       /* (delay slot) advance dest */
+3:
+    jr       ra
+     nop
+
+END_MIPS32R2(pixman_fill_buff32_no_pref)
 
-LEAF_MIPS32R2(pixman_fill_buff32)
+LEAF_MIPS32R2(pixman_fill_buff32_pref)
 /*
  * a0 - *dest
  * a1 - count (bytes)
@@ -483,7 +585,7 @@ LEAF_MIPS32R2(pixman_fill_buff32)
     jr       ra
      nop
 
-END_MIPS32R2(pixman_fill_buff32)
+END_MIPS32R2(pixman_fill_buff32_pref)
 
 LEAF_MIPS32R2(pixman_composite_src_x888_8888_asm)
 /*
diff --git a/pixman/pixman-mips32r2.c b/pixman/pixman-mips32r2.c
index 26b62f7..4b27608 100644
--- a/pixman/pixman-mips32r2.c
+++ b/pixman/pixman-mips32r2.c
@@ -146,13 +146,17 @@ mips32r2_blt (pixman_implementation_t *imp,
         uint8_t *dst = dst_bytes;
         src_bytes += src_stride;
         dst_bytes += dst_stride;
-        pixman_fast_memcpy_mips32r2 (dst, src, byte_width);
+        pixman_fast_memcpy_mips (dst, src, byte_width);
     }
     return TRUE;
 }
 
 static const pixman_fast_path_t mips32r2_fast_paths[] =
 {
+    /* pref ON */
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
+    /* pref OFF */
     PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, r5g6b5,   mips_composite_src_0565_0565),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, b5g6r5,   mips_composite_src_0565_0565),
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
@@ -162,16 +166,27 @@ static const pixman_fast_path_t mips32r2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888),
     PIXMAN_STD_FAST_PATH (SRC, r8g8b8,   null, r8g8b8,   mips_composite_src_0888_0888),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
     { PIXMAN_OP_NONE },
 };
 
 pixman_implementation_t *
 _pixman_implementation_create_mips32r2 (pixman_implementation_t *fallback)
 {
-    pixman_implementation_t *imp =
-        _pixman_implementation_create (fallback, mips32r2_fast_paths);
+    pixman_implementation_t *imp;
+
+    if (allow_prefetch)
+    {
+        imp = _pixman_implementation_create (fallback, &mips32r2_fast_paths[0]); /* whole table, including the two entries under "pref ON" */
+        pixman_fill_buff16_mips32r2 = pixman_fill_buff16_pref_mips32r2;
+        pixman_fill_buff32_mips32r2 = pixman_fill_buff32_pref_mips32r2;
+        pixman_fast_memcpy_mips = pixman_fast_memcpy_mips32r2; /* only replaced here; the else branch leaves the pointer untouched */
+    }
+    else
+    {
+        imp = _pixman_implementation_create (fallback, &mips32r2_fast_paths[2]); /* skips the two leading "pref ON" entries; index must track the table layout */
+        pixman_fill_buff16_mips32r2 = pixman_fill_buff16_no_pref_mips32r2;
+        pixman_fill_buff32_mips32r2 = pixman_fill_buff32_no_pref_mips32r2;
+    }
 
     imp->blt = mips32r2_blt;
     imp->fill = mips32r2_fill;
-- 
1.7.3



More information about the Pixman mailing list