[Pixman] [PATCH 10/13] MIPS: dspr1: Move fast paths implementation from dspr2 to dspr1

Nemanja Lukic nemanja.lukic at rt-rk.com
Fri Jun 27 09:05:47 PDT 2014


Some of the optimizations introduced in previous dspr2 commits, as with the
previous patch, were not dspr2-specific and used only dspr1 instructions.
Since Pixman's run-time CPU detection only added the dspr2 fast paths on 74K
MIPS cores, these optimizations could not be used on cores that do not support
dspr2 but do support dspr1 instructions (these are newer MIPS CPU cores like
24KE, 34K, 1004K, etc.).
This patch extracts those dspr1-specific optimizations into a new set of dspr1
fast paths, and adds infrastructure for future dspr1-only optimizations with
appropriate build-time and run-time support.
The following is the list of dspr1 optimizations, introduced in previous dspr2
patches, tested on a MIPS 1004Kc core:

Performance numbers before/after on MIPS-1004kc @ 800 MHz

Reference (before):

    add_8888_8888 =  L1:  26.48  L2:  19.70  M: 14.41 ( 42.67%)  HT: 13.70  VT: 13.41  R: 12.97  RT:  9.84 ( 105Kops/s)
       src_n_0565 =  L1: 344.47  L2: 193.46  M:136.23 (100.70%)  HT:115.48  VT:109.33  R:100.85  RT: 45.61 ( 253Kops/s)

Optimized (with these optimizations):

    add_8888_8888 =  L1: 226.21  L2:  64.57  M: 29.41 ( 86.96%)  HT: 26.95  VT: 23.63  R: 23.66  RT: 15.25 ( 145Kops/s)
       src_n_0565 = L1:-1041.15  L2: 695.23  M:444.03 (327.64%)  HT:144.46  VT:130.94  R:121.60  RT: 43.95 ( 245Kops/s)
---
 pixman/pixman-mips-common.h    |    4 +-
 pixman/pixman-mips-dspr1-asm.S |  133 ++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr1.c     |   40 ++++++++++++
 pixman/pixman-mips-dspr2-asm.S |  133 ----------------------------------------
 pixman/pixman-mips-dspr2.c     |   39 ------------
 5 files changed, 175 insertions(+), 174 deletions(-)

diff --git a/pixman/pixman-mips-common.h b/pixman/pixman-mips-common.h
index 1c64964..70af1f7 100644
--- a/pixman/pixman-mips-common.h
+++ b/pixman/pixman-mips-common.h
@@ -44,9 +44,9 @@ pixman_fast_memcpy_mips32r2 (void *dst, void *src, uint32_t n_bytes);
 void
 pixman_fill_buff32_mips32r2 (void *dst, uint32_t n_bytes, uint32_t value);
 
-#ifdef USE_MIPS_DSPR2
+#ifdef USE_MIPS_DSPR1
 void
-pixman_fill_buff16_mips_dspr2 (void *dst, uint32_t n_bytes, uint16_t value);
+pixman_fill_buff16_mips_dspr1 (void *dst, uint32_t n_bytes, uint16_t value);
 #endif
 
 /****************************************************************/
diff --git a/pixman/pixman-mips-dspr1-asm.S b/pixman/pixman-mips-dspr1-asm.S
index c6b4e98..a4b9ebc 100644
--- a/pixman/pixman-mips-dspr1-asm.S
+++ b/pixman/pixman-mips-dspr1-asm.S
@@ -31,3 +31,136 @@
 
 #include "pixman-private.h"
 #include "pixman-mips-dspr1-asm.h"
+
+LEAF_MIPS_DSPR1(pixman_fill_buff16)
+/*
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f
+     andi    t1, a0, 0x0002
+    beqz     t1, 0f          /* check if address is 4-byte aligned */
+     nop
+    sh       a2, 0(a0)
+    addiu    a0, a0, 2
+    addiu    a1, a1, -2
+0:
+    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
+    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */
+    beqz     t1, 2f
+     nop
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f
+     addiu   a1, a1, -32
+    pref     30, 32(a0)
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32
+11:
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f
+     addiu   a1, a1, -2
+    sh       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 2
+3:
+    jr       ra
+     nop
+
+END_MIPS_DSPR1(pixman_fill_buff16)
+
+LEAF_MIPS_DSPR1(pixman_composite_add_8888_8888_asm)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+    beqz         a2, 4f
+     nop
+
+    srl          t9, a2, 2      /* t9 = how many multiples of 4 src pixels */
+    beqz         t9, 3f         /* branch if less than 4 src pixels */
+     nop
+1:
+    addiu        t9, t9, -1
+    beqz         t9, 2f
+     addiu       a2, a2, -4
+
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+    b            1b
+     addiu       a0, a0, 16
+2:
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+
+    beqz         a2, 4f
+     addiu       a0, a0, 16
+3:
+    lw           t0, 0(a1)
+    lw           t1, 0(a0)
+    addiu        a1, a1, 4
+    addiu        a2, a2, -1
+    addu_s.qb    t1, t1, t0
+    sw           t1, 0(a0)
+    bnez         a2, 3b
+     addiu       a0, a0, 4
+4:
+    jr           ra
+     nop
+
+END_MIPS_DSPR1(pixman_composite_add_8888_8888_asm)
diff --git a/pixman/pixman-mips-dspr1.c b/pixman/pixman-mips-dspr1.c
index 875814d..2db4cb1 100644
--- a/pixman/pixman-mips-dspr1.c
+++ b/pixman/pixman-mips-dspr1.c
@@ -36,8 +36,46 @@
 #include "pixman-private.h"
 #include "pixman-mips-common.h"
 
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
+                                    uint32_t, 1, uint32_t, 1, _mips_dspr1)
+
+static pixman_bool_t
+mips_dspr1_fill (pixman_implementation_t *imp,
+                 uint32_t *               bits,
+                 int                      stride,
+                 int                      bpp,
+                 int                      x,
+                 int                      y,
+                 int                      width,
+                 int                      height,
+                 uint32_t                 _xor)
+{
+    uint8_t *byte_line;
+    uint32_t byte_width;
+
+    if (bpp == 16)
+    {
+        stride = stride * (int) sizeof (uint32_t) / 2;
+        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+        byte_width = width * 2;
+        stride *= 2;
+
+        while (height--)
+        {
+            uint8_t *dst = byte_line;
+            byte_line += stride;
+            pixman_fill_buff16_mips_dspr1 (dst, byte_width, _xor & 0xffff);
+        }
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
 static const pixman_fast_path_t mips_dspr1_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null, a8r8g8b8, mips_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null, a8b8g8r8, mips_composite_add_8888_8888),
     { PIXMAN_OP_NONE },
 };
 
@@ -47,5 +85,7 @@ _pixman_implementation_create_mips_dspr1 (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
         _pixman_implementation_create (fallback, mips_dspr1_fast_paths);
 
+    imp->fill = mips_dspr1_fill;
+
     return imp;
 }
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index f8eadf1..f3bd4d0 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -32,62 +32,6 @@
 #include "pixman-private.h"
 #include "pixman-mips-dspr2-asm.h"
 
-LEAF_MIPS_DSPR2(pixman_fill_buff16)
-/*
- * a0 - *dest
- * a1 - count (bytes)
- * a2 - value to fill buffer with
- */
-
-    beqz     a1, 3f
-     andi    t1, a0, 0x0002
-    beqz     t1, 0f          /* check if address is 4-byte aligned */
-     nop
-    sh       a2, 0(a0)
-    addiu    a0, a0, 2
-    addiu    a1, a1, -2
-0:
-    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
-    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */
-    beqz     t1, 2f
-     nop
-1:
-    addiu    t1, t1, -1
-    beqz     t1, 11f
-     addiu   a1, a1, -32
-    pref     30, 32(a0)
-    sw       a2, 0(a0)
-    sw       a2, 4(a0)
-    sw       a2, 8(a0)
-    sw       a2, 12(a0)
-    sw       a2, 16(a0)
-    sw       a2, 20(a0)
-    sw       a2, 24(a0)
-    sw       a2, 28(a0)
-    b        1b
-     addiu   a0, a0, 32
-11:
-    sw       a2, 0(a0)
-    sw       a2, 4(a0)
-    sw       a2, 8(a0)
-    sw       a2, 12(a0)
-    sw       a2, 16(a0)
-    sw       a2, 20(a0)
-    sw       a2, 24(a0)
-    sw       a2, 28(a0)
-    addiu    a0, a0, 32
-2:
-    blez     a1, 3f
-     addiu   a1, a1, -2
-    sh       a2, 0(a0)
-    b        2b
-     addiu   a0, a0, 2
-3:
-    jr       ra
-     nop
-
-END_MIPS_DSPR2(pixman_fill_buff16)
-
 LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm)
 /*
  * a0 - dst (r5g6b5)
@@ -2632,83 +2576,6 @@ LEAF_MIPS_DSPR2(pixman_composite_add_8_8_asm)
 
 END_MIPS_DSPR2(pixman_composite_add_8_8_asm)
 
-LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm)
-/*
- * a0 - dst (a8r8g8b8)
- * a1 - src (a8r8g8b8)
- * a2 - w
- */
-
-    beqz         a2, 4f
-     nop
-
-    srl          t9, a2, 2      /* t1 = how many multiples of 4 src pixels */
-    beqz         t9, 3f         /* branch if less than 4 src pixels */
-     nop
-1:
-    addiu        t9, t9, -1
-    beqz         t9, 2f
-     addiu       a2, a2, -4
-
-    lw           t0, 0(a1)
-    lw           t1, 4(a1)
-    lw           t2, 8(a1)
-    lw           t3, 12(a1)
-    lw           t4, 0(a0)
-    lw           t5, 4(a0)
-    lw           t6, 8(a0)
-    lw           t7, 12(a0)
-    addiu        a1, a1, 16
-
-    addu_s.qb    t4, t4, t0
-    addu_s.qb    t5, t5, t1
-    addu_s.qb    t6, t6, t2
-    addu_s.qb    t7, t7, t3
-
-    sw           t4, 0(a0)
-    sw           t5, 4(a0)
-    sw           t6, 8(a0)
-    sw           t7, 12(a0)
-    b            1b
-     addiu       a0, a0, 16
-2:
-    lw           t0, 0(a1)
-    lw           t1, 4(a1)
-    lw           t2, 8(a1)
-    lw           t3, 12(a1)
-    lw           t4, 0(a0)
-    lw           t5, 4(a0)
-    lw           t6, 8(a0)
-    lw           t7, 12(a0)
-    addiu        a1, a1, 16
-
-    addu_s.qb    t4, t4, t0
-    addu_s.qb    t5, t5, t1
-    addu_s.qb    t6, t6, t2
-    addu_s.qb    t7, t7, t3
-
-    sw           t4, 0(a0)
-    sw           t5, 4(a0)
-    sw           t6, 8(a0)
-    sw           t7, 12(a0)
-
-    beqz         a2, 4f
-     addiu       a0, a0, 16
-3:
-    lw           t0, 0(a1)
-    lw           t1, 0(a0)
-    addiu        a1, a1, 4
-    addiu        a2, a2, -1
-    addu_s.qb    t1, t1, t0
-    sw           t1, 0(a0)
-    bnez         a2, 3b
-     addiu       a0, a0, 4
-4:
-    jr           ra
-     nop
-
-END_MIPS_DSPR2(pixman_composite_add_8888_8888_asm)
-
 LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm)
 /*
  * a0 - dst  (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 7e9a095..4a332c5 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -56,8 +56,6 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
                                     uint32_t, 1, uint16_t, 1, _mips_dspr2)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
                                     uint8_t, 1, uint8_t, 1, _mips_dspr2)
-PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
-                                    uint32_t, 1, uint32_t, 1, _mips_dspr2)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_0565,
                                     uint8_t, 1, uint16_t, 1, _mips_dspr2)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_8888,
@@ -155,39 +153,6 @@ PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER,
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD,
                                              uint32_t, uint32_t, _mips_dspr2)
 
-static pixman_bool_t
-mips_dspr2_fill (pixman_implementation_t *imp,
-                 uint32_t *               bits,
-                 int                      stride,
-                 int                      bpp,
-                 int                      x,
-                 int                      y,
-                 int                      width,
-                 int                      height,
-                 uint32_t                 _xor)
-{
-    uint8_t *byte_line;
-    uint32_t byte_width;
-
-    if (bpp == 16)
-    {
-        stride = stride * (int) sizeof (uint32_t) / 2;
-        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
-        byte_width = width * 2;
-        stride *= 2;
-
-        while (height--)
-        {
-            uint8_t *dst = byte_line;
-            byte_line += stride;
-            pixman_fill_buff16_mips_dspr2 (dst, byte_width, _xor & 0xffff);
-        }
-        return TRUE;
-    }
-
-    return FALSE;
-}
-
 static const pixman_fast_path_t mips_dspr2_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5,   mips_composite_src_8888_0565),
@@ -261,8 +226,6 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, mips_composite_add_8888_n_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, mips_composite_add_8888_n_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       mips_composite_add_8_8),
-    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, mips_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, mips_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, r5g6b5,   mips_composite_out_reverse_8_0565),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, b5g6r5,   mips_composite_out_reverse_8_0565),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, a8r8g8b8, mips_composite_out_reverse_8_8888),
@@ -351,7 +314,5 @@ _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
 
     imp->combine_32[PIXMAN_OP_OVER] = mips_dspr2_combine_over_u;
 
-    imp->fill = mips_dspr2_fill;
-
     return imp;
 }
-- 
1.7.3



More information about the Pixman mailing list