[Pixman] [PATCH 4/4] MIPS: DSPr2: Added OVER combiner and two new fast paths: - over_8888_8888 - over_8888_8888_8888

Nemanja Lukic nlukic at mips.com
Fri Sep 14 00:31:26 PDT 2012


From: Nemanja Lukic <nemanja.lukic at rt-rk.com>

Performance numbers before/after on MIPS-74kc @ 1GHz:

lowlevel-blt-bench results

Referent (before):
          over_8888_8888 =  L1:  19.61  L2:  17.10  M: 11.16 ( 59.20%)  HT: 16.47  VT: 15.81  R: 14.82  RT:  8.90 (  50Kops/s)
     over_8888_8888_8888 =  L1:  13.56  L2:  11.22  M:  7.46 ( 79.18%)  HT:  6.24  VT:  6.20  R:  6.11  RT:  3.95 (  29Kops/s)

Optimized:
          over_8888_8888 =  L1:  46.42  L2:  36.70  M: 16.69 ( 88.57%)  HT: 17.11  VT: 16.55  R: 15.31  RT:  9.48 (  52Kops/s)
     over_8888_8888_8888 =  L1:  26.06  L2:  22.53  M: 11.49 (121.91%)  HT:  9.93  VT:  9.62  R:  9.19  RT:  5.75 (  36Kops/s)
---
 pixman/pixman-mips-dspr2-asm.S |  124 ++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c     |   27 +++++++++
 2 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index a4426f2..3a6b26a 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1085,6 +1085,130 @@ LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips)
 
 END(pixman_composite_over_0565_8_0565_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
+    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 8
+    srl      t2, t2, 24
+    srl      t3, t3, 24
+
+    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t0, t1
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    srl      t1, t1, 24
+
+    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_over_8888_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li           t4, 0x00ff00ff
+    beqz         a2, 3f
+     nop
+    addiu        t1, a2, -1
+    beqz         t1, 2f
+     nop
+1:
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu        a1, a1, 8
+
+    not          t5, t0
+    srl          t5, t5, 24
+    not          t6, t1
+    srl          t6, t6, 24
+
+    or           t7, t5, t6
+    beqz         t7, 11f
+     or          t8, t0, t1
+    beqz         t8, 12f
+
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t2, t3
+
+    addu_s.qb    t0, t7, t0
+    addu_s.qb    t1, t8, t1
+11:
+    sw           t0, 0(a0)
+    sw           t1, 4(a0)
+12:
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 1b
+     addiu       a0, a0, 8
+2:
+    beqz         a2, 3f
+     nop
+
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+    addiu        a1, a1, 4
+
+    not          t2, t0
+    srl          t2, t2, 24
+
+    beqz         t2, 21f
+     nop
+    beqz         t0, 3f
+
+    MIPS_UN8x4_MUL_UN8 t1, t2, t3, t4, t5, t6, t7
+
+    addu_s.qb    t0, t3, t0
+21:
+    sw           t0, 0(a0)
+
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j            ra
+     nop
+
+END(pixman_composite_over_8888_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
 /*
  * a0     - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 881d5fa..072652c 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -48,6 +48,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
 
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
                                        uint32_t, 1, uint32_t, 1)
@@ -71,6 +73,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
                                          uint8_t, 1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
                                          uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
+                                         uint32_t, 1, uint32_t, 1)
 
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
@@ -260,6 +264,11 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   mips_composite_over_8888_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   mips_composite_over_0565_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   mips_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, mips_composite_over_8888_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
@@ -342,12 +351,30 @@ mips_dspr2_fill (pixman_implementation_t *imp,
         imp->delegate, bits, stride, bpp, x, y, width, height, xor);
 }
 
+static void
+mips_dspr2_combine_over_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    if (mask)
+        pixman_composite_over_8888_8888_8888_asm_mips (
+            dest, (uint32_t *)src, (uint32_t *)mask, width);
+    else
+        pixman_composite_over_8888_8888_asm_mips (
+		    dest, (uint32_t *)src, width);
+}
+
 pixman_implementation_t *
 _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp =
         _pixman_implementation_create (fallback, mips_dspr2_fast_paths);
 
+    imp->combine_32[PIXMAN_OP_OVER] = mips_dspr2_combine_over_u;
+
     imp->blt = mips_dspr2_blt;
     imp->fill = mips_dspr2_fill;
 
-- 
1.7.3



More information about the Pixman mailing list