pixman: Branch 'master' - 3 commits

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Thu Oct 25 07:11:19 PDT 2012


 pixman/pixman-mips-dspr2-asm.S |  624 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2-asm.h |   22 +
 pixman/pixman-mips-dspr2.c     |   32 ++
 3 files changed, 678 insertions(+)

New commits:
commit f0750258459580bbc9f136710f8e5c551bd01a0f
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Sun Oct 14 11:58:52 2012 +0200

    MIPS: DSPr2: Added more fast-paths for ADD operation: - add_8888_8888_8888 - add_8_8 - add_8888_8888
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Reference (before):
            add_8888_8888_8888 =  L1:  17.55  L2:  13.35  M:  8.13 ( 93.95%)  HT:  6.60  VT:  6.64  R:  6.45  RT:  3.47 (  26Kops/s)
            add_8_8            =  L1:  86.07  L2:  84.89  M: 62.36 ( 90.11%)  HT: 36.36  VT: 34.74  R: 29.56  RT: 11.56 (  52Kops/s)
            add_8888_8888      =  L1:  95.59  L2:  73.05  M: 17.62 (101.84%)  HT: 15.46  VT: 15.01  R: 13.94  RT:  6.71 (  42Kops/s)
    
    Optimized:
            add_8888_8888_8888 =  L1:  41.52  L2:  33.21  M: 11.97 (138.45%)  HT: 10.47  VT: 10.19  R:  9.42  RT:  4.86 (  32Kops/s)
            add_8_8            =  L1: 135.06  L2: 104.82  M: 57.13 ( 82.58%)  HT: 34.79  VT: 36.60  R: 28.28  RT: 10.54 (  51Kops/s)
            add_8888_8888      =  L1: 176.36  L2:  67.82  M: 17.48 (101.06%)  HT: 15.16  VT: 14.62  R: 13.88  RT:  8.05 (  45Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 7c8ca30..b5cae16 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1631,6 +1631,208 @@ LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips)
 
 END(pixman_composite_add_8888_n_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_8888_asm_mips)
+/* ADD with an a8r8g8b8 mask: src is combined with the mask's top byte, then added to dst.
+ * a0 - dst  (a8r8g8b8), read and written in place
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8r8g8b8), only bits 31..24 are used (srl by 24 below)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2 /* s0-s2 are handed to the 2-pixel macro as scratch */
+    li       t4, 0x00ff00ff /* passed as the maskLSR macro argument */
+    beqz     a3, 3f /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f /* w == 1: skip the paired loop */
+     nop
+1: /* main loop: two pixels per pass */
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
+    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 8
+    srl      t2, t2, 24 /* keep only the mask's top byte */
+    srl      t3, t3, 24 /* keep only the mask's top byte */
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t7, t8, \
+                                       t4, t9, s0, s1, s2, t0, t1 /* NOTE(review): t0/t1 are both the sources and scratch5/6; relies on the multiply macro reading the sources first - confirm */
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b /* loop while at least two pixels remain */
+     addiu   a0, a0, 8 /* delay slot: advance dst by two pixels */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    srl      t1, t1, 24 /* keep only the mask's top byte */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0) /* t3 is the macro's output register */
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8_8_asm_mips)
+/* Unmasked ADD for a8: each dst byte gets the src byte added with unsigned saturation.
+ * a0 - dst  (a8), read and written in place
+ * a1 - src  (a8)
+ * a2 - w    (number of pixels; 0 returns immediately)
+ */
+
+    beqz              a2, 3f
+     nop
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 dst pixels */
+    beqz              t9, 1f      /* branch if less than 4 src pixels */
+     nop
+
+0: /* main loop: four pixels per pass, t9 counts the remaining groups */
+    beqz              t9, 1f
+     addiu            t9, t9, -1 /* delay slot: executed even when the branch is taken */
+    lbu               t0, 0(a1)
+    lbu               t1, 1(a1)
+    lbu               t2, 2(a1)
+    lbu               t3, 3(a1)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a1, a1, 4
+
+    precr_sra.ph.w    t1, t0, 0 /* t1 = src1:src0 as two halfwords */
+    precr_sra.ph.w    t3, t2, 0 /* t3 = src3:src2 */
+    precr_sra.ph.w    t5, t4, 0 /* t5 = dst1:dst0 */
+    precr_sra.ph.w    t7, t6, 0 /* t7 = dst3:dst2 */
+
+    precr.qb.ph       t0, t3, t1 /* t0 = four src bytes, pixel 0 in the low byte */
+    precr.qb.ph       t1, t7, t5 /* t1 = four dst bytes, same layout */
+
+    addu_s.qb         t2, t0, t1 /* per-byte unsigned saturating add */
+
+    sb                t2, 0(a0) /* store the four result bytes, low byte first */
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4
+    b                 0b
+     addiu            a0, a0, 4 /* delay slot: advance dst by four pixels */
+
+1: /* a2 now holds the 0..3 pixels not covered by the main loop */
+    beqz              a2, 3f
+     nop
+2: /* tail loop: one pixel per pass */
+    lbu               t0, 0(a1)
+    lbu               t1, 0(a0)
+    addiu             a1, a1, 1
+
+    addu_s.qb         t2, t0, t1 /* upper bytes are zero after lbu, only the low byte matters */
+    sb                t2, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 2b
+     addiu            a0, a0, 1 /* delay slot: advance dst by one pixel */
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_add_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips)
+/* Unmasked ADD: every byte of dst gets the matching src byte added with unsigned saturation.
+ * a0 - dst (a8r8g8b8), read and written in place
+ * a1 - src (a8r8g8b8)
+ * a2 - w   (number of pixels; 0 returns immediately)
+ */
+
+    beqz         a2, 4f
+     nop
+
+    srl          t9, a2, 2      /* t9 = how many multiples of 4 src pixels */
+    beqz         t9, 3f         /* branch if less than 4 src pixels */
+     nop
+1: /* 4-pixel groups; the final full group is handled at label 2 instead */
+    addiu        t9, t9, -1
+    beqz         t9, 2f /* this was the last full group */
+     addiu       a2, a2, -4 /* delay slot: runs whether or not the branch is taken */
+
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0 /* dst = saturate(dst + src), byte-wise */
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+    b            1b
+     addiu       a0, a0, 16 /* delay slot: advance dst by four pixels */
+2: /* last full 4-pixel group; a2 was already reduced by 4 in the delay slot above */
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+
+    beqz         a2, 4f /* width was a multiple of 4: done */
+     addiu       a0, a0, 16
+3: /* tail loop: one pixel per pass for the remaining 1..3 pixels */
+    lw           t0, 0(a1)
+    lw           t1, 0(a0)
+    addiu        a1, a1, 4
+    addiu        a2, a2, -1
+    addu_s.qb    t1, t1, t0
+    sw           t1, 0(a0)
+    bnez         a2, 3b
+     addiu       a0, a0, 4 /* delay slot: advance dst by one pixel */
+4:
+    jr           ra
+     nop
+
+END(pixman_composite_add_8888_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
 /*
  * a0     - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 1471750..9da636d 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -50,6 +50,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
+                                    uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
 
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
                                        uint32_t, 1, uint32_t, 1)
@@ -77,6 +81,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t,  1,
                                          uint8_t,  1, uint8_t,  1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1,
                                          uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8888_8888, uint32_t, 1,
+                                         uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1,
                                          uint8_t,  1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
@@ -291,8 +297,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   mips_composite_add_0565_8_0565),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, mips_composite_add_8888_8_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, mips_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_add_8888_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, mips_composite_add_8888_n_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, mips_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       mips_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, mips_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, mips_composite_add_8888_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
commit ca83717c63813b6f53f89dd94b5771bd32382a18
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Sun Oct 14 11:58:51 2012 +0200

    MIPS: DSPr2: Added more fast-paths for ADD operation: - add_0565_8_0565 - add_8888_8_8888 - add_8888_n_8888
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Reference (before):
            add_0565_8_0565 =  L1:   8.89  L2:   8.37  M:  7.35 ( 29.22%)  HT:  5.90  VT:  5.85  R:  5.67  RT:  3.31 (  26Kops/s)
            add_8888_8_8888 =  L1:  17.22  L2:  14.17  M:  9.89 ( 65.56%)  HT:  7.57  VT:  7.50  R:  7.36  RT:  4.10 (  30Kops/s)
            add_8888_n_8888 =  L1:  17.79  L2:  14.87  M: 10.35 ( 54.89%)  HT:  5.19  VT:  4.93  R:  4.92  RT:  1.90 (  19Kops/s)
    
    Optimized:
            add_0565_8_0565 =  L1:  21.72  L2:  20.01  M: 14.96 ( 59.54%)  HT: 12.03  VT: 11.81  R: 11.26  RT:  6.33 (  37Kops/s)
            add_8888_8_8888 =  L1:  47.42  L2:  38.64  M: 15.90 (105.48%)  HT: 13.34  VT: 13.03  R: 11.84  RT:  6.63 (  38Kops/s)
            add_8888_n_8888 =  L1:  54.83  L2:  42.66  M: 17.36 ( 92.11%)  HT: 15.20  VT: 14.82  R: 13.66  RT:  7.83 (  41Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 614c628..7c8ca30 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1461,6 +1461,176 @@ LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
 
 END(pixman_composite_add_n_8_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_add_0565_8_0565_asm_mips)
+/* ADD for r5g6b5 with an a8 mask: pixels are widened to 8888, combined, and narrowed back.
+ * a0 - dst  (r5g6b5), read and written in place
+ * a1 - src  (r5g6b5)
+ * a2 - mask (a8)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+    li       t4, 0xf800f800 /* t4-t6: constants handed to the 2-pixel CONVERT macros in the loop */
+    li       t5, 0x07e007e0
+    li       t6, 0x001F001F
+    li       t7, 0x00ff00ff /* passed as the maskLSR macro argument */
+    beqz     a3, 3f /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f /* w == 1: skip the paired loop */
+     nop
+1: /* main loop: two pixels per pass */
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
+    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
+    addiu    a1, a1, 4
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888  t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 /* widened sources land in s0, s1 (they are the macro inputs below) */
+    CONVERT_2x0565_TO_2x8888  t8, t9, s2, s3, t5, t6, s4, s5, s6, s7 /* widened destinations land in s2, s3 */
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4  s0, s1, \
+                                        t2, t3, \
+                                        s2, s3, \
+                                        t0, t1, \
+                                        t7, s4, s5, s6, s7, t8, t9
+    CONVERT_2x8888_TO_2x0565  t0, t1, s0, s1, t4, t5, t6, s2, s3 /* narrowed results land in s0, s1 (stored below) */
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b /* loop while at least two pixels remain */
+     addiu   a0, a0, 4 /* delay slot: advance dst by two pixels */
+2: /* tail: zero or one pixel left; the loop is never re-entered from here */
+    beqz     a3, 3f
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888  t0, t3, t4, t5 /* t3 = widened source; t4/t5 are reused here, presumably as scratch - the loop constants are dead by now */
+    CONVERT_1x0565_TO_1x8888  t2, t4, t5, t6 /* t4 = widened destination */
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4  t3, t1, t4, t0, t7, t2, t5, t6 /* result in t0; t7 still holds 0x00ff00ff */
+    CONVERT_1x8888_TO_1x0565  t0, t3, t4, t5 /* t3 = narrowed result */
+
+    sh       t3, 0(a0)
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+    j        ra
+     nop
+
+END(pixman_composite_add_0565_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8_8888_asm_mips)
+/* ADD with an a8 mask: src is combined with the mask byte, then added to dst.
+ * a0 - dst  (a8r8g8b8), read and written in place
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2 /* s0-s2 are handed to the 2-pixel macro as scratch */
+    li       t4, 0x00ff00ff /* passed as the maskLSR macro argument */
+    beqz     a3, 3f /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f /* w == 1: skip the paired loop */
+     nop
+1: /* main loop: two pixels per pass */
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t7, t8, \
+                                       t4, t9, s0, s1, s2, t0, t1 /* NOTE(review): t0/t1 are both the sources and scratch5/6; relies on the multiply macro reading the sources first - confirm */
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b /* loop while at least two pixels remain */
+     addiu   a0, a0, 8 /* delay slot: advance dst by two pixels */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0) /* t3 is the macro's output register */
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips)
+/* ADD with a constant mask: src is combined with the mask's top byte, then added to dst.
+ * a0 - dst  (a8r8g8b8), read and written in place
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (32bit constant), only bits 31..24 are used
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2 /* s0-s2 are handed to the 2-pixel macro as scratch */
+    li       t4, 0x00ff00ff /* passed as the maskLSR macro argument */
+    beqz     a3, 3f /* w == 0: nothing to do */
+     nop
+    srl      a2, a2, 24 /* reduce the constant mask to its top byte once, before either path */
+    addiu    t1, a3, -1
+    beqz     t1, 2f /* w == 1: skip the paired loop */
+     nop
+1: /* main loop: two pixels per pass */
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       a2, a2, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t4, t7, t8, t9, s0, s1, s2 /* NOTE(review): a2 is passed as both mask operands and reused every pass; assumes the macro leaves it usable - confirm */
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b /* loop while at least two pixels remain */
+     addiu   a0, a0, 8 /* delay slot: advance dst by two pixels */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, a2, t1, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0) /* t3 is the macro's output register */
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_n_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
 /*
  * a0     - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 30d2a85..1471750 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -70,9 +70,15 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
                                       uint32_t, 1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
                                       uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888,
+                                      uint32_t, 1, uint32_t, 1)
 
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t,  1,
                                          uint8_t,  1, uint8_t,  1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1,
+                                         uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1,
+                                         uint8_t,  1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
                                          uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
@@ -281,6 +287,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, mips_composite_add_n_8_8888),
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, mips_composite_add_n_8_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       mips_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   mips_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   mips_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, mips_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, mips_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, mips_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, mips_composite_add_8888_n_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
commit 52d20e692ebc605077448ab6f52fd257f83481b2
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Sun Oct 14 11:58:50 2012 +0200

    MIPS: DSPr2: Added fast-paths for ADD operation: - add_n_8_8 - add_n_8_8888 - add_8_8_8
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Reference (before):
            add_n_8_8    =  L1:  41.37  L2:  37.83  M: 30.38 ( 60.45%)  HT: 23.70  VT: 22.85  R: 21.51  RT: 10.32 (  45Kops/s)
            add_n_8_8888 =  L1:  16.01  L2:  14.46  M: 11.64 ( 46.32%)  HT:  5.50  VT:  5.18  R:  5.06  RT:  1.89 (  18Kops/s)
            add_8_8_8    =  L1:  13.26  L2:  12.47  M: 11.16 ( 29.61%)  HT:  8.09  VT:  8.04  R:  7.68  RT:  3.90 (  29Kops/s)
    
    Optimized:
            add_n_8_8    =  L1:  96.03  L2:  79.37  M: 51.89 (103.31%)  HT: 32.59  VT: 31.29  R: 28.52  RT: 11.08 (  46Kops/s)
            add_n_8_8888 =  L1:  53.61  L2:  46.92  M: 23.78 ( 94.70%)  HT: 19.06  VT: 18.64  R: 17.30  RT:  9.15 (  43Kops/s)
            add_8_8_8    =  L1:  89.65  L2:  66.82  M: 37.10 ( 98.48%)  HT: 22.10  VT: 21.74  R: 20.12  RT:  8.12 (  41Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 3a6b26a..614c628 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1209,6 +1209,258 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
 
 END(pixman_composite_over_8888_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips)
+/* ADD for a8 with an a8 mask: src * mask (rounded, scaled back to 8 bits) is added to dst with saturation.
+ * a0 - dst  (a8), read and written in place
+ * a1 - src  (a8)
+ * a2 - mask (a8)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, v0, v1 /* v0 is the group counter, v1 a temporary in the main loop */
+    li                t9, 0x00ff00ff /* per-halfword low-byte mask used in the main loop */
+    beqz              a3, 3f
+     nop
+
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+0: /* main loop: four pixels per pass */
+    beqz              v0, 1f
+     addiu            v0, v0, -1 /* delay slot: executed even when the branch is taken */
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0 /* t1 = mask1:mask0 as two halfwords */
+    precr_sra.ph.w    t3, t2, 0 /* t3 = mask3:mask2 */
+    precr_sra.ph.w    t5, t4, 0 /* t5 = dst1:dst0 */
+    precr_sra.ph.w    t7, t6, 0 /* t7 = dst3:dst2 */
+
+    precr.qb.ph       t0, t3, t1 /* t0 = four mask bytes, pixel 0 in the low byte */
+    precr.qb.ph       t1, t7, t5 /* t1 = four dst bytes, same layout */
+
+    lbu               t4, 0(a1)
+    lbu               v1, 1(a1)
+    lbu               t7, 2(a1)
+    lbu               t8, 3(a1)
+
+    addiu             a1, a1, 4
+
+    precr_sra.ph.w    v1, t4, 0 /* v1 = src1:src0 as two halfwords */
+    precr_sra.ph.w    t8, t7, 0 /* t8 = src3:src2 */
+
+    muleu_s.ph.qbl    t2, t0, t8 /* upper two mask bytes times src3, src2 */
+    muleu_s.ph.qbr    t3, t0, v1 /* lower two mask bytes times src1, src0 */
+    shra_r.ph         t4, t2, 8 /* from here to the second shra_r pair: t = x + ((x + 0x80) >> 8), result = (t + 0x80) >> 8 */
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3 /* pack the four 8-bit products back into one word */
+
+    addu_s.qb         t2, t0, t1 /* per-byte unsigned saturating add with dst */
+
+    sb                t2, 0(a0) /* store the four result bytes, low byte first */
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4 /* delay slot: advance dst by four pixels */
+
+1: /* a3 now holds the 0..3 pixels not covered by the main loop */
+    beqz              a3, 3f
+     nop
+2: /* tail loop: one pixel per pass */
+    lbu               t8, 0(a1)
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a1, a1, 1
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8 /* t2 = mask * src, at most 255 * 255 */
+    shra_r.ph         t3, t2, 8 /* same rounding sequence as the main loop, on one value */
+    andi              t3, t3, 0xff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    andi              t2, t2, 0xff
+
+    addu_s.qb         t2, t2, t1 /* saturating add with dst; only the low byte is stored */
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1 /* delay slot: advance dst by one pixel */
+
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, v0, v1
+    j                 ra
+     nop
+
+END(pixman_composite_add_8_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8_asm_mips)
+/* ADD of a constant source into a8 through an a8 mask: (src top byte * mask) is added to dst with saturation.
+ * a0 - dst  (a8), read and written in place
+ * a1 - src  (32bit constant), only bits 31..24 are used
+ * a2 - mask (a8)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, v0 /* v0 is the 4-pixel group counter */
+    li                t9, 0x00ff00ff /* per-halfword low-byte mask used in the main loop */
+    beqz              a3, 3f
+     nop
+
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24 /* t8 = top byte of the constant source */
+    replv.ph          t8, t8 /* replicate it into both halfwords for the paired multiplies */
+
+0: /* main loop: four pixels per pass */
+    beqz              v0, 1f
+     addiu            v0, v0, -1 /* delay slot: executed even when the branch is taken */
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0 /* t1 = mask1:mask0 as two halfwords */
+    precr_sra.ph.w    t3, t2, 0 /* t3 = mask3:mask2 */
+    precr_sra.ph.w    t5, t4, 0 /* t5 = dst1:dst0 */
+    precr_sra.ph.w    t7, t6, 0 /* t7 = dst3:dst2 */
+
+    precr.qb.ph       t0, t3, t1 /* t0 = four mask bytes, pixel 0 in the low byte */
+    precr.qb.ph       t1, t7, t5 /* t1 = four dst bytes, same layout */
+
+    muleu_s.ph.qbl    t2, t0, t8 /* upper two mask bytes times the source byte */
+    muleu_s.ph.qbr    t3, t0, t8 /* lower two mask bytes times the source byte */
+    shra_r.ph         t4, t2, 8 /* from here to the second shra_r pair: t = x + ((x + 0x80) >> 8), result = (t + 0x80) >> 8 */
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3 /* pack the four 8-bit products back into one word */
+
+    addu_s.qb         t2, t0, t1 /* per-byte unsigned saturating add with dst */
+
+    sb                t2, 0(a0) /* store the four result bytes, low byte first */
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4 /* delay slot: advance dst by four pixels */
+
+1: /* a3 now holds the 0..3 pixels not covered by the main loop */
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24 /* recompute the plain source byte: the w < 4 entry skipped the setup above, and the loop path left t8 replicated */
+2: /* tail loop: one pixel per pass */
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8 /* t2 = mask * source byte, at most 255 * 255 */
+    shra_r.ph         t3, t2, 8 /* same rounding sequence as the main loop, on one value */
+    andi              t3, t3, 0xff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    andi              t2, t2, 0xff
+
+    addu_s.qb         t2, t2, t1 /* saturating add with dst; only the low byte is stored */
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1 /* delay slot: advance dst by one pixel */
+
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, v0
+    j                 ra
+     nop
+
+END(pixman_composite_add_n_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
+/* ADD of a constant source through an a8 mask: the source is combined with each mask byte, then added to dst.
+ * a0 - dst  (a8r8g8b8), read and written in place
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w    (number of pixels; 0 returns immediately)
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2 /* s0-s2 are handed to the 2-pixel macro as scratch */
+    li       t4, 0x00ff00ff /* passed as the maskLSR macro argument */
+    beqz     a3, 3f /* w == 0: nothing to do */
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f /* w == 1: skip the paired loop */
+     nop
+1: /* main loop: two pixels per pass */
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask        (a8) */
+    lbu      t1, 1(a2) /* t1 = mask        (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 a1, a1, \
+                                       t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t4, t7, t8, t9, s0, s1, s2 /* a1 is passed as both source operands and must survive for the next pass */
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b /* loop while at least two pixels remain */
+     addiu   a0, a0, 8 /* delay slot: advance dst by two pixels */
+2: /* tail: zero or one pixel left */
+    beqz     a3, 3f
+     nop
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask        (a8) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 a1, t0, t1, t2, t4, t3, t5, t6
+
+    sw       t2, 0(a0) /* t2 is the macro's output register */
+3: /* common exit, also reached directly when w == 0 */
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_n_8_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
 /*
  * a0     - *dst
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 7327dc6..b330c0f 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -600,6 +600,28 @@ LEAF_MIPS32R2(symbol)                                   \
     addu_s.qb          \out_8888, \out_8888, \d_8888
 .endm
 
+.macro MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 s1_8888,   \
+                             s2_8888,   \
+                             m1_8,      \
+                             m2_8,      \
+                             d1_8888,   \
+                             d2_8888,   \
+                             out1_8888, \
+                             out2_8888, \
+                             maskLSR,   \
+                             scratch1,  scratch2, scratch3, \
+                             scratch4, scratch5, scratch6
+    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888,   \s2_8888, \
+                           \m1_8,      \m2_8, \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch1,  \scratch2, \scratch3, \
+                           \scratch4,  \scratch5, \scratch6
+    /* Two-pixel variant: MIPS_2xUN8x4_MUL_2xUN8 leaves its results in out1/out2; d1/d2 are then added on top. */
+    addu_s.qb             \out1_8888, \out1_8888, \d1_8888 /* per-byte unsigned saturating add of destination 1 */
+    addu_s.qb             \out2_8888, \out2_8888, \d2_8888 /* per-byte unsigned saturating add of destination 2 */
+.endm
+
 .macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br,         \
                                          scratch1, scratch2,     \
                                          alpha, red, green, blue \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e80bbb6..30d2a85 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -59,6 +59,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
                                        uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
                                        uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
 
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888,
                                       uint32_t, 1, uint32_t, 1)
@@ -67,6 +71,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
                                       uint16_t, 1, uint16_t, 1)
 
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t,  1,
+                                         uint8_t,  1, uint8_t,  1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
                                          uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
@@ -271,6 +277,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       mips_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, mips_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, mips_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       mips_composite_add_8_8_8),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),


More information about the xorg-commit mailing list