[Pixman] [PATCH] MIPS: DSPr2: Added several bilinear fast paths: - src_8888_8_8888 - src_8888_8_0565 - src_0565_8_x888 - src_0565_8_0565 - add_8888_8_8888

Nemanja Lukic nlukic at mips.com
Mon Jun 25 12:08:33 PDT 2012


From: Nemanja Lukic <nemanja.lukic at rt-rk.com>

Performance numbers before/after on MIPS-74kc @ 1GHz:

Reference (before):

         src_8888_8_8888 =  L1:   2.25  L2:   6.15  M:  5.48 ( 32.72%)  HT:  4.67  VT:  4.65  R:  4.55  RT:  2.89 (  24Kops/s)
         src_8888_8_0565 =  L1:   5.97  L2:   5.71  M:  5.13 ( 23.81%)  HT:  4.39  VT:  4.36  R:  4.28  RT:  2.72 (  23Kops/s)
         src_0565_8_x888 =  L1:   3.32  L2:   3.27  M:  3.16 ( 14.68%)  HT:  2.87  VT:  2.85  R:  2.81  RT:  2.09 (  19Kops/s)
         src_0565_8_0565 =  L1:   3.19  L2:   3.15  M:  3.04 ( 10.09%)  HT:  2.76  VT:  2.74  R:  2.71  RT:  2.00 (  18Kops/s)
        over_8888_8_8888 =  L1:   5.04  L2:   4.75  M:  4.12 ( 27.31%)  HT:  3.60  VT:  3.58  R:  3.51  RT:  2.35 (  21Kops/s)
         add_8888_8_8888 =  L1:   5.68  L2:   5.31  M:  4.53 ( 30.05%)  HT:  3.95  VT:  3.92  R:  3.84  RT:  2.52 (  22Kops/s)

Optimized:

         src_8888_8_8888 =  L1:  13.19  L2:  12.13  M:  9.75 ( 58.22%)  HT:  8.60  VT:  8.44  R:  7.90  RT:  5.06 (  33Kops/s)
         src_8888_8_0565 =  L1:  11.64  L2:  10.81  M:  9.18 ( 42.63%)  HT:  8.04  VT:  7.90  R:  7.57  RT:  5.02 (  32Kops/s)
         src_0565_8_x888 =  L1:   8.34  L2:   7.95  M:  7.29 ( 33.85%)  HT:  6.55  VT:  6.48  R:  6.25  RT:  4.35 (  30Kops/s)
         src_0565_8_0565 =  L1:   7.71  L2:   7.35  M:  6.90 ( 22.90%)  HT:  6.14  VT:  6.10  R:  5.94  RT:  4.07 (  29Kops/s)
        over_8888_8_8888 =  L1:   9.73  L2:   8.99  M:  7.15 ( 47.41%)  HT:  6.40  VT:  6.30  R:  6.11  RT:  4.28 (  30Kops/s)
         add_8888_8_8888 =  L1:  13.01  L2:  11.72  M:  8.70 ( 57.68%)  HT:  7.59  VT:  7.46  R:  7.20  RT:  4.74 (  32Kops/s)

Benchmark results were obtained using a tweaked version of lowlevel-blt-bench (which performs bilinear scaling using an almost-identity matrix).
---
 pixman/pixman-mips-dspr2-asm.S |  322 ++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2-asm.h |   13 ++
 pixman/pixman-mips-dspr2.c     |   22 +++
 3 files changed, 357 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 87558f0..74d1ee9 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -748,6 +748,266 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
 
 END(pixman_composite_over_n_8_0565_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+/* Bilinear-scaled 32-bit source times 8-bit mask, stored as 32-bit dst.
+ * a0     - *dst         (one 32-bit word stored per pixel)
+ * a1     - *mask        (one byte read per pixel)
+ * a2     - *src_top     (32-bit pixels, indexed by vx >> 16)
+ * a3     - *src_bottom  (32-bit pixels, indexed by vx >> 16)
+ * 16(sp) - wt           (vertical weight, presumably for src_top)
+ * 20(sp) - wb           (vertical weight, presumably for src_bottom)
+ * 24(sp) - vx           (bits 31..16: pixel index, bits 15..8: x weight)
+ * 28(sp) - unit_x       (added to vx after every pixel)
+ * 32(sp) - w            (number of pixels to produce)
+ */
+
+    lw       v1, 32(sp)        /* v1 = w (loop counter) */
+    beqz     v1, 1f            /* nothing to do when w == 0 */
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt     (was 16(sp) on entry) */
+    lw       s1, 48(sp)        /* s1 = wb     (was 20(sp) on entry) */
+    lw       s2, 52(sp)        /* s2 = vx     (was 24(sp) on entry) */
+    lw       s3, 56(sp)        /* s3 = unit_x (was 28(sp) on entry) */
+    li       v0, 256           /* v0 = 256, used to form 256 - t4 below */
+    li       s8, 0x00ff00ff    /* s8 = constant handed to MIPS_UN8x4_MUL_UN8 */
+0:
+    andi     t4, s2, 0xffff    /* t4 = vx & 0xffff (low 16 bits of vx) */
+    srl      t4, t4, 8         /* t4 = (vx & 0xffff) >> 8, range 0..255 */
+    subu     t5, v0, t4        /* t5 = 256 - t4 */
+
+    mul      s4, s0, t5        /* s4 = wt * (256 - t4) */
+    mul      s5, s0, t4        /* s5 = wt * t4 */
+    mul      s6, s1, t5        /* s6 = wb * (256 - t4) */
+    mul      s7, s1, t4        /* s7 = wb * t4 */
+
+    sra      t9, s2, 16        /* t9 = vx >> 16 (source pixel index) */
+    sll      t9, t9, 2         /* t9 = byte offset of that 32-bit pixel */
+    addiu    t8, t9, 4         /* t8 = byte offset of the next pixel */
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1        /* one pixel fewer left to do */
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
+    addiu    a1, a1, 1         /* mask++ */
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
+
+    addu     s2, s2, s3        /* vx += unit_x */
+    sw       t0, 0(a0)         /* store the masked pixel */
+    bnez     v1, 0b            /* loop while pixels remain */
+     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+/* Bilinear-scaled 32-bit source times 8-bit mask, stored as 16-bit dst.
+ * a0     - *dst         (one 16-bit halfword stored per pixel)
+ * a1     - *mask        (one byte read per pixel)
+ * a2     - *src_top     (32-bit pixels, indexed by vx >> 16)
+ * a3     - *src_bottom  (32-bit pixels, indexed by vx >> 16)
+ * 16(sp) - wt           (vertical weight, presumably for src_top)
+ * 20(sp) - wb           (vertical weight, presumably for src_bottom)
+ * 24(sp) - vx           (bits 31..16: pixel index, bits 15..8: x weight)
+ * 28(sp) - unit_x       (added to vx after every pixel)
+ * 32(sp) - w            (number of pixels to produce)
+ */
+
+    lw       v1, 32(sp)        /* v1 = w (loop counter) */
+    beqz     v1, 1f            /* nothing to do when w == 0 */
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt     (was 16(sp) on entry) */
+    lw       s1, 48(sp)        /* s1 = wb     (was 20(sp) on entry) */
+    lw       s2, 52(sp)        /* s2 = vx     (was 24(sp) on entry) */
+    lw       s3, 56(sp)        /* s3 = unit_x (was 28(sp) on entry) */
+    li       v0, 256           /* v0 = 256, used to form 256 - t4 below */
+    li       s8, 0x00ff00ff    /* s8 = constant handed to MIPS_UN8x4_MUL_UN8 */
+0:
+    andi     t4, s2, 0xffff    /* t4 = vx & 0xffff (low 16 bits of vx) */
+    srl      t4, t4, 8         /* t4 = (vx & 0xffff) >> 8, range 0..255 */
+    subu     t5, v0, t4        /* t5 = 256 - t4 */
+
+    mul      s4, s0, t5        /* s4 = wt * (256 - t4) */
+    mul      s5, s0, t4        /* s5 = wt * t4 */
+    mul      s6, s1, t5        /* s6 = wb * (256 - t4) */
+    mul      s7, s1, t4        /* s7 = wb * t4 */
+
+    sra      t9, s2, 16        /* t9 = vx >> 16 (source pixel index) */
+    sll      t9, t9, 2         /* t9 = byte offset of that 32-bit pixel */
+    addiu    t8, t9, 4         /* t8 = byte offset of the next pixel */
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1        /* one pixel fewer left to do */
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
+    addiu    a1, a1, 1         /* mask++ */
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3        /* vx += unit_x */
+    sh       t1, 0(a0)         /* store t1, the register written by the 0565 conversion */
+    bnez     v1, 0b            /* loop while pixels remain */
+     addiu   a0, a0, 2         /* delay slot: dst += 2 bytes */
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+/* Bilinear-scaled 16-bit source times 8-bit mask, stored as 32-bit dst.
+ * a0     - *dst         (one 32-bit word stored per pixel)
+ * a1     - *mask        (one byte read per pixel)
+ * a2     - *src_top     (16-bit pixels, indexed by vx >> 16)
+ * a3     - *src_bottom  (16-bit pixels, indexed by vx >> 16)
+ * 16(sp) - wt           (vertical weight, presumably for src_top)
+ * 20(sp) - wb           (vertical weight, presumably for src_bottom)
+ * 24(sp) - vx           (bits 31..16: pixel index, bits 15..8: x weight)
+ * 28(sp) - unit_x       (added to vx after every pixel)
+ * 32(sp) - w            (number of pixels to produce)
+ */
+
+    lw       t0, 32(sp)        /* t0 = w (only tested here; reloaded into ra below) */
+    beqz     t0, 1f            /* nothing to do when w == 0 */
+     nop
+
+    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+
+    lw       s0, 48(sp)        /* s0 = wt     (was 16(sp) on entry) */
+    lw       s1, 52(sp)        /* s1 = wb     (was 20(sp) on entry) */
+    lw       s2, 56(sp)        /* s2 = vx     (was 24(sp) on entry) */
+    lw       s3, 60(sp)        /* s3 = unit_x (was 28(sp) on entry) */
+    lw       ra, 64(sp)        /* ra = w; ra is saved above and reused as loop counter */
+    li       v0, 0x00ff00ff    /* v0 = constant handed to MIPS_UN8x4_MUL_UN8 */
+    li       v1, 0x07e007e0    /* v1 = bits 10..5 of each halfword, for the 0565 conversion */
+    li       s8, 0x001f001f    /* s8 = bits 4..0 of each halfword, for the 0565 conversion */
+0:
+    andi     t4, s2, 0xffff    /* t4 = vx & 0xffff (low 16 bits of vx) */
+    srl      t4, t4, 8         /* t4 = (vx & 0xffff) >> 8, range 0..255 */
+    xori     t5, t4, 0xff      /* t5 = 255 - t4 (valid because t4 is 0..255) */
+    addiu    t5, t5, 1         /* t5 = 256 - t4 */
+
+    mul      s4, s0, t5        /* s4 = wt * (256 - t4) */
+    mul      s5, s0, t4        /* s5 = wt * t4 */
+    mul      s6, s1, t5        /* s6 = wb * (256 - t4) */
+    mul      s7, s1, t4        /* s7 = wb * t4 */
+
+    sra      t9, s2, 16        /* t9 = vx >> 16 (source pixel index) */
+    sll      t9, t9, 1         /* t9 = byte offset of that 16-bit pixel */
+    addiu    t8, t9, 2         /* t8 = byte offset of the next pixel */
+    lhx      t0, t9(a2)        /* t0 = tl */
+    lhx      t1, t8(a2)        /* t1 = tr */
+    andi     t1, t1, 0xffff    /* clear bits 31..16 of tr */
+    addiu    ra, ra, -1        /* one pixel fewer left to do */
+    lhx      t2, t9(a3)        /* t2 = bl */
+    lhx      t3, t8(a3)        /* t3 = br */
+    andi     t3, t3, 0xffff    /* clear bits 31..16 of br */
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
+    addiu    a1, a1, 1         /* mask++ */
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
+
+    addu     s2, s2, s3        /* vx += unit_x */
+    sw       t0, 0(a0)         /* store the masked pixel */
+    bnez     ra, 0b            /* loop while pixels remain */
+     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
+
+    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+/* Bilinear-scaled 16-bit source times 8-bit mask, stored as 16-bit dst.
+ * a0     - *dst         (one 16-bit halfword stored per pixel)
+ * a1     - *mask        (one byte read per pixel)
+ * a2     - *src_top     (16-bit pixels, indexed by vx >> 16)
+ * a3     - *src_bottom  (16-bit pixels, indexed by vx >> 16)
+ * 16(sp) - wt           (vertical weight, presumably for src_top)
+ * 20(sp) - wb           (vertical weight, presumably for src_bottom)
+ * 24(sp) - vx           (bits 31..16: pixel index, bits 15..8: x weight)
+ * 28(sp) - unit_x       (added to vx after every pixel)
+ * 32(sp) - w            (number of pixels to produce)
+ */
+
+    lw       t0, 32(sp)        /* t0 = w (only tested here; reloaded into ra below) */
+    beqz     t0, 1f            /* nothing to do when w == 0 */
+     nop
+
+    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+
+    lw       s0, 48(sp)        /* s0 = wt     (was 16(sp) on entry) */
+    lw       s1, 52(sp)        /* s1 = wb     (was 20(sp) on entry) */
+    lw       s2, 56(sp)        /* s2 = vx     (was 24(sp) on entry) */
+    lw       s3, 60(sp)        /* s3 = unit_x (was 28(sp) on entry) */
+    lw       ra, 64(sp)        /* ra = w; ra is saved above and reused as loop counter */
+    li       v0, 0x00ff00ff    /* v0 = constant handed to MIPS_UN8x4_MUL_UN8 */
+    li       v1, 0x07e007e0    /* v1 = bits 10..5 of each halfword, for the 0565 conversion */
+    li       s8, 0x001f001f    /* s8 = bits 4..0 of each halfword, for the 0565 conversion */
+0:
+    andi     t4, s2, 0xffff    /* t4 = vx & 0xffff (low 16 bits of vx) */
+    srl      t4, t4, 8         /* t4 = (vx & 0xffff) >> 8, range 0..255 */
+    xori     t5, t4, 0xff      /* t5 = 255 - t4 (valid because t4 is 0..255) */
+    addiu    t5, t5, 1         /* t5 = 256 - t4 */
+
+    mul      s4, s0, t5        /* s4 = wt * (256 - t4) */
+    mul      s5, s0, t4        /* s5 = wt * t4 */
+    mul      s6, s1, t5        /* s6 = wb * (256 - t4) */
+    mul      s7, s1, t4        /* s7 = wb * t4 */
+
+    sra      t9, s2, 16        /* t9 = vx >> 16 (source pixel index) */
+    sll      t9, t9, 1         /* t9 = byte offset of that 16-bit pixel */
+    addiu    t8, t9, 2         /* t8 = byte offset of the next pixel */
+    lhx      t0, t9(a2)        /* t0 = tl */
+    lhx      t1, t8(a2)        /* t1 = tr */
+    andi     t1, t1, 0xffff    /* clear bits 31..16 of tr */
+    addiu    ra, ra, -1        /* one pixel fewer left to do */
+    lhx      t2, t9(a3)        /* t2 = bl */
+    lhx      t3, t8(a3)        /* t3 = br */
+    andi     t3, t3, 0xffff    /* clear bits 31..16 of br */
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
+    addiu    a1, a1, 1         /* mask++ */
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3        /* vx += unit_x */
+    sh       t1, 0(a0)         /* store t1, the register written by the 0565 conversion */
+    bnez     ra, 0b            /* loop while pixels remain */
+     addiu   a0, a0, 2         /* delay slot: dst += 2 bytes */
+
+    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
 /*
  * a0     - dst        (a8r8g8b8)
@@ -810,3 +1070,65 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
      nop
 
 END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
+/* Bilinear-scaled 32-bit source times 8-bit mask, added to the 32-bit dst.
+ * a0     - *dst         (32-bit pixels, read and written in place)
+ * a1     - *mask        (one byte read per pixel)
+ * a2     - *src_top     (32-bit pixels, indexed by vx >> 16)
+ * a3     - *src_bottom  (32-bit pixels, indexed by vx >> 16)
+ * 16(sp) - wt           (vertical weight, presumably for src_top)
+ * 20(sp) - wb           (vertical weight, presumably for src_bottom)
+ * 24(sp) - vx           (bits 31..16: pixel index, bits 15..8: x weight)
+ * 28(sp) - unit_x       (added to vx after every pixel)
+ * 32(sp) - w            (number of pixels to produce)
+ */
+
+    lw       v1, 32(sp)        /* v1 = w (loop counter) */
+    beqz     v1, 1f            /* nothing to do when w == 0 */
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt     (was 16(sp) on entry) */
+    lw       s1, 48(sp)        /* s1 = wb     (was 20(sp) on entry) */
+    lw       s2, 52(sp)        /* s2 = vx     (was 24(sp) on entry) */
+    lw       s3, 56(sp)        /* s3 = unit_x (was 28(sp) on entry) */
+    li       v0, 256           /* v0 = 256, used to form 256 - t4 below */
+    li       s8, 0x00ff00ff    /* s8 = constant handed to the MUL_UN8 step */
+0:
+    andi     t4, s2, 0xffff    /* t4 = vx & 0xffff (low 16 bits of vx) */
+    srl      t4, t4, 8         /* t4 = (vx & 0xffff) >> 8, range 0..255 */
+    subu     t5, v0, t4        /* t5 = 256 - t4 */
+
+    mul      s4, s0, t5        /* s4 = wt * (256 - t4) */
+    mul      s5, s0, t4        /* s5 = wt * t4 */
+    mul      s6, s1, t5        /* s6 = wb * (256 - t4) */
+    mul      s7, s1, t4        /* s7 = wb * t4 */
+
+    sra      t9, s2, 16        /* t9 = vx >> 16 (source pixel index) */
+    sll      t9, t9, 2         /* t9 = byte offset of that 32-bit pixel */
+    addiu    t8, t9, 4         /* t8 = byte offset of the next pixel */
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1        /* one pixel fewer left to do */
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
+    lw       t2, 0(a0)         /* t2 = current dst pixel */
+    addiu    a1, a1, 1         /* mask++ */
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5
+
+    addu     s2, s2, s3        /* vx += unit_x */
+    sw       t0, 0(a0)         /* write back masked pixel + old dst (saturating add) */
+    bnez     v1, 0b            /* loop while pixels remain */
+     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 7cf3281..24b049e 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -566,6 +566,19 @@ LEAF_MIPS32R2(symbol)                                   \
     addu_s.qb              \out2_8888, \d2_8888,  \scratch2
 .endm
 
+.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \
+                                    m_8,      \
+                                    d_8888,   \
+                                    out_8888, \
+                                    maskLSR,  \
+                                    scratch1, scratch2, scratch3
+    MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+                       \out_8888, \maskLSR, \
+                       \scratch1, \scratch2, \scratch3
+    /* out_8888 now holds the MIPS_UN8x4_MUL_UN8 result; d_8888 is added next. */
+    addu_s.qb          \out_8888, \out_8888, \d_8888   /* per-byte unsigned saturating add */
+.endm
+
 .macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br,         \
                                          scratch1, scratch2,     \
                                          alpha, red, green, blue \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 06d4335..66c0e5d 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -58,8 +58,18 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
                                        uint8_t, 1, uint16_t, 1)
 
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC,
+                                             uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC,
+                                             uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC,
+                                             uint16_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC,
+                                             uint16_t, uint16_t)
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER,
                                              uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD,
+                                             uint32_t, uint32_t)
 
 static pixman_bool_t
 pixman_fill_mips (uint32_t *bits,
@@ -209,9 +219,21 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   mips_composite_over_n_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   mips_composite_over_n_8_0565),
 
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_8_0565),
+
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
 
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.3



More information about the Pixman mailing list