[Pixman] [PATCH 4/4] MIPS: DSPr2: Added OVER combiner and two new fast paths: - over_8888_8888 - over_8888_8888_8888
Nemanja Lukic
nlukic at mips.com
Fri Sep 14 00:31:26 PDT 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Referent (before):
over_8888_8888 = L1: 19.61 L2: 17.10 M: 11.16 ( 59.20%) HT: 16.47 VT: 15.81 R: 14.82 RT: 8.90 ( 50Kops/s)
over_8888_8888_8888 = L1: 13.56 L2: 11.22 M: 7.46 ( 79.18%) HT: 6.24 VT: 6.20 R: 6.11 RT: 3.95 ( 29Kops/s)
Optimized:
over_8888_8888 = L1: 46.42 L2: 36.70 M: 16.69 ( 88.57%) HT: 17.11 VT: 16.55 R: 15.31 RT: 9.48 ( 52Kops/s)
over_8888_8888_8888 = L1: 26.06 L2: 22.53 M: 11.49 (121.91%) HT: 9.93 VT: 9.62 R: 9.19 RT: 5.75 ( 36Kops/s)
---
pixman/pixman-mips-dspr2-asm.S | 124 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2.c | 27 +++++++++
2 files changed, 151 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index a4426f2..3a6b26a 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1085,6 +1085,130 @@ LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips)
END(pixman_composite_over_0565_8_0565_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, s0, s1, s2
+ li t4, 0x00ff00ff
+ beqz a3, 3f
+ nop
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 4(a1) /* t1 = source (a8r8g8b8) */
+ lw t2, 0(a2) /* t2 = mask (a8r8g8b8) */
+ lw t3, 4(a2) /* t3 = mask (a8r8g8b8) */
+ lw t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+ lw t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+ addiu a1, a1, 8
+ addiu a2, a2, 8
+ srl t2, t2, 24
+ srl t3, t3, 24
+
+ OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t0, t1
+
+ sw t7, 0(a0)
+ sw t8, 4(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+2:
+ beqz a3, 3f
+ nop
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 0(a2) /* t1 = mask (a8r8g8b8) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ srl t1, t1, 24
+
+ OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+ sw t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+ j ra
+ nop
+
+END(pixman_composite_over_8888_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, s0, s1, s2
+ li t4, 0x00ff00ff
+ beqz a2, 3f
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f
+ nop
+1:
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 4(a1) /* t1 = source (a8r8g8b8) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+ addiu a1, a1, 8
+
+ not t5, t0
+ srl t5, t5, 24
+ not t6, t1
+ srl t6, t6, 24
+
+ or t7, t5, t6
+ beqz t7, 11f
+ or t8, t0, t1
+ beqz t8, 12f
+
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t2, t3
+
+ addu_s.qb t0, t7, t0
+ addu_s.qb t1, t8, t1
+11:
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+12:
+ addiu a2, a2, -2
+ addiu t1, a2, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+2:
+ beqz a2, 3f
+ nop
+
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+ addiu a1, a1, 4
+
+ not t2, t0
+ srl t2, t2, 24
+
+ beqz t2, 21f
+ nop
+ beqz t0, 3f
+
+ MIPS_UN8x4_MUL_UN8 t1, t2, t3, t4, t5, t6, t7
+
+ addu_s.qb t0, t3, t0
+21:
+ sw t0, 0(a0)
+
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+ j ra
+ nop
+
+END(pixman_composite_over_8888_8888_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
/*
* a0 - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 881d5fa..072652c 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -48,6 +48,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
uint8_t, 3, uint8_t, 3)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
@@ -71,6 +73,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
uint8_t, 1, uint16_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
uint32_t, uint32_t)
@@ -260,6 +264,11 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, mips_composite_over_8888_8_0565),
PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, mips_composite_over_0565_8_0565),
PIXMAN_STD_FAST_PATH (OVER, b5g6r5, a8, b5g6r5, mips_composite_over_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mips_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mips_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mips_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mips_composite_over_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
@@ -342,12 +351,30 @@ mips_dspr2_fill (pixman_implementation_t *imp,
imp->delegate, bits, stride, bpp, x, y, width, height, xor);
}
+static void
+mips_dspr2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ if (mask)
+ pixman_composite_over_8888_8888_8888_asm_mips (
+ dest, (uint32_t *)src, (uint32_t *)mask, width);
+ else
+ pixman_composite_over_8888_8888_asm_mips (
+ dest, (uint32_t *)src, width);
+}
+
pixman_implementation_t *
_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp =
_pixman_implementation_create (fallback, mips_dspr2_fast_paths);
+ imp->combine_32[PIXMAN_OP_OVER] = mips_dspr2_combine_over_u;
+
imp->blt = mips_dspr2_blt;
imp->fill = mips_dspr2_fill;
--
1.7.3
More information about the Pixman
mailing list