[Pixman] [PATCH 1/2] MIPS: DSPr2: Added over_n_8_8888 and over_n_8_0565 fast paths.
Nemanja Lukic
nlukic at mips.com
Wed May 2 15:03:42 PDT 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz
Referent (before):
lowlevel-blt-bench:
over_n_8_8888 = L1: 10.40 L2: 9.79 M: 8.47 ( 33.62%) HT: 7.64 VT: 7.59 R: 7.48 RT: 5.30 ( 40Kops/s)
over_n_8_0565 = L1: 7.40 L2: 7.23 M: 6.78 ( 17.94%) HT: 6.23 VT: 6.17 R: 6.14 RT: 4.62 ( 37Kops/s)
Optimized:
lowlevel-blt-bench:
over_n_8_8888 = L1: 27.25 L2: 26.24 M: 18.15 ( 72.12%) HT: 14.52 VT: 14.31 R: 13.83 RT: 7.57 ( 48Kops/s)
over_n_8_0565 = L1: 18.91 L2: 17.59 M: 15.06 ( 39.90%) HT: 12.18 VT: 11.98 R: 11.83 RT: 6.80 ( 46Kops/s)
---
pixman/pixman-mips-dspr2-asm.S | 224 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2-asm.h | 67 ++++++++++++
pixman/pixman-mips-dspr2.c | 10 ++
3 files changed, 301 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 6a0fc18..68ad33f 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -527,3 +527,227 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
nop
END(pixman_composite_over_n_8888_0565_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4
+ beqz a3, 4f
+ nop
+ li t4, 0x00ff00ff
+ li t5, 0xff
+ addiu t0, a3, -1
+ beqz t0, 3f /* last pixel */
+ srl t6, a1, 24 /* t6 = srca */
+ not s4, a1
+ beq t5, t6, 2f /* if (srca == 0xff) */
+ srl s4, s4, 24
+1:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t3, t0, t1
+
+ lw t2, 0(a0) /* t2 = dst */
+ beq t3, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */
+ lw t3, 4(a0) /* t3 = dst */
+
+ MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3
+ not s2, s0
+ not s3, s1
+ srl s2, s2, 24
+ srl s3, s3, 24
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9
+ addu_s.qb s2, t2, s0
+ addu_s.qb s3, t3, s1
+ sw s2, 0(a0)
+ b 111f
+ sw s3, 4(a0)
+11:
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9
+ addu_s.qb s2, t2, a1
+ addu_s.qb s3, t3, a1
+ sw s2, 0(a0)
+ sw s3, 4(a0)
+
+111:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 1b
+ addiu a0, a0, 8
+ b 3f
+ nop
+2:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t3, t0, t1
+ beq t3, t5, 22f /* if (t0 == 0xff) && (t1 == 0xff) */
+ nop
+ lw t2, 0(a0) /* t2 = dst */
+ lw t3, 4(a0) /* t3 = dst */
+
+ OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \
+ t6, t7, t4, t8, t9, s0, s1, s2, s3
+ sw t6, 0(a0)
+ b 222f
+ sw t7, 4(a0)
+22:
+ sw a1, 0(a0)
+ sw a1, 4(a0)
+222:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 2b
+ addiu a0, a0, 8
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ beqz t0, 4f /* if (t0 == 0) */
+ addiu a2, a2, 1
+ move t3, a1
+ beq t0, t5, 31f /* if (t0 == 0xff) */
+ lw t1, 0(a0) /* t1 = dst */
+
+ MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8
+31:
+ not t2, t3
+ srl t2, t2, 24
+ MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8
+ addu_s.qb t2, t1, t3
+ sw t2, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4
+ j ra
+ nop
+
+END(pixman_composite_over_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+ SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ beqz a3, 4f
+ nop
+ li t4, 0x00ff00ff
+ li t5, 0xff
+ li t6, 0xf800f800
+ li t7, 0x07e007e0
+ li t8, 0x001F001F
+ addiu t1, a3, -1
+ beqz t1, 3f /* last pixel */
+ srl t0, a1, 24 /* t0 = srca */
+ not v0, a1
+ beq t0, t5, 2f /* if (srca == 0xff) */
+ srl v0, v0, 24
+1:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ lhu t2, 0(a0) /* t2 = dst */
+ lhu t3, 2(a0) /* t3 = dst */
+ CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4
+ and t9, t0, t1
+ beq t9, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */
+ nop
+
+ MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8
+ not s4, s2
+ not s5, s3
+ srl s4, s4, 24
+ srl s5, s5, 24
+ MIPS_2xUN8x4_MUL_2xUN8 s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8
+ addu_s.qb s4, s2, s0
+ addu_s.qb s5, s3, s1
+ CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+ sh t2, 0(a0)
+ b 111f
+ sh t3, 2(a0)
+11:
+ MIPS_2xUN8x4_MUL_2xUN8 s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8
+ addu_s.qb s4, a1, s0
+ addu_s.qb s5, a1, s1
+ CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+ sh t2, 0(a0)
+ sh t3, 2(a0)
+111:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 1b
+ addiu a0, a0, 4
+ b 3f
+ nop
+2:
+ CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2
+21:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t9, t0, t1
+ move s2, s0
+ beq t9, t5, 22f /* if (t0 == 0xff) && (t2 == 0xff) */
+ move s3, s0
+ lhu t2, 0(a0) /* t2 = dst */
+ lhu t3, 2(a0) /* t3 = dst */
+
+ CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7
+ OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, s2, s3, \
+ t2, t3, t4, t9, s4, s5, s6, s7, s8
+ CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5
+22:
+ sh s2, 0(a0)
+ sh s3, 2(a0)
+222:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 21b
+ addiu a0, a0, 4
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ beqz t0, 4f /* if (t0 == 0) */
+ nop
+ lhu t1, 0(a0) /* t1 = dst */
+ CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7
+ beq t0, t5, 31f /* if (t0 == 0xff) */
+ move t3, a1
+
+ MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t7, t8, t9
+31:
+ not t6, t3
+ srl t6, t6, 24
+ MIPS_UN8x4_MUL_UN8 t2, t6, t2, t4, t7, t8, t9
+ addu_s.qb t1, t2, t3
+ CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7
+ sh t2, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ j ra
+ nop
+
+END(pixman_composite_over_n_8_0565_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 12ff42c..8383060 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -499,4 +499,71 @@ LEAF_MIPS32R2(symbol) \
precr.qb.ph \d2_8888, \scratch5, \scratch6
.endm
+/*
+ * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8
+ * destination pixel (d_8888) using a8 mask (m_8). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro OVER_8888_8_8888 s_8888, \
+ m_8, \
+ d_8888, \
+ out_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, scratch4
+ MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+ \scratch1, \maskLSR, \
+ \scratch2, \scratch3, \scratch4
+
+ not \scratch2, \scratch1
+ srl \scratch2, \scratch2, 24
+
+ MIPS_UN8x4_MUL_UN8 \d_8888, \scratch2, \
+ \d_8888, \maskLSR, \
+ \scratch3, \scratch4, \out_8888
+
+ addu_s.qb \out_8888, \d_8888, \scratch1
+.endm
+
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and
+ * m2_8). It also requires maskLSR needed for rounding process. maskLSR must
+ * have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8_2x8888 s1_8888, \
+ s2_8888, \
+ m1_8, \
+ m2_8, \
+ d1_8888, \
+ d2_8888, \
+ out1_8888, \
+ out2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ MIPS_2xUN8x4_MUL_2xUN8 \s1_8888, \s2_8888, \
+ \m1_8, \m2_8, \
+ \scratch1, \scratch2, \
+ \maskLSR, \
+ \scratch3, \scratch4, \out1_8888, \
+ \out2_8888, \scratch5, \scratch6
+
+ not \scratch3, \scratch1
+ srl \scratch3, \scratch3, 24
+ not \scratch4, \scratch2
+ srl \scratch4, \scratch4, 24
+
+ MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \
+ \scratch3, \scratch4, \
+ \d1_8888, \d2_8888, \
+ \maskLSR, \
+ \scratch5, \scratch6, \out1_8888, \
+ \out2_8888, \scratch3, \scratch4
+
+ addu_s.qb \out1_8888, \d1_8888, \scratch1
+ addu_s.qb \out2_8888, \d2_8888, \scratch2
+.endm
+
#endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 018770a..7081734 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -53,6 +53,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
+ uint8_t, 1, uint16_t, 1)
static pixman_bool_t
pixman_fill_mips (uint32_t *bits,
@@ -195,6 +199,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565),
{ PIXMAN_OP_NONE },
};
--
1.7.3
More information about the Pixman
mailing list