pixman: Branch 'master' - 2 commits
Nemanja Lukic
nlukic at kemper.freedesktop.org
Wed Feb 27 05:43:40 PST 2013
pixman/pixman-mips-dspr2-asm.S | 587 +++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2.c | 17 +
2 files changed, 604 insertions(+)
New commits:
commit 5feda20fc39407879993ed4a6d861ef7f78d9432
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date: Wed Feb 27 14:40:51 2013 +0100
MIPS: DSPr2: Added more fast-paths for SRC operation:
- src_0888_8888_rev
- src_0888_0565_rev
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
src_0888_8888_rev = L1: 51.88 L2: 42.00 M: 19.04 ( 88.50%) HT: 15.27 VT: 14.62 R: 14.13 RT: 7.12 ( 45Kops/s)
src_0888_0565_rev = L1: 31.96 L2: 30.90 M: 22.60 ( 75.03%) HT: 15.32 VT: 15.11 R: 14.49 RT: 6.64 ( 43Kops/s)
Optimized:
src_0888_8888_rev = L1: 222.73 L2: 113.70 M: 20.97 ( 97.35%) HT: 18.31 VT: 17.14 R: 16.71 RT: 9.74 ( 54Kops/s)
src_0888_0565_rev = L1: 100.37 L2: 74.27 M: 29.43 ( 97.63%) HT: 22.92 VT: 21.59 R: 20.52 RT: 10.56 ( 56Kops/s)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 299f739..3adbb2a 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,395 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
END(pixman_composite_src_x888_8888_asm_mips)
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_8888_rev_asm_mips)
+/* SRC fast path: expand w packed 3-byte src pixels to 32-bit dst pixels, top byte forced to 0xff.
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+ beqz a2, 6f /* w == 0: nothing to do */
+ nop
+
+ lui t8, 0xff00; /* t8 = 0xff000000, OR-ed into every output pixel */
+ srl t9, a2, 2 /* t9 = how many multiples of 4 src pixels */
+ beqz t9, 4f /* branch if less than 4 src pixels */
+ nop
+
+ li t0, 0x1
+ li t1, 0x2
+ li t2, 0x3
+ andi t3, a1, 0x3 /* t3 = src address modulo 4, selects the loop variant */
+ beq t3, t0, 1f
+ nop
+ beq t3, t1, 2f
+ nop
+ beq t3, t2, 3f
+ nop
+
+0: /* src & 3 == 0: three aligned words hold exactly 4 pixels */
+ beqz t9, 4f /* no full group of 4 left: finish pixel by pixel */
+ addiu t9, t9, -1 /* follows the branch; presumably its delay slot (noreorder assumed, not visible here) */
+ lw t0, 0(a1) /* t0 = R2 | B1 | G1 | R1 */
+ lw t1, 4(a1) /* t1 = G3 | R3 | B2 | G2 */
+ lw t2, 8(a1) /* t2 = B4 | G4 | R4 | B3 */
+
+ addiu a1, a1, 12 /* 4 src pixels (12 bytes) consumed */
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = B1 | R2 | R1 | G1 */
+ wsbh t1, t1 /* t1 = R3 | G3 | G2 | B2 */
+ wsbh t2, t2 /* t2 = G4 | B4 | B3 | R4 */
+
+ packrl.ph t3, t1, t0 /* t3 = G2 | B2 | B1 | R2 */
+ packrl.ph t4, t0, t0 /* t4 = R1 | G1 | B1 | R2 */
+ rotr t3, t3, 16 /* t3 = B1 | R2 | G2 | B2 */
+ or t3, t3, t8 /* t3 = FF | R2 | G2 | B2 */
+ srl t4, t4, 8 /* t4 = 0 | R1 | G1 | B1 */
+ or t4, t4, t8 /* t4 = FF | R1 | G1 | B1 */
+ packrl.ph t5, t2, t1 /* t5 = B3 | R4 | R3 | G3 */
+ rotr t5, t5, 24 /* t5 = R4 | R3 | G3 | B3 */
+ or t5, t5, t8 /* t5 = FF | R3 | G3 | B3 */
+ rotr t2, t2, 16 /* t2 = B3 | R4 | G4 | B4 */
+ or t2, t2, t8 /* t2 = FF | R4 | G4 | B4 */
+
+ sw t4, 0(a0) /* pixel 1 */
+ sw t3, 4(a0) /* pixel 2 */
+ sw t5, 8(a0) /* pixel 3 */
+ sw t2, 12(a0) /* pixel 4 */
+ b 0b
+ addiu a0, a0, 16 /* 4 dst pixels (16 bytes) written */
+
+1: /* src & 3 == 1: word loads at offsets 3, 7, 11 are aligned */
+ lbu t6, 0(a1) /* t6 = 0 | 0 | 0 | R1 */
+ lhu t7, 1(a1) /* t7 = 0 | 0 | B1 | G1 */
+ sll t6, t6, 16 /* t6 = 0 | R1 | 0 | 0 */
+ wsbh t7, t7 /* t7 = 0 | 0 | G1 | B1 */
+ or t7, t6, t7 /* t7 = 0 | R1 | G1 | B1 */
+11: /* t7 carries the first pixel of each group into the loop */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 3(a1) /* t0 = R3 | B2 | G2 | R2 */
+ lw t1, 7(a1) /* t1 = G4 | R4 | B3 | G3 */
+ lw t2, 11(a1) /* t2 = B5 | G5 | R5 | B4 (NOTE(review): 3 of these bytes lie past this group of 4) */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = B2 | R3 | R2 | G2 */
+ wsbh t1, t1 /* t1 = R4 | G4 | G3 | B3 */
+ wsbh t2, t2 /* t2 = G5 | B5 | B4 | R5 */
+
+ packrl.ph t3, t1, t0 /* t3 = G3 | B3 | B2 | R3 */
+ packrl.ph t4, t2, t1 /* t4 = B4 | R5 | R4 | G4 */
+ rotr t0, t0, 24 /* t0 = R3 | R2 | G2 | B2 */
+ rotr t3, t3, 16 /* t3 = B2 | R3 | G3 | B3 */
+ rotr t4, t4, 24 /* t4 = R5 | R4 | G4 | B4 */
+ or t7, t7, t8 /* t7 = FF | R1 | G1 | B1 */
+ or t0, t0, t8 /* t0 = FF | R2 | G2 | B2 */
+ or t3, t3, t8 /* t3 = FF | R3 | G3 | B3 */
+ or t4, t4, t8 /* t4 = FF | R4 | G4 | B4 */
+
+ sw t7, 0(a0)
+ sw t0, 4(a0)
+ sw t3, 8(a0)
+ sw t4, 12(a0)
+ rotr t7, t2, 16 /* t7 = xx | R5 | G5 | B5 */
+ b 11b
+ addiu a0, a0, 16
+
+2: /* src & 3 == 2: word loads at offsets 2, 6, 10 are aligned */
+ lhu t7, 0(a1) /* t7 = 0 | 0 | G1 | R1 */
+ wsbh t7, t7 /* t7 = 0 | 0 | R1 | G1 */
+21: /* t7 carries R and G of the first pixel of each group */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 2(a1) /* t0 = B2 | G2 | R2 | B1 */
+ lw t1, 6(a1) /* t1 = R4 | B3 | G3 | R3 */
+ lw t2, 10(a1) /* t2 = G5 | R5 | B4 | G4 */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = G2 | B2 | B1 | R2 */
+ wsbh t1, t1 /* t1 = B3 | R4 | R3 | G3 */
+ wsbh t2, t2 /* t2 = R5 | G5 | G4 | B4 */
+
+ precr_sra.ph.w t7, t0, 0 /* t7 = R1 | G1 | B1 | R2 */
+ rotr t0, t0, 16 /* t0 = B1 | R2 | G2 | B2 */
+ packrl.ph t3, t2, t1 /* t3 = G4 | B4 | B3 | R4 */
+ rotr t1, t1, 24 /* t1 = R4 | R3 | G3 | B3 */
+ srl t7, t7, 8 /* t7 = 0 | R1 | G1 | B1 */
+ rotr t3, t3, 16 /* t3 = B3 | R4 | G4 | B4 */
+ or t7, t7, t8 /* t7 = FF | R1 | G1 | B1 */
+ or t0, t0, t8 /* t0 = FF | R2 | G2 | B2 */
+ or t1, t1, t8 /* t1 = FF | R3 | G3 | B3 */
+ or t3, t3, t8 /* t3 = FF | R4 | G4 | B4 */
+
+ sw t7, 0(a0)
+ sw t0, 4(a0)
+ sw t1, 8(a0)
+ sw t3, 12(a0)
+ srl t7, t2, 16 /* t7 = 0 | 0 | R5 | G5 */
+ b 21b
+ addiu a0, a0, 16
+
+3: /* src & 3 == 3: word loads at offsets 1, 5, 9 are aligned */
+ lbu t7, 0(a1) /* t7 = 0 | 0 | 0 | R1 */
+31: /* t7 carries R of the first pixel of each group in its low byte */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 1(a1) /* t0 = G2 | R2 | B1 | G1 */
+ lw t1, 5(a1) /* t1 = B3 | G3 | R3 | B2 */
+ lw t2, 9(a1) /* t2 = R5 | B4 | G4 | R4 */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = R2 | G2 | G1 | B1 */
+ wsbh t1, t1 /* t1 = G3 | B3 | B2 | R3 */
+ wsbh t2, t2 /* t2 = B4 | R5 | R4 | G4 */
+
+ precr_sra.ph.w t7, t0, 0 /* t7 = xx | R1 | G1 | B1 */
+ packrl.ph t3, t1, t0 /* t3 = B2 | R3 | R2 | G2 */
+ rotr t1, t1, 16 /* t1 = B2 | R3 | G3 | B3 */
+ rotr t4, t2, 24 /* t4 = R5 | R4 | G4 | B4 */
+ rotr t3, t3, 24 /* t3 = R3 | R2 | G2 | B2 */
+ or t7, t7, t8 /* t7 = FF | R1 | G1 | B1 */
+ or t3, t3, t8 /* t3 = FF | R2 | G2 | B2 */
+ or t1, t1, t8 /* t1 = FF | R3 | G3 | B3 */
+ or t4, t4, t8 /* t4 = FF | R4 | G4 | B4 */
+
+ sw t7, 0(a0)
+ sw t3, 4(a0)
+ sw t1, 8(a0)
+ sw t4, 12(a0)
+ srl t7, t2, 16 /* t7 = 0 | 0 | xx | R5 */
+ b 31b
+ addiu a0, a0, 16
+
+4: /* leftover pixels: w % 4, or all of w when w < 4 */
+ beqz a2, 6f
+ nop
+5: /* one pixel per iteration using byte loads only */
+ lbu t0, 0(a1) /* t0 = 0 | 0 | 0 | R */
+ lbu t1, 1(a1) /* t1 = 0 | 0 | 0 | G */
+ lbu t2, 2(a1) /* t2 = 0 | 0 | 0 | B */
+ addiu a1, a1, 3
+
+ sll t0, t0, 16 /* t0 = 0 | R | 0 | 0 */
+ sll t1, t1, 8 /* t1 = 0 | 0 | G | 0 */
+
+ or t2, t2, t1 /* t2 = 0 | 0 | G | B */
+ or t2, t2, t0 /* t2 = 0 | R | G | B */
+ or t2, t2, t8 /* t2 = FF | R | G | B */
+
+ sw t2, 0(a0)
+ addiu a2, a2, -1
+ bnez a2, 5b
+ addiu a0, a0, 4
+6: /* common exit */
+ j ra
+ nop
+
+END(pixman_composite_src_0888_8888_rev_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
+/* SRC fast path: convert w packed 3-byte src pixels to 16-bit dst pixels via the CONVERT_* macros (defined elsewhere).
+ * a0 - dst (r5g6b5)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, v0, v1
+ beqz a2, 6f /* w == 0: skip to the register restore */
+ nop
+
+ li t6, 0xf800f800 /* t6, t7, t8: constants passed to CONVERT_2x8888_TO_2x0565 */
+ li t7, 0x07e007e0 /* bit patterns of the 5-, 6- and 5-bit fields, in both halfwords */
+ li t8, 0x001F001F
+ srl t9, a2, 2 /* t9 = how many multiples of 4 src pixels */
+ beqz t9, 4f /* branch if less than 4 src pixels */
+ nop
+
+ li t0, 0x1
+ li t1, 0x2
+ li t2, 0x3
+ andi t3, a1, 0x3 /* t3 = src address modulo 4, selects the loop variant */
+ beq t3, t0, 1f
+ nop
+ beq t3, t1, 2f
+ nop
+ beq t3, t2, 3f
+ nop
+
+0: /* src & 3 == 0: three aligned words hold exactly 4 pixels */
+ beqz t9, 4f /* no full group of 4 left: finish pixel by pixel */
+ addiu t9, t9, -1 /* follows the branch; presumably its delay slot (noreorder assumed, not visible here) */
+ lw t0, 0(a1) /* t0 = R2 | B1 | G1 | R1 */
+ lw t1, 4(a1) /* t1 = G3 | R3 | B2 | G2 */
+ lw t2, 8(a1) /* t2 = B4 | G4 | R4 | B3 */
+
+ addiu a1, a1, 12 /* 4 src pixels (12 bytes) consumed */
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = B1 | R2 | R1 | G1 */
+ wsbh t1, t1 /* t1 = R3 | G3 | G2 | B2 */
+ wsbh t2, t2 /* t2 = G4 | B4 | B3 | R4 */
+
+ packrl.ph t3, t1, t0 /* t3 = G2 | B2 | B1 | R2 */
+ packrl.ph t4, t0, t0 /* t4 = R1 | G1 | B1 | R2 */
+ rotr t3, t3, 16 /* t3 = B1 | R2 | G2 | B2 */
+ srl t4, t4, 8 /* t4 = 0 | R1 | G1 | B1 */
+ packrl.ph t5, t2, t1 /* t5 = B3 | R4 | R3 | G3 */
+ rotr t5, t5, 24 /* t5 = R4 | R3 | G3 | B3 */
+ rotr t2, t2, 16 /* t2 = B3 | R4 | G4 | B4 */
+
+ CONVERT_2x8888_TO_2x0565 t4, t3, t4, t3, t6, t7, t8, v0, v1
+ CONVERT_2x8888_TO_2x0565 t5, t2, t5, t2, t6, t7, t8, v0, v1
+
+ sh t4, 0(a0) /* pixel 1 */
+ sh t3, 2(a0) /* pixel 2 */
+ sh t5, 4(a0) /* pixel 3 */
+ sh t2, 6(a0) /* pixel 4 */
+ b 0b
+ addiu a0, a0, 8 /* 4 dst pixels (8 bytes) written */
+
+1: /* src & 3 == 1: word loads at offsets 3, 7, 11 are aligned */
+ lbu t4, 0(a1) /* t4 = 0 | 0 | 0 | R1 */
+ lhu t5, 1(a1) /* t5 = 0 | 0 | B1 | G1 */
+ sll t4, t4, 16 /* t4 = 0 | R1 | 0 | 0 */
+ wsbh t5, t5 /* t5 = 0 | 0 | G1 | B1 */
+ or t5, t4, t5 /* t5 = 0 | R1 | G1 | B1 */
+11: /* t5 carries the first pixel of each group into the loop */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 3(a1) /* t0 = R3 | B2 | G2 | R2 */
+ lw t1, 7(a1) /* t1 = G4 | R4 | B3 | G3 */
+ lw t2, 11(a1) /* t2 = B5 | G5 | R5 | B4 (NOTE(review): 3 of these bytes lie past this group of 4) */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = B2 | R3 | R2 | G2 */
+ wsbh t1, t1 /* t1 = R4 | G4 | G3 | B3 */
+ wsbh t2, t2 /* t2 = G5 | B5 | B4 | R5 */
+
+ packrl.ph t3, t1, t0 /* t3 = G3 | B3 | B2 | R3 */
+ packrl.ph t4, t2, t1 /* t4 = B4 | R5 | R4 | G4 */
+ rotr t0, t0, 24 /* t0 = R3 | R2 | G2 | B2 */
+ rotr t3, t3, 16 /* t3 = B2 | R3 | G3 | B3 */
+ rotr t4, t4, 24 /* t4 = R5 | R4 | G4 | B4 */
+
+ CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+ CONVERT_2x8888_TO_2x0565 t3, t4, t3, t4, t6, t7, t8, v0, v1
+
+ sh t5, 0(a0)
+ sh t0, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ rotr t5, t2, 16 /* t5 = xx | R5 | G5 | B5 */
+ b 11b
+ addiu a0, a0, 8
+
+2: /* src & 3 == 2: word loads at offsets 2, 6, 10 are aligned */
+ lhu t5, 0(a1) /* t5 = 0 | 0 | G1 | R1 */
+ wsbh t5, t5 /* t5 = 0 | 0 | R1 | G1 */
+21: /* t5 carries R and G of the first pixel of each group */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 2(a1) /* t0 = B2 | G2 | R2 | B1 */
+ lw t1, 6(a1) /* t1 = R4 | B3 | G3 | R3 */
+ lw t2, 10(a1) /* t2 = G5 | R5 | B4 | G4 */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = G2 | B2 | B1 | R2 */
+ wsbh t1, t1 /* t1 = B3 | R4 | R3 | G3 */
+ wsbh t2, t2 /* t2 = R5 | G5 | G4 | B4 */
+
+ precr_sra.ph.w t5, t0, 0 /* t5 = R1 | G1 | B1 | R2 */
+ rotr t0, t0, 16 /* t0 = B1 | R2 | G2 | B2 */
+ packrl.ph t3, t2, t1 /* t3 = G4 | B4 | B3 | R4 */
+ rotr t1, t1, 24 /* t1 = R4 | R3 | G3 | B3 */
+ srl t5, t5, 8 /* t5 = 0 | R1 | G1 | B1 */
+ rotr t3, t3, 16 /* t3 = B3 | R4 | G4 | B4 */
+
+ CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+ CONVERT_2x8888_TO_2x0565 t1, t3, t1, t3, t6, t7, t8, v0, v1
+
+ sh t5, 0(a0)
+ sh t0, 2(a0)
+ sh t1, 4(a0)
+ sh t3, 6(a0)
+ srl t5, t2, 16 /* t5 = 0 | 0 | R5 | G5 */
+ b 21b
+ addiu a0, a0, 8
+
+3: /* src & 3 == 3: word loads at offsets 1, 5, 9 are aligned */
+ lbu t5, 0(a1) /* t5 = 0 | 0 | 0 | R1 */
+31: /* t5 carries R of the first pixel of each group in its low byte */
+ beqz t9, 4f /* no full group of 4 left */
+ addiu t9, t9, -1
+ lw t0, 1(a1) /* t0 = G2 | R2 | B1 | G1 */
+ lw t1, 5(a1) /* t1 = B3 | G3 | R3 | B2 */
+ lw t2, 9(a1) /* t2 = R5 | B4 | G4 | R4 */
+
+ addiu a1, a1, 12
+ addiu a2, a2, -4
+
+ wsbh t0, t0 /* t0 = R2 | G2 | G1 | B1 */
+ wsbh t1, t1 /* t1 = G3 | B3 | B2 | R3 */
+ wsbh t2, t2 /* t2 = B4 | R5 | R4 | G4 */
+
+ precr_sra.ph.w t5, t0, 0 /* t5 = xx | R1 | G1 | B1 */
+ packrl.ph t3, t1, t0 /* t3 = B2 | R3 | R2 | G2 */
+ rotr t1, t1, 16 /* t1 = B2 | R3 | G3 | B3 */
+ rotr t4, t2, 24 /* t4 = R5 | R4 | G4 | B4 */
+ rotr t3, t3, 24 /* t3 = R3 | R2 | G2 | B2 */
+
+ CONVERT_2x8888_TO_2x0565 t5, t3, t5, t3, t6, t7, t8, v0, v1
+ CONVERT_2x8888_TO_2x0565 t1, t4, t1, t4, t6, t7, t8, v0, v1
+
+ sh t5, 0(a0)
+ sh t3, 2(a0)
+ sh t1, 4(a0)
+ sh t4, 6(a0)
+ srl t5, t2, 16 /* t5 = 0 | 0 | xx | R5 */
+ b 31b
+ addiu a0, a0, 8
+
+4: /* leftover pixels: w % 4, or all of w when w < 4 */
+ beqz a2, 6f
+ nop
+5: /* one pixel per iteration using byte loads only */
+ lbu t0, 0(a1) /* t0 = 0 | 0 | 0 | R */
+ lbu t1, 1(a1) /* t1 = 0 | 0 | 0 | G */
+ lbu t2, 2(a1) /* t2 = 0 | 0 | 0 | B */
+ addiu a1, a1, 3
+
+ sll t0, t0, 16 /* t0 = 0 | R | 0 | 0 */
+ sll t1, t1, 8 /* t1 = 0 | 0 | G | 0 */
+
+ or t2, t2, t1 /* t2 = 0 | 0 | G | B */
+ or t2, t2, t0 /* t2 = 0 | R | G | B */
+
+ CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+ sh t3, 0(a0) /* t3 is the value stored as the 16-bit dst pixel */
+ addiu a2, a2, -1
+ bnez a2, 5b
+ addiu a0, a0, 2
+6: /* common exit: undo SAVE_REGS_ON_STACK and return */
+ RESTORE_REGS_FROM_STACK 0, v0, v1
+ j ra
+ nop
+
+END(pixman_composite_src_0888_0565_rev_asm_mips)
+#endif
+
LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
/*
* a0 - dst (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index cdc71cd..1ea2445 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -48,6 +48,12 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
uint8_t, 3, uint8_t, 3)
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
+ uint8_t, 3, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
+ uint8_t, 3, uint16_t, 1)
+#endif
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -282,6 +288,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, mips_composite_src_0888_8888_rev),
+ PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, mips_composite_src_0888_0565_rev),
+#endif
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mips_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mips_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mips_composite_src_n_8_8888),
commit 43914d68d1c87a9da6f53e6b0a12941c97bb0e5d
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date: Wed Feb 27 14:39:45 2013 +0100
MIPS: DSPr2: Added more fast-paths for OVER operation:
- over_8888_0565
- over_n_8_8
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
over_8888_0565 = L1: 14.30 L2: 13.22 M: 10.43 ( 41.56%) HT: 12.51 VT: 12.95 R: 11.82 RT: 7.34 ( 49Kops/s)
over_n_8_8 = L1: 12.77 L2: 16.93 M: 15.03 ( 29.94%) HT: 10.78 VT: 10.72 R: 10.29 RT: 4.92 ( 33Kops/s)
Optimized:
over_8888_0565 = L1: 26.03 L2: 22.92 M: 15.68 ( 62.43%) HT: 16.19 VT: 16.27 R: 14.93 RT: 8.60 ( 52Kops/s)
over_n_8_8 = L1: 62.00 L2: 55.17 M: 40.29 ( 80.23%) HT: 26.77 VT: 25.64 R: 24.13 RT: 10.01 ( 47Kops/s)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index ddfacef..299f739 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -658,6 +658,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
END(pixman_composite_over_n_8888_0565_ca_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8_asm_mips)
+/* OVER of a constant src through an 8-bit mask onto an 8-bit dst; only the top byte of a1 is used.
+ * a0 - dst (a8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, v0
+ li t9, 0x00ff00ff /* keeps the low byte of each halfword */
+ beqz a3, 3f /* w == 0: skip to the register restore */
+ nop
+ srl v0, a3, 2 /* v0 = how many multiples of 4 dst pixels */
+ beqz v0, 1f /* branch if fewer than 4 pixels */
+ nop
+
+ srl t8, a1, 24 /* t8 = top byte of the constant src */
+ replv.ph t8, t8 /* same value replicated into both halfwords */
+
+0: /* 4 pixels per iteration */
+ beqz v0, 1f /* no full group of 4 left */
+ addiu v0, v0, -1 /* follows the branch; presumably its delay slot (noreorder assumed, not visible here) */
+ lbu t0, 0(a2) /* t0..t3 = mask bytes m0..m3 */
+ lbu t1, 1(a2)
+ lbu t2, 2(a2)
+ lbu t3, 3(a2)
+ lbu t4, 0(a0) /* t4..t7 = dst bytes d0..d3 */
+ lbu t5, 1(a0)
+ lbu t6, 2(a0)
+ lbu t7, 3(a0)
+
+ addiu a2, a2, 4
+
+ precr_sra.ph.w t1, t0, 0 /* t1 = 0 | m1 | 0 | m0 */
+ precr_sra.ph.w t3, t2, 0 /* t3 = 0 | m3 | 0 | m2 */
+ precr_sra.ph.w t5, t4, 0 /* t5 = 0 | d1 | 0 | d0 */
+ precr_sra.ph.w t7, t6, 0 /* t7 = 0 | d3 | 0 | d2 */
+
+ precr.qb.ph t0, t3, t1 /* t0 = m3 | m2 | m1 | m0 */
+ precr.qb.ph t1, t7, t5 /* t1 = d3 | d2 | d1 | d0 */
+
+ muleu_s.ph.qbl t2, t0, t8 /* t2 = 16-bit products of m3, m2 with the src byte */
+ muleu_s.ph.qbr t3, t0, t8 /* t3 = 16-bit products of m1, m0 with the src byte */
+ shra_r.ph t4, t2, 8 /* shift/mask/add/shift: scale products back to 8 bits */
+ shra_r.ph t5, t3, 8 /* (appears to approximate a rounded divide by 255) */
+ and t4, t4, t9
+ and t5, t5, t9
+ addq.ph t2, t2, t4
+ addq.ph t3, t3, t5
+ shra_r.ph t2, t2, 8
+ shra_r.ph t3, t3, 8
+ precr.qb.ph t0, t2, t3 /* t0 = four scaled (src * mask) bytes, pixel 3 in the top byte */
+ not t6, t0 /* per byte: 255 - (src * mask) */
+
+ preceu.ph.qbl t7, t6 /* t7 = upper two bytes widened to halfwords */
+ preceu.ph.qbr t6, t6 /* t6 = lower two bytes widened to halfwords */
+
+ muleu_s.ph.qbl t2, t1, t7 /* dst bytes 3, 2 times their inverted values */
+ muleu_s.ph.qbr t3, t1, t6 /* dst bytes 1, 0 times their inverted values */
+ shra_r.ph t4, t2, 8 /* same 16-bit to 8-bit scaling sequence as above */
+ shra_r.ph t5, t3, 8
+ and t4, t4, t9
+ and t5, t5, t9
+ addq.ph t2, t2, t4
+ addq.ph t3, t3, t5
+ shra_r.ph t2, t2, 8
+ shra_r.ph t3, t3, 8
+ precr.qb.ph t1, t2, t3 /* t1 = four scaled dst bytes */
+
+ addu_s.qb t2, t0, t1 /* per-byte saturating sum = result pixels */
+
+ sb t2, 0(a0) /* store the 4 result bytes one at a time, lowest first */
+ srl t2, t2, 8
+ sb t2, 1(a0)
+ srl t2, t2, 8
+ sb t2, 2(a0)
+ srl t2, t2, 8
+ sb t2, 3(a0)
+ addiu a3, a3, -4
+ b 0b
+ addiu a0, a0, 4
+
+1: /* leftover pixels: w % 4, or all of w when w < 4 */
+ beqz a3, 3f
+ nop
+ srl t8, a1, 24 /* reload the src byte unreplicated (t8 is not set yet when w < 4) */
+2: /* one pixel per iteration */
+ lbu t0, 0(a2) /* t0 = mask byte */
+ lbu t1, 0(a0) /* t1 = dst byte */
+ addiu a2, a2, 1
+
+ mul t2, t0, t8 /* t2 = mask * src byte */
+ shra_r.ph t3, t2, 8 /* same scaling sequence as the 4-pixel loop, on one value */
+ andi t3, t3, 0x00ff
+ addq.ph t2, t2, t3
+ shra_r.ph t2, t2, 8
+ not t3, t2
+ andi t3, t3, 0x00ff /* t3 = 255 - low byte of scaled (src * mask) */
+
+
+ mul t4, t1, t3 /* t4 = dst * t3 */
+ shra_r.ph t5, t4, 8
+ andi t5, t5, 0x00ff
+ addq.ph t4, t4, t5
+ shra_r.ph t4, t4, 8
+ andi t4, t4, 0x00ff
+
+ addu_s.qb t2, t2, t4 /* per-byte saturating sum; only the low byte is stored */
+ sb t2, 0(a0)
+ addiu a3, a3, -1
+ bnez a3, 2b
+ addiu a0, a0, 1
+
+3: /* common exit: undo SAVE_REGS_ON_STACK and return */
+ RESTORE_REGS_FROM_STACK 0, v0
+ j ra
+ nop
+
+END(pixman_composite_over_n_8_8_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
/*
* a0 - dst (a8r8g8b8)
@@ -1342,6 +1462,84 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
END(pixman_composite_over_8888_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_0565_asm_mips)
+/* OVER of 32-bit src pixels onto 16-bit dst pixels, two per iteration plus an optional final one.
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+ SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+ li t4, 0x00ff00ff /* constant passed to the MIPS_*UN8x4_MUL_*UN8 macros */
+ li s3, 0xf800f800 /* s3, s4, s5: constants passed to the CONVERT_2x* macros */
+ li s4, 0x07e007e0 /* bit patterns of the 5-, 6- and 5-bit fields, in both halfwords */
+ li s5, 0x001F001F
+ beqz a2, 3f /* w == 0: skip to the register restore */
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: only the single-pixel path runs */
+ nop
+1: /* two pixels per iteration */
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 4(a1) /* t1 = source (a8r8g8b8) */
+ lhu t2, 0(a0) /* t2 = destination (r5g6b5) */
+ lhu t3, 2(a0) /* t3 = destination (r5g6b5) */
+ addiu a1, a1, 8
+
+ not t5, t0
+ srl t5, t5, 24 /* t5 = 255 - top byte of first src pixel */
+ not t6, t1
+ srl t6, t6, 24 /* t6 = 255 - top byte of second src pixel */
+
+ or t7, t5, t6
+ beqz t7, 11f /* both top bytes are 0xff: store the converted src without blending */
+ or t8, t0, t1
+ beqz t8, 12f /* both src words are zero: leave dst untouched. NOTE(review): next line is a macro, so its first instruction follows this branch */
+
+ CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, s4, s5, t7, t8, t9, s2
+ MIPS_2xUN8x4_MUL_2xUN8 s0, s1, t5, t6, t7, t8, t4, t9, t2, t3, s2, s0, s1
+
+ addu_s.qb t0, t7, t0 /* per-byte saturating add of t7 into the first src pixel */
+ addu_s.qb t1, t8, t1 /* same for the second pixel, using t8 */
+11: /* convert t0, t1 and store them as the two dst pixels */
+ CONVERT_2x8888_TO_2x0565 t0, t1, t7, t8, s3, s4, s5, t2, t3
+ sh t7, 0(a0)
+ sh t8, 2(a0)
+12: /* advance; loop while at least 2 pixels remain */
+ addiu a2, a2, -2
+ addiu t1, a2, -1
+ bgtz t1, 1b
+ addiu a0, a0, 4
+2: /* at most one pixel left */
+ beqz a2, 3f
+ nop
+
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lhu t1, 0(a0) /* t1 = destination (r5g6b5) */
+ addiu a1, a1, 4
+
+ not t2, t0
+ srl t2, t2, 24 /* t2 = 255 - top byte of the src pixel */
+
+ beqz t2, 21f /* top byte is 0xff: store the converted src without blending */
+ nop
+ beqz t0, 3f /* src word is zero: leave dst untouched. NOTE(review): next line is a macro, so its first instruction follows this branch */
+
+ CONVERT_1x0565_TO_1x8888 t1, s0, t8, t9
+ MIPS_UN8x4_MUL_UN8 s0, t2, t3, t4, t5, t6, t7
+
+ addu_s.qb t0, t3, t0 /* per-byte saturating add of t3 into the src pixel */
+21: /* convert t0 and store s0 as the dst pixel */
+ CONVERT_1x8888_TO_1x0565 t0, s0, t8, t9
+ sh s0, 0(a0)
+
+3: /* common exit: undo SAVE_REGS_ON_STACK and return */
+ RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+ j ra
+ nop
+
+END(pixman_composite_over_8888_0565_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips)
/*
* a0 - dst (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e14e1c4..cdc71cd 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -50,6 +50,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
uint8_t, 3, uint8_t, 3)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
+ uint32_t, 1, uint16_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
@@ -67,6 +69,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
@@ -290,6 +294,7 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, mips_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888),
@@ -318,6 +323,8 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mips_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mips_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mips_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mips_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mips_composite_over_8888_0565),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mips_composite_add_n_8_8),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, mips_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, mips_composite_add_n_8_8888),
More information about the xorg-commit
mailing list