[Pixman] [PATCH] MIPS: DSPr2: Added over_n_8888_8888_ca and over_n_8888_0565_ca fast paths.
Nemanja Lukic
nlukic at mips.com
Sun Mar 11 10:52:25 PDT 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz
Referent (before):
lowlevel-blt-bench:
over_n_8888_8888_ca = L1: 8.32 L2: 7.65 M: 6.38 ( 51.08%) HT: 5.78 VT: 5.74 R: 5.84 RT: 4.39 ( 37Kops/s)
over_n_8888_0565_ca = L1: 7.40 L2: 6.95 M: 6.16 ( 41.06%) HT: 5.72 VT: 5.52 R: 5.63 RT: 4.28 ( 36Kops/s)
cairo-perf-trace:
[ # ] backend test min(s) median(s) stddev. count
[ # ] image: pixman 0.25.3
[ 0] image xfce4-terminal-a1 138.223 139.070 0.33% 6/6
[ # ] image16: pixman 0.25.3
[ 0] image16 xfce4-terminal-a1 132.763 132.939 0.06% 5/6
Optimized:
lowlevel-blt-bench:
over_n_8888_8888_ca = L1: 19.35 L2: 23.84 M: 13.68 (109.39%) HT: 11.39 VT: 11.19 R: 11.27 RT: 6.90 ( 47Kops/s)
over_n_8888_0565_ca = L1: 18.68 L2: 17.00 M: 12.56 ( 83.70%) HT: 10.72 VT: 10.45 R: 10.43 RT: 5.79 ( 43Kops/s)
cairo-perf-trace:
[ # ] backend test min(s) median(s) stddev. count
[ # ] image: pixman 0.25.3
[ 0] image xfce4-terminal-a1 130.400 131.720 0.46% 6/6
[ # ] image16: pixman 0.25.3
[ 0] image16 xfce4-terminal-a1 125.830 126.604 0.34% 6/6
---
pixman/pixman-mips-dspr2-asm.S | 219 +++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2-asm.h | 296 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2.c | 12 ++
pixman/pixman-mips-dspr2.h | 42 ++++++
4 files changed, 569 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index f1087a7..6a0fc18 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -308,3 +308,222 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
nop
END(pixman_composite_src_x888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
+ beqz a3, 4f
+ nop
+ li t6, 0xff
+ addiu t7, zero, -1 /* t7 = 0xffffffff */
+ srl t8, a1, 24 /* t8 = srca */
+ li t9, 0x00ff00ff
+ addiu t1, a3, -1
+ beqz t1, 3f /* last pixel */
+ nop
+ beq t8, t6, 2f /* if (srca == 0xff) */
+ nop
+1:
+ /* a1 = src */
+ lw t0, 0(a2) /* t0 = mask */
+ lw t1, 4(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 12f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 8
+ and t3, t0, t1
+ move s0, t8 /* s0 = srca */
+ move s1, t8 /* s1 = srca */
+ move t4, a1 /* t4 = src */
+ move t5, a1 /* t5 = src */
+ lw t2, 0(a0) /* t2 = dst */
+ beq t3, t7, 11f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+ lw t3, 4(a0) /* t0 = dst */
+ MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+ MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t8, t8, s0, s1, t9, s2, s3, s4, s5, s6, s7
+11:
+ not s0, s0
+ not s1, s1
+ MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, s0, s1, s2, s3, t9, t0, t1, s4, s5, s6, s7
+ addu_s.qb t0, t4, s2
+ addu_s.qb t1, t5, s3
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+12:
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+ b 3f
+ nop
+2:
+ /* a1 = src */
+ lw t0, 0(a2) /* t0 = mask */
+ lw t1, 4(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 22f /* if (t0 == 0) & (t1 == 0) */
+ addiu a2, a2, 8
+ and t2, t0, t1
+ move s0, a1
+ beq t2, t7, 21f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+ move s1, a1
+ lw t2, 0(a0) /* t2 = dst */
+ lw t3, 4(a0) /* t3 = dst */
+ MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+ not t0, t0
+ not t1, t1
+ MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, s0, s1, t9, s2, s3, s4, s5, s6, s7
+ addu_s.qb s0, t4, s0
+ addu_s.qb s1, t5, s1
+21:
+ sw s0, 0(a0)
+ sw s1, 4(a0)
+22:
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 2b
+ addiu a0, a0, 8
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lw t1, 0(a2) /* t1 = mask */
+ beqz t1, 4f
+ nop
+ move s0, t8 /* s0 = srca */
+ move t2, a1 /* t2 = src */
+ beq t1, t7, 31f
+ lw t0, 0(a0) /* t0 = dst */
+
+ MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
+ MIPS_UN8x4_MUL_UN8 t1, t8, s0, t9, t3, t4, t5
+31:
+ not s0, s0
+ MIPS_UN8x4_MUL_UN8x4 t0, s0, t3, t9, t4, t5, t6, t1
+ addu_s.qb t0, t2, t3
+ sw t0, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
+ j ra
+ nop
+
+END(pixman_composite_over_n_8888_8888_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ beqz a3, 4f
+ nop
+ li t5, 0xf800f800
+ li t6, 0x07e007e0
+ li t7, 0x001F001F
+ li t9, 0x00ff00ff
+
+ srl t8, a1, 24 /* t8 = srca */
+ addiu t1, a3, -1
+ beqz t1, 3f /* last pixel */
+ nop
+ li s0, 0xff /* s0 = 0xff */
+ addiu s1, zero, -1 /* s1 = 0xffffffff */
+
+ beq t8, s0, 2f /* if (srca == 0xff) */
+ nop
+1:
+ /* a1 = src */
+ lw t0, 0(a2) /* t0 = mask */
+ lw t1, 4(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 12f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 8
+ and t3, t0, t1
+ move t0, t8
+ move t1, a1
+ lhu t2, 0(a0) /* t2 = dst */
+ beq t3, s1, 11f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+ lhu t3, 2(a0) /* t3 = dst */
+ MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
+ MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
+11:
+ not t0, t0
+ not t1, t1
+ CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
+ MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
+ addu_s.qb s2, s2, s4
+ addu_s.qb s3, s3, s5
+ CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s1, s2
+ sh t2, 0(a0)
+ sh t3, 2(a0)
+12:
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 4
+ b 3f
+ nop
+2:
+ /* a1 = src */
+ lw t0, 0(a2) /* t0 = mask */
+ lw t1, 4(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 22f /* if (t0 == 0) & (t1 == 0) */
+ addiu a2, a2, 8
+ and t3, t0, t1
+ move t2, a1
+ beq t3, s1, 21f /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+ move t3, a1
+ lhu t2, 0(a0) /* t2 = dst */
+ lhu t3, 2(a0) /* t3 = dst */
+ MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
+ not t0, t0
+ not t1, t1
+ CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
+ MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
+ addu_s.qb t2, s2, s4
+ addu_s.qb t3, s3, s5
+21:
+ CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
+ sh t0, 0(a0)
+ sh t1, 2(a0)
+22:
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 2b
+ addiu a0, a0, 4
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lw t1, 0(a2) /* t1 = mask */
+ beqz t1, 4f
+ nop
+ move s0, t8 /* s0 = srca */
+ move t2, a1 /* t2 = src */
+ beq t1, t7, 31f
+ lhu t0, 0(a0) /* t0 = dst */
+
+ MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
+ MIPS_UN8x4_MUL_UN8 t1, t8, s0, t9, t3, t4, t5
+31:
+ not s0, s0
+ CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
+ MIPS_UN8x4_MUL_UN8x4 s1, s0, t3, t9, t4, t5, t6, t1
+ addu_s.qb t0, t2, t3
+ CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
+ sh s1, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ j ra
+ nop
+
+END(pixman_composite_over_n_8888_0565_ca_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index e07cda4..12ff42c 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -96,6 +96,170 @@ LEAF_MIPS32R2(symbol) \
.size function,.-function
/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+ .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+ .endif
+ .if \stack_offset != 0
+ addiu sp, sp, -\stack_offset
+ .endif
+ sw \r1, 0(sp)
+ .if \r2 != 0
+ sw \r2, 4(sp)
+ .endif
+ .if \r3 != 0
+ sw \r3, 8(sp)
+ .endif
+ .if \r4 != 0
+ sw \r4, 12(sp)
+ .endif
+ .if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ sw \r5, 16(sp)
+ .endif
+ .if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+ .endif
+ .if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+ .endif
+ .if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+ .endif
+ .if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+ .endif
+ .if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+ .endif
+ .if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+ .endif
+ .if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+ .endif
+ .if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+ .endif
+ .if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+ .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+ .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+ .error "Stack offset must be pozitive and multiple of 4."
+ .endif
+ lw \r1, 0(sp)
+ .if \r2 != 0
+ lw \r2, 4(sp)
+ .endif
+ .if \r3 != 0
+ lw \r3, 8(sp)
+ .endif
+ .if \r4 != 0
+ lw \r4, 12(sp)
+ .endif
+ .if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ lw \r5, 16(sp)
+ .endif
+ .if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+ .endif
+ .if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+ .endif
+ .if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+ .endif
+ .if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+ .endif
+ .if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+ .endif
+ .if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+ .endif
+ .if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+ .endif
+ .if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+ .endif
+ .if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+ .endif
+ .if \stack_offset != 0
+ addiu sp, sp, \stack_offset
+ .endif
+.endm
+
+/*
* Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel
* returned in (out_8888) register. Requires two temporary registers
* (scratch1 and scratch2).
@@ -203,4 +367,136 @@ LEAF_MIPS32R2(symbol) \
srl \out2_565, \out1_565, 16
.endm
+/*
+ * Multiply pixel (a8) with single pixel (a8r8g8b8). It requires maskLSR needed
+ * for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8 s_8888, \
+ m_8, \
+ d_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3
+ replv.ph \m_8, \m_8 /* 0 | M | 0 | M */
+ muleu_s.ph.qbl \scratch1, \s_8888, \m_8 /* A*M | R*M */
+ muleu_s.ph.qbr \scratch2, \s_8888, \m_8 /* G*M | B*M */
+ shra_r.ph \scratch3, \scratch1, 8
+ shra_r.ph \d_8888, \scratch2, 8
+ and \scratch3, \scratch3, \maskLSR /* 0 |A*M| 0 |R*M */
+ and \d_8888, \d_8888, \maskLSR /* 0 |G*M| 0 |B*M */
+ addq.ph \scratch1, \scratch1, \scratch3 /* A*M+A*M | R*M+R*M */
+ addq.ph \scratch2, \scratch2, \d_8888 /* G*M+G*M | B*M+B*M */
+ shra_r.ph \scratch1, \scratch1, 8
+ shra_r.ph \scratch2, \scratch2, 8
+ precr.qb.ph \d_8888, \scratch1, \scratch2
+.endm
+
+/*
+ * Multiply two pixels (a8) with two pixels (a8r8g8b8). It requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \
+ s2_8888, \
+ m1_8, \
+ m2_8, \
+ d1_8888, \
+ d2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ replv.ph \m1_8, \m1_8 /* 0 | M1 | 0 | M1 */
+ replv.ph \m2_8, \m2_8 /* 0 | M2 | 0 | M2 */
+ muleu_s.ph.qbl \scratch1, \s1_8888, \m1_8 /* A1*M1 | R1*M1 */
+ muleu_s.ph.qbr \scratch2, \s1_8888, \m1_8 /* G1*M1 | B1*M1 */
+ muleu_s.ph.qbl \scratch3, \s2_8888, \m2_8 /* A2*M2 | R2*M2 */
+ muleu_s.ph.qbr \scratch4, \s2_8888, \m2_8 /* G2*M2 | B2*M2 */
+ shra_r.ph \scratch5, \scratch1, 8
+ shra_r.ph \d1_8888, \scratch2, 8
+ shra_r.ph \scratch6, \scratch3, 8
+ shra_r.ph \d2_8888, \scratch4, 8
+ and \scratch5, \scratch5, \maskLSR /* 0 |A1*M1| 0 |R1*M1 */
+ and \d1_8888, \d1_8888, \maskLSR /* 0 |G1*M1| 0 |B1*M1 */
+ and \scratch6, \scratch6, \maskLSR /* 0 |A2*M2| 0 |R2*M2 */
+ and \d2_8888, \d2_8888, \maskLSR /* 0 |G2*M2| 0 |B2*M2 */
+ addq.ph \scratch1, \scratch1, \scratch5
+ addq.ph \scratch2, \scratch2, \d1_8888
+ addq.ph \scratch3, \scratch3, \scratch6
+ addq.ph \scratch4, \scratch4, \d2_8888
+ shra_r.ph \scratch1, \scratch1, 8
+ shra_r.ph \scratch2, \scratch2, 8
+ shra_r.ph \scratch3, \scratch3, 8
+ shra_r.ph \scratch4, \scratch4, 8
+ precr.qb.ph \d1_8888, \scratch1, \scratch2
+ precr.qb.ph \d2_8888, \scratch3, \scratch4
+.endm
+
+/*
+ * Multiply pixel (a8r8g8b8) with single pixel (a8r8g8b8). It requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8x4 s_8888, \
+ m_8888, \
+ d_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, scratch4
+ preceu.ph.qbl \scratch1, \m_8888 /* 0 | A | 0 | R */
+ preceu.ph.qbr \scratch2, \m_8888 /* 0 | G | 0 | B */
+ muleu_s.ph.qbl \scratch3, \s_8888, \scratch1 /* A*A | R*R */
+ muleu_s.ph.qbr \scratch4, \s_8888, \scratch2 /* G*G | B*B */
+ shra_r.ph \scratch1, \scratch3, 8
+ shra_r.ph \scratch2, \scratch4, 8
+ and \scratch1, \scratch1, \maskLSR /* 0 |A*A| 0 |R*R */
+ and \scratch2, \scratch2, \maskLSR /* 0 |G*G| 0 |B*B */
+ addq.ph \scratch1, \scratch1, \scratch3
+ addq.ph \scratch2, \scratch2, \scratch4
+ shra_r.ph \scratch1, \scratch1, 8
+ shra_r.ph \scratch2, \scratch2, 8
+ precr.qb.ph \d_8888, \scratch1, \scratch2
+.endm
+
+/*
+ * Multiply two pixels (a8r8g8b8) with two pixels (a8r8g8b8). It requires
+ * maskLSR needed for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+
+.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888, \
+ s2_8888, \
+ m1_8888, \
+ m2_8888, \
+ d1_8888, \
+ d2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ preceu.ph.qbl \scratch1, \m1_8888 /* 0 | A | 0 | R */
+ preceu.ph.qbr \scratch2, \m1_8888 /* 0 | G | 0 | B */
+ preceu.ph.qbl \scratch3, \m2_8888 /* 0 | A | 0 | R */
+ preceu.ph.qbr \scratch4, \m2_8888 /* 0 | G | 0 | B */
+ muleu_s.ph.qbl \scratch5, \s1_8888, \scratch1 /* A*A | R*R */
+ muleu_s.ph.qbr \scratch6, \s1_8888, \scratch2 /* G*G | B*B */
+ muleu_s.ph.qbl \scratch1, \s2_8888, \scratch3 /* A*A | R*R */
+ muleu_s.ph.qbr \scratch2, \s2_8888, \scratch4 /* G*G | B*B */
+ shra_r.ph \scratch3, \scratch5, 8
+ shra_r.ph \scratch4, \scratch6, 8
+ shra_r.ph \d1_8888, \scratch1, 8
+ shra_r.ph \d2_8888, \scratch2, 8
+ and \scratch3, \scratch3, \maskLSR /* 0 |A*A| 0 |R*R */
+ and \scratch4, \scratch4, \maskLSR /* 0 |G*G| 0 |B*B */
+ and \d1_8888, \d1_8888, \maskLSR /* 0 |A*A| 0 |R*R */
+ and \d2_8888, \d2_8888, \maskLSR /* 0 |G*G| 0 |B*B */
+ addq.ph \scratch3, \scratch3, \scratch5
+ addq.ph \scratch4, \scratch4, \scratch6
+ addq.ph \d1_8888, \d1_8888, \scratch1
+ addq.ph \d2_8888, \d2_8888, \scratch2
+ shra_r.ph \scratch3, \scratch3, 8
+ shra_r.ph \scratch4, \scratch4, 8
+ shra_r.ph \scratch5, \d1_8888, 8
+ shra_r.ph \scratch6, \d2_8888, 8
+ precr.qb.ph \d1_8888, \scratch3, \scratch4
+ precr.qb.ph \d2_8888, \scratch5, \scratch6
+.endm
+
#endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 2beada3..018770a 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -49,6 +49,11 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
uint8_t, 3, uint8_t, 3)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
+ uint32_t, 1, uint16_t, 1)
+
static pixman_bool_t
pixman_fill_mips (uint32_t *bits,
int stride,
@@ -184,6 +189,13 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index a40e7c8..cc35d02 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -85,4 +85,46 @@ mips_composite_##name (pixman_implementation_t *imp, \
} \
}
+/*******************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name, \
+ mask_type, mask_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_mips (dst_type *dst, \
+ uint32_t src, \
+ mask_type *mask, \
+ int32_t w); \
+ \
+static void \
+mips_composite_##name (pixman_implementation_t *imp, \
+ pixman_composite_info_t *info) \
+{ \
+ PIXMAN_COMPOSITE_ARGS (info); \
+ dst_type *dst_line, *dst; \
+ mask_type *mask_line, *mask; \
+ int32_t dst_stride, mask_stride; \
+ uint32_t src; \
+ \
+ src = _pixman_image_get_solid ( \
+ imp, src_image, dest_image->bits.format); \
+ \
+ if ((flags & SKIP_ZERO_SRC) && src == 0) \
+ return; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type, \
+ mask_stride, mask_line, mask_cnt); \
+ \
+ while (height--) \
+ { \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ mask = mask_line; \
+ mask_line += mask_stride; \
+ pixman_composite_##name##_asm_mips (dst, src, mask, width); \
+ } \
+}
+
#endif //PIXMAN_MIPS_DSPR2_H
--
1.7.3
More information about the Pixman
mailing list