pixman: Branch 'master' - 2 commits
Nemanja Lukic
nlukic at kemper.freedesktop.org
Mon Jan 21 18:16:24 PST 2013
pixman/pixman-mips-dspr2-asm.S | 344 +++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2.c | 15 +
2 files changed, 359 insertions(+)
New commits:
commit 2c6577476e5b18e17904ae8af244a39c352e2e33
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date: Tue Jan 22 03:01:05 2013 +0100
MIPS: DSPr2: Added more fast-paths:
- over_reverse_n_8888
- in_n_8_8
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
over_reverse_n_8888 = L1: 19.42 L2: 19.07 M: 15.38 ( 40.80%) HT: 13.35 VT: 13.10 R: 12.92 RT: 8.27 ( 49Kops/s)
in_n_8_8 = L1: 21.20 L2: 22.86 M: 21.42 ( 14.21%) HT: 15.97 VT: 15.69 R: 15.47 RT: 8.00 ( 48Kops/s)
Optimized:
over_reverse_n_8888 = L1: 60.09 L2: 47.87 M: 28.65 ( 76.02%) HT: 23.58 VT: 22.51 R: 21.99 RT: 12.28 ( 60Kops/s)
in_n_8_8 = L1: 89.38 L2: 86.07 M: 65.48 ( 43.44%) HT: 44.64 VT: 41.50 R: 40.77 RT: 16.94 ( 66Kops/s)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 64ef660..ddfacef 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -2210,6 +2210,240 @@ LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips)
END(pixman_composite_out_reverse_8_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_over_reverse_n_8888_asm_mips)
+/* OVER_REVERSE of a constant source with one row of destination pixels, in place.
+ * a0 - dst (a8r8g8b8), read and rewritten
+ * a1 - src (32bit constant)
+ * a2 - w (pixel count; 0 returns immediately)
+ */
+/* Per 8-bit channel: dst = saturate (dst + src * (255 - dst alpha) / 255). */
+ beqz a2, 5f /* w == 0: nothing to do, skip the register save/restore */
+ nop
+/* NOTE(review): the macro below presumably spills s0-s7 (used as scratch here); it is defined outside this hunk. */
+ SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+ li t0, 0x00ff00ff /* t0 = mask keeping the low byte of each halfword */
+ srl t9, a2, 2 /* t9 = number of complete groups of 4 pixels */
+ beqz t9, 2f /* branch if fewer than 4 pixels */
+ nop
+1: /* main loop: four pixels per iteration */
+ beqz t9, 2f /* all groups of 4 processed */
+ addiu t9, t9, -1
+/* t1..t4 = four destination pixels */
+ lw t1, 0(a0)
+ lw t2, 4(a0)
+ lw t3, 8(a0)
+ lw t4, 12(a0)
+
+ addiu a2, a2, -4 /* a2 ends up holding the leftover count for the tail loop */
+
+ not t5, t1 /* t5..t8 = ~dst */
+ not t6, t2
+ not t7, t3
+ not t8, t4
+ srl t5, t5, 24 /* keep the top byte: 255 - dst alpha */
+ srl t6, t6, 24
+ srl t7, t7, 24
+ srl t8, t8, 24
+ replv.ph t5, t5 /* copy that factor into both halfwords */
+ replv.ph t6, t6
+ replv.ph t7, t7
+ replv.ph t8, t8
+ muleu_s.ph.qbl s0, a1, t5 /* s0 = upper two src bytes * factor (pixel 1) */
+ muleu_s.ph.qbr s1, a1, t5 /* s1 = lower two src bytes * factor (pixel 1) */
+ muleu_s.ph.qbl s2, a1, t6 /* s2/s3: same for pixel 2 */
+ muleu_s.ph.qbr s3, a1, t6
+ muleu_s.ph.qbl s4, a1, t7 /* s4/s5: same for pixel 3 */
+ muleu_s.ph.qbr s5, a1, t7
+ muleu_s.ph.qbl s6, a1, t8 /* s6/s7: same for pixel 4 */
+ muleu_s.ph.qbr s7, a1, t8
+/* Divide the 16-bit products by 255 with rounding: t = ((x + 128) >> 8) & 0xff; x = (x + t + 128) >> 8. */
+ shra_r.ph t5, s0, 8 /* pixels 1 and 2 (s0..s3) first */
+ shra_r.ph t6, s1, 8
+ shra_r.ph t7, s2, 8
+ shra_r.ph t8, s3, 8
+ and t5, t5, t0 /* drop bits the arithmetic shift may have sign-extended */
+ and t6, t6, t0
+ and t7, t7, t0
+ and t8, t8, t0
+ addq.ph s0, s0, t5
+ addq.ph s1, s1, t6
+ addq.ph s2, s2, t7
+ addq.ph s3, s3, t8
+ shra_r.ph s0, s0, 8
+ shra_r.ph s1, s1, 8
+ shra_r.ph s2, s2, 8
+ shra_r.ph s3, s3, 8
+ shra_r.ph t5, s4, 8 /* same sequence for pixels 3 and 4 (s4..s7) */
+ shra_r.ph t6, s5, 8
+ shra_r.ph t7, s6, 8
+ shra_r.ph t8, s7, 8
+ and t5, t5, t0
+ and t6, t6, t0
+ and t7, t7, t0
+ and t8, t8, t0
+ addq.ph s4, s4, t5
+ addq.ph s5, s5, t6
+ addq.ph s6, s6, t7
+ addq.ph s7, s7, t8
+ shra_r.ph s4, s4, 8
+ shra_r.ph s5, s5, 8
+ shra_r.ph s6, s6, 8
+ shra_r.ph s7, s7, 8
+/* Repack the low byte of every halfword into one 32-bit value per pixel. */
+ precr.qb.ph t5, s0, s1
+ precr.qb.ph t6, s2, s3
+ precr.qb.ph t7, s4, s5
+ precr.qb.ph t8, s6, s7
+ addu_s.qb t5, t1, t5 /* original dst + scaled src, saturating per byte */
+ addu_s.qb t6, t2, t6
+ addu_s.qb t7, t3, t7
+ addu_s.qb t8, t4, t8
+/* Write the four results back over the destination. */
+ sw t5, 0(a0)
+ sw t6, 4(a0)
+ sw t7, 8(a0)
+ sw t8, 12(a0)
+ b 1b
+ addiu a0, a0, 16 /* advance dst by 4 pixels (presumably in the branch delay slot - confirm) */
+
+2:
+ beqz a2, 4f /* no leftover pixels */
+ nop
+3: /* tail loop: one pixel per iteration, same arithmetic as above */
+ lw t1, 0(a0)
+
+ not t2, t1
+ srl t2, t2, 24 /* t2 = 255 - dst alpha */
+ replv.ph t2, t2
+
+ muleu_s.ph.qbl t4, a1, t2 /* upper two src bytes * factor */
+ muleu_s.ph.qbr t5, a1, t2 /* lower two src bytes * factor */
+ shra_r.ph t6, t4, 8
+ shra_r.ph t7, t5, 8
+
+ and t6,t6,t0
+ and t7,t7,t0
+
+ addq.ph t8, t4, t6
+ addq.ph t9, t5, t7
+
+ shra_r.ph t8, t8, 8
+ shra_r.ph t9, t9, 8
+
+ precr.qb.ph t9, t8, t9 /* repack the four result bytes */
+
+ addu_s.qb t9, t1, t9 /* saturating add with the original dst pixel */
+ sw t9, 0(a0)
+
+ addiu a2, a2, -1
+ bnez a2, 3b
+ addiu a0, a0, 4 /* advance dst by one pixel */
+4:
+ RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+5:
+ j ra
+ nop
+
+END(pixman_composite_over_reverse_n_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
+/* IN of a constant source with one row of 8-bit destination pixels, in place.
+ * a0 - dst (a8), read and rewritten
+ * a1 - src (a8r8g8b8); only the top (alpha) byte is used
+ * a2 - w (pixel count; 0 returns immediately)
+ */
+/* Per pixel: x = dst * src alpha; dst = low byte of (x + (x >> 8) + 128) >> 8. */
+ beqz a2, 5f /* w == 0: nothing to do, skip the register save/restore */
+ nop
+/* NOTE(review): the macro below presumably spills s0-s7 (used as scratch here); it is defined outside this hunk. */
+ SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+ move t7, a1
+ srl t5, t7, 24 /* t5 = src alpha (top byte of the constant) */
+ replv.ph t5, t5 /* copy alpha into both halfwords; t5 is not written again below */
+ srl t9, a2, 2 /* t9 = how many multiples of 4 pixels */
+ beqz t9, 2f /* branch if fewer than 4 pixels */
+ nop
+
+1: /* main loop: four pixels per iteration */
+ addiu t9, t9, -1
+ addiu a2, a2, -4 /* a2 ends up holding the leftover count for the tail loop */
+ lbu t0, 0(a0) /* t0..t3 = four destination bytes, zero-extended */
+ lbu t1, 1(a0)
+ lbu t2, 2(a0)
+ lbu t3, 3(a0)
+/* The qbl products are always 0 (upper bytes are zero after lbu); the result lives in the low halfword of each qbr product. */
+ muleu_s.ph.qbl s0, t0, t5
+ muleu_s.ph.qbr s1, t0, t5 /* pixel 0 * alpha */
+ muleu_s.ph.qbl s2, t1, t5
+ muleu_s.ph.qbr s3, t1, t5 /* pixel 1 * alpha */
+ muleu_s.ph.qbl s4, t2, t5
+ muleu_s.ph.qbr s5, t2, t5 /* pixel 2 * alpha */
+ muleu_s.ph.qbl s6, t3, t5
+ muleu_s.ph.qbr s7, t3, t5 /* pixel 3 * alpha */
+/* Scale the products back to 8 bits: x = (x + (x >> 8) + 128) >> 8. */
+ shrl.ph t4, s0, 8 /* pixels 0 and 1 (s0..s3) first */
+ shrl.ph t6, s1, 8
+ shrl.ph t7, s2, 8
+ shrl.ph t8, s3, 8
+ addq.ph t0, s0, t4
+ addq.ph t1, s1, t6
+ addq.ph t2, s2, t7
+ addq.ph t3, s3, t8
+ shra_r.ph t0, t0, 8 /* t0/t1 = pixel 0 result halves */
+ shra_r.ph t1, t1, 8
+ shra_r.ph t2, t2, 8 /* t2/t3 = pixel 1 result halves */
+ shra_r.ph t3, t3, 8
+ shrl.ph t4, s4, 8 /* same sequence for pixels 2 and 3 (s4..s7) */
+ shrl.ph t6, s5, 8
+ shrl.ph t7, s6, 8
+ shrl.ph t8, s7, 8
+ addq.ph s0, s4, t4
+ addq.ph s1, s5, t6
+ addq.ph s2, s6, t7
+ addq.ph s3, s7, t8
+ shra_r.ph t4, s0, 8 /* t4/t6 = pixel 2 result halves */
+ shra_r.ph t6, s1, 8
+ shra_r.ph t7, s2, 8 /* t7/t8 = pixel 3 result halves */
+ shra_r.ph t8, s3, 8
+/* Pack the low bytes; the byte that sb stores below is the lowest one of each packed word. */
+ precr.qb.ph s0, t0, t1
+ precr.qb.ph s1, t2, t3
+ precr.qb.ph s2, t4, t6
+ precr.qb.ph s3, t7, t8
+/* Write the four result bytes back over the destination. */
+ sb s0, 0(a0)
+ sb s1, 1(a0)
+ sb s2, 2(a0)
+ sb s3, 3(a0)
+ bgtz t9, 1b /* loop while groups of 4 remain */
+ addiu a0, a0, 4 /* advance dst by 4 pixels (presumably in the branch delay slot - confirm) */
+2:
+ beqz a2, 4f /* no leftover pixels */
+ nop
+3: /* tail loop: one pixel per iteration, same arithmetic as above */
+ lbu t1, 0(a0)
+
+ muleu_s.ph.qbl t4, t1, t5
+ muleu_s.ph.qbr t7, t1, t5 /* dst * alpha in the low halfword */
+ shrl.ph t6, t4, 8
+ shrl.ph t0, t7, 8
+ addq.ph t8, t4, t6
+ addq.ph t9, t7, t0
+ shra_r.ph t8, t8, 8
+ shra_r.ph t9, t9, 8
+ precr.qb.ph t2, t8, t9
+ sb t2, 0(a0) /* store the lowest byte as the new dst value */
+ addiu a2, a2, -1
+ bnez a2, 3b
+ addiu a0, a0, 1 /* advance dst by one pixel */
+4:
+ RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+5:
+ j ra
+ nop
+
+END(pixman_composite_in_n_8_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
/*
* a0 - dst (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index a7e6f8a..e14e1c4 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -89,6 +89,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_0565,
uint16_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888,
uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_reverse_n_8888,
+ uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (0, in_n_8,
+ uint8_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t, 1,
uint8_t, 1, uint8_t, 1)
@@ -332,6 +336,9 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, mips_composite_out_reverse_8_0565),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, mips_composite_out_reverse_8_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, mips_composite_out_reverse_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mips_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, mips_composite_in_n_8),
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
commit a67b0e24d7eaba3b9525eeb8bf357ded95cc6b7c
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date: Tue Jan 22 02:59:44 2013 +0100
MIPS: DSPr2: Added more fast-paths for REVERSE operation:
- out_reverse_8_0565
- out_reverse_8_8888
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
out_reverse_8_0565 = L1: 14.29 L2: 13.58 M: 12.14 ( 24.16%) HT: 9.23 VT: 9.12 R: 8.84 RT: 4.75 ( 36Kops/s)
out_reverse_8_8888 = L1: 27.46 L2: 23.24 M: 17.41 ( 57.73%) HT: 12.61 VT: 12.47 R: 11.79 RT: 5.86 ( 41Kops/s)
Optimized:
out_reverse_8_0565 = L1: 28.24 L2: 25.64 M: 20.63 ( 41.05%) HT: 16.69 VT: 16.14 R: 15.50 RT: 8.69 ( 52Kops/s)
out_reverse_8_8888 = L1: 52.78 L2: 41.44 M: 23.50 ( 77.94%) HT: 18.79 VT: 18.16 R: 16.90 RT: 9.11 ( 53Kops/s)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index ba22e62..64ef660 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -2100,6 +2100,116 @@ LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips)
END(pixman_composite_add_8888_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm_mips)
+/* OUT_REVERSE of a row of 8-bit source values with a row of r5g6b5 pixels, in place.
+ * a0 - dst (r5g6b5), read and rewritten
+ * a1 - src (a8)
+ * a2 - w (pixel count; 0 returns immediately)
+ */
+/* Each dst pixel is widened, combined with (255 - src) by the MUL macro, and narrowed again. */
+ beqz a2, 4f /* w == 0: nothing to do, skip the register save/restore */
+ nop
+/* NOTE(review): the macro below presumably spills s0-s3 (passed to the macros in the loop); it is defined outside this hunk. */
+ SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+ li t2, 0xf800f800 /* bits 15..11 of each halfword (top 5-bit field of a 565 pixel) */
+ li t3, 0x07e007e0 /* bits 10..5 of each halfword (middle 6-bit field) */
+ li t4, 0x001F001F /* bits 4..0 of each halfword (bottom 5-bit field) */
+ li t5, 0x00ff00ff /* low byte of each halfword */
+
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: only the single-pixel tail runs */
+ nop
+1: /* main loop: two pixels per iteration */
+ lbu t0, 0(a1) /* t0 = source (a8) */
+ lbu t1, 1(a1) /* t1 = source (a8) */
+ lhu t6, 0(a0) /* t6 = destination (r5g6b5) */
+ lhu t7, 2(a0) /* t7 = destination (r5g6b5) */
+ addiu a1, a1, 2
+
+ not t0, t0
+ not t1, t1
+ andi t0, 0xff /* t0 = neg source1 */
+ andi t1, 0xff /* t1 = neg source2 */
+ CONVERT_2x0565_TO_2x8888 t6, t7, t8, t9, t3, t4, s0, s1, s2, s3
+ MIPS_2xUN8x4_MUL_2xUN8 t8, t9, t0, t1, t6, t7, t5, s0, s1, s2, s3, t8, t9
+ CONVERT_2x8888_TO_2x0565 t6, t7, t8, t9, t2, t3, t4, s0, s1
+/* NOTE(review): macro operand roles are not visible here; t8/t9 are what gets stored, so they presumably hold the 565 results. */
+ sh t8, 0(a0)
+ sh t9, 2(a0)
+ addiu a2, a2, -2
+ addiu t1, a2, -1
+ bgtz t1, 1b /* loop while at least two pixels remain */
+ addiu a0, a0, 4 /* advance dst by 2 pixels (presumably in the branch delay slot - confirm) */
+2:
+ beqz a2, 3f /* even width: no leftover pixel */
+ nop
+ lbu t0, 0(a1) /* t0 = source (a8) */
+ lhu t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+ not t0, t0
+ andi t0, 0xff /* t0 = neg source */
+ CONVERT_1x0565_TO_1x8888 t1, t2, t3, t4
+ MIPS_UN8x4_MUL_UN8 t2, t0, t1, t5, t3, t4, t6
+ CONVERT_1x8888_TO_1x0565 t1, t2, t3, t4
+/* NOTE(review): t2..t4 held the 565 masks above; here they are handed to the 1x macros, presumably as output/scratch. */
+ sh t2, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+4:
+ j ra
+ nop
+
+END(pixman_composite_out_reverse_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips)
+/* OUT_REVERSE of a row of 8-bit source values with a row of 32-bit pixels, in place.
+ * a0 - dst (a8r8g8b8), read and rewritten
+ * a1 - src (a8)
+ * a2 - w (pixel count; 0 returns immediately)
+ */
+/* Each dst pixel is combined with (255 - src) by the MUL macro; only t registers are used, so nothing is saved. */
+ beqz a2, 3f /* w == 0: nothing to do */
+ nop
+ li t4, 0x00ff00ff /* low byte of each halfword, passed to the MUL macros */
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: only the single-pixel tail runs */
+ nop
+1: /* main loop: two pixels per iteration */
+ lbu t0, 0(a1) /* t0 = source (a8) */
+ lbu t1, 1(a1) /* t1 = source (a8) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+ addiu a1, a1, 2
+ not t0, t0
+ not t1, t1
+ andi t0, 0xff /* t0 = neg source */
+ andi t1, 0xff /* t1 = neg source */
+/* NOTE(review): macro operand roles are not visible here; t5/t6 are what gets stored, so they presumably hold the results. */
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t5, t6, t4, t7, t8, t9, t2, t3, t0
+
+ sw t5, 0(a0)
+ sw t6, 4(a0)
+ addiu a2, a2, -2
+ addiu t1, a2, -1
+ bgtz t1, 1b /* loop while at least two pixels remain */
+ addiu a0, a0, 8 /* advance dst by 2 pixels (presumably in the branch delay slot - confirm) */
+2:
+ beqz a2, 3f /* even width: no leftover pixel */
+ nop
+ lbu t0, 0(a1) /* t0 = source (a8) */
+ lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+ not t0, t0
+ andi t0, 0xff /* t0 = neg source */
+/* NOTE(review): t2 is what gets stored, so it is presumably the macro's output operand. */
+ MIPS_UN8x4_MUL_UN8 t1, t0, t2, t4, t3, t5, t6
+
+ sw t2, 0(a0)
+3:
+ j ra
+ nop
+
+END(pixman_composite_out_reverse_8_8888_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
/*
* a0 - dst (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 11f1254..a7e6f8a 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -54,6 +54,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_0565,
+ uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
@@ -324,6 +328,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mips_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mips_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mips_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, mips_composite_out_reverse_8_0565),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, mips_composite_out_reverse_8_0565),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, mips_composite_out_reverse_8_8888),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, mips_composite_out_reverse_8_8888),
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
More information about the xorg-commit
mailing list