[Pixman] [PATCH] MIPS: DSPr2: Added several bilinear fast paths: - src_8888_8_8888 - src_8888_8_0565 - src_0565_8_x888 - src_0565_8_0565 - add_8888_8_8888
Nemanja Lukic
nlukic at mips.com
Mon Jun 25 12:08:33 PDT 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz:
Reference (before):
src_8888_8_8888 = L1: 2.25 L2: 6.15 M: 5.48 ( 32.72%) HT: 4.67 VT: 4.65 R: 4.55 RT: 2.89 ( 24Kops/s)
src_8888_8_0565 = L1: 5.97 L2: 5.71 M: 5.13 ( 23.81%) HT: 4.39 VT: 4.36 R: 4.28 RT: 2.72 ( 23Kops/s)
src_0565_8_x888 = L1: 3.32 L2: 3.27 M: 3.16 ( 14.68%) HT: 2.87 VT: 2.85 R: 2.81 RT: 2.09 ( 19Kops/s)
src_0565_8_0565 = L1: 3.19 L2: 3.15 M: 3.04 ( 10.09%) HT: 2.76 VT: 2.74 R: 2.71 RT: 2.00 ( 18Kops/s)
over_8888_8_8888 = L1: 5.04 L2: 4.75 M: 4.12 ( 27.31%) HT: 3.60 VT: 3.58 R: 3.51 RT: 2.35 ( 21Kops/s)
add_8888_8_8888 = L1: 5.68 L2: 5.31 M: 4.53 ( 30.05%) HT: 3.95 VT: 3.92 R: 3.84 RT: 2.52 ( 22Kops/s)
Optimized:
src_8888_8_8888 = L1: 13.19 L2: 12.13 M: 9.75 ( 58.22%) HT: 8.60 VT: 8.44 R: 7.90 RT: 5.06 ( 33Kops/s)
src_8888_8_0565 = L1: 11.64 L2: 10.81 M: 9.18 ( 42.63%) HT: 8.04 VT: 7.90 R: 7.57 RT: 5.02 ( 32Kops/s)
src_0565_8_x888 = L1: 8.34 L2: 7.95 M: 7.29 ( 33.85%) HT: 6.55 VT: 6.48 R: 6.25 RT: 4.35 ( 30Kops/s)
src_0565_8_0565 = L1: 7.71 L2: 7.35 M: 6.90 ( 22.90%) HT: 6.14 VT: 6.10 R: 5.94 RT: 4.07 ( 29Kops/s)
over_8888_8_8888 = L1: 9.73 L2: 8.99 M: 7.15 ( 47.41%) HT: 6.40 VT: 6.30 R: 6.11 RT: 4.28 ( 30Kops/s)
add_8888_8_8888 = L1: 13.01 L2: 11.72 M: 8.70 ( 57.68%) HT: 7.59 VT: 7.46 R: 7.20 RT: 4.74 ( 32Kops/s)
Benchmark results were obtained using a tweaked version of lowlevel-blt-bench (which performs bilinear scaling using an almost-identity matrix).
---
pixman/pixman-mips-dspr2-asm.S | 322 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2-asm.h | 13 ++
pixman/pixman-mips-dspr2.c | 22 +++
3 files changed, 357 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 87558f0..74d1ee9 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -748,6 +748,266 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
END(pixman_composite_over_n_8_0565_asm_mips)
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+/* Bilinear-scaled SRC scanline: 32-bit source rows, 8-bit mask, 32-bit destination.
+ * a0 - *dst (one 32-bit pixel is stored per iteration)
+ * a1 - *mask (one byte is consumed per iteration)
+ * a2 - *src_top (row the tl/tr samples are loaded from)
+ * a3 - *src_bottom (row the bl/br samples are loaded from)
+ * 16(sp) - wt (multiplied with both horizontal weights to form s4/s5)
+ * 20(sp) - wb (multiplied with both horizontal weights to form s6/s7)
+ * 24(sp) - vx (bits 31..16 select the source pixel, bits 15..8 are the horizontal weight)
+ * 28(sp) - unit_x (added to vx after every pixel)
+ * 32(sp) - w (number of pixels to produce; 0 returns immediately)
+ */
+
+ lw v1, 32(sp) /* v1 = w, read before sp is adjusted */
+ beqz v1, 1f /* w == 0: skip straight to the return at 1: */
+ nop
+
+ SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ /* from here on the stack arguments are addressed 28 bytes higher (16(sp) -> 44(sp)) */
+ lw s0, 44(sp) /* s0 = wt */
+ lw s1, 48(sp) /* s1 = wb */
+ lw s2, 52(sp) /* s2 = vx */
+ lw s3, 56(sp) /* s3 = unit_x */
+ li v0, 256 /* v0 = 256, used for the "256 - weight" subtraction below */
+ li s8, 0x00ff00ff /* s8 = value passed as the maskLSR argument of MIPS_UN8x4_MUL_UN8 */
+0: /* per-pixel loop; v1 counts the remaining pixels down to 0 */
+ andi t4, s2, 0xffff /* t4 = vx & 0xffff (low 16 bits of vx) */
+ srl t4, t4, 8 /* t4 = (vx & 0xffff) >> 8, horizontal weight in 0..255 */
+ subu t5, v0, t4 /* t5 = 256 - t4 */
+
+ mul s4, s0, t5 /* s4 = wt * (256 - t4) */
+ mul s5, s0, t4 /* s5 = wt * t4 */
+ mul s6, s1, t5 /* s6 = wb * (256 - t4) */
+ mul s7, s1, t4 /* s7 = wb * t4 */
+
+ sra t9, s2, 16 /* t9 = vx >> 16, index of the left source pixel */
+ sll t9, t9, 2 /* t9 = byte offset of the left pixel (4 bytes per pixel) */
+ addiu t8, t9, 4 /* t8 = byte offset of the pixel to its right */
+ lwx t0, t9(a2) /* t0 = tl */
+ lwx t1, t8(a2) /* t1 = tr */
+ addiu v1, v1, -1 /* one pixel fewer left to produce */
+ lwx t2, t9(a3) /* t2 = bl */
+ lwx t3, t8(a3) /* t3 = br */
+ /* blend the four samples with s4..s7; the result is presumably left in t0 (it is the source operand of the multiply below) - confirm against the macro */
+ BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+ lbu t1, 0(a1) /* t1 = mask */
+ addiu a1, a1, 1 /* advance to the next mask byte */
+ MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 /* t0 = t0 scaled by mask t1 (args: src, mask, out, maskLSR, 3 scratch) */
+
+ addu s2, s2, s3 /* vx += unit_x; */
+ sw t0, 0(a0) /* store the finished 32-bit pixel */
+ bnez v1, 0b /* loop while pixels remain */
+ addiu a0, a0, 4 /* dst += 4 bytes; runs in the branch delay slot (assumes .set noreorder - TODO confirm in LEAF_MIPS_DSPR2) */
+
+ RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+ j ra
+ nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+/* Bilinear-scaled SRC scanline: 32-bit source rows, 8-bit mask, 16-bit destination.
+ * a0 - *dst (one 16-bit pixel is stored per iteration)
+ * a1 - *mask (one byte is consumed per iteration)
+ * a2 - *src_top (row the tl/tr samples are loaded from)
+ * a3 - *src_bottom (row the bl/br samples are loaded from)
+ * 16(sp) - wt (multiplied with both horizontal weights to form s4/s5)
+ * 20(sp) - wb (multiplied with both horizontal weights to form s6/s7)
+ * 24(sp) - vx (bits 31..16 select the source pixel, bits 15..8 are the horizontal weight)
+ * 28(sp) - unit_x (added to vx after every pixel)
+ * 32(sp) - w (number of pixels to produce; 0 returns immediately)
+ */
+
+ lw v1, 32(sp) /* v1 = w, read before sp is adjusted */
+ beqz v1, 1f /* w == 0: skip straight to the return at 1: */
+ nop
+
+ SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ /* from here on the stack arguments are addressed 28 bytes higher (16(sp) -> 44(sp)) */
+ lw s0, 44(sp) /* s0 = wt */
+ lw s1, 48(sp) /* s1 = wb */
+ lw s2, 52(sp) /* s2 = vx */
+ lw s3, 56(sp) /* s3 = unit_x */
+ li v0, 256 /* v0 = 256, used for the "256 - weight" subtraction below */
+ li s8, 0x00ff00ff /* s8 = value passed as the maskLSR argument of MIPS_UN8x4_MUL_UN8 */
+0: /* per-pixel loop; v1 counts the remaining pixels down to 0 */
+ andi t4, s2, 0xffff /* t4 = vx & 0xffff (low 16 bits of vx) */
+ srl t4, t4, 8 /* t4 = (vx & 0xffff) >> 8, horizontal weight in 0..255 */
+ subu t5, v0, t4 /* t5 = 256 - t4 */
+
+ mul s4, s0, t5 /* s4 = wt * (256 - t4) */
+ mul s5, s0, t4 /* s5 = wt * t4 */
+ mul s6, s1, t5 /* s6 = wb * (256 - t4) */
+ mul s7, s1, t4 /* s7 = wb * t4 */
+
+ sra t9, s2, 16 /* t9 = vx >> 16, index of the left source pixel */
+ sll t9, t9, 2 /* t9 = byte offset of the left pixel (4 bytes per pixel) */
+ addiu t8, t9, 4 /* t8 = byte offset of the pixel to its right */
+ lwx t0, t9(a2) /* t0 = tl */
+ lwx t1, t8(a2) /* t1 = tr */
+ addiu v1, v1, -1 /* one pixel fewer left to produce */
+ lwx t2, t9(a3) /* t2 = bl */
+ lwx t3, t8(a3) /* t3 = br */
+ /* blend the four samples with s4..s7; the result is presumably left in t0 (it is the source operand of the multiply below) - confirm against the macro */
+ BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+ lbu t1, 0(a1) /* t1 = mask */
+ addiu a1, a1, 1 /* advance to the next mask byte */
+ MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 /* t0 = t0 scaled by mask t1 (args: src, mask, out, maskLSR, 3 scratch) */
+ CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 /* pack t0 into a 16-bit pixel; the value stored below is taken from t1 */
+
+ addu s2, s2, s3 /* vx += unit_x; */
+ sh t1, 0(a0) /* store the finished 16-bit pixel */
+ bnez v1, 0b /* loop while pixels remain */
+ addiu a0, a0, 2 /* dst += 2 bytes; runs in the branch delay slot (assumes .set noreorder - TODO confirm in LEAF_MIPS_DSPR2) */
+
+ RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+ j ra
+ nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+/* Bilinear-scaled SRC scanline: 16-bit source rows, 8-bit mask, 32-bit destination.
+ * a0 - *dst (one 32-bit pixel is stored per iteration)
+ * a1 - *mask (one byte is consumed per iteration)
+ * a2 - *src_top (row the tl/tr samples are loaded from)
+ * a3 - *src_bottom (row the bl/br samples are loaded from)
+ * 16(sp) - wt (multiplied with both horizontal weights to form s4/s5)
+ * 20(sp) - wb (multiplied with both horizontal weights to form s6/s7)
+ * 24(sp) - vx (bits 31..16 select the source pixel, bits 15..8 are the horizontal weight)
+ * 28(sp) - unit_x (added to vx after every pixel)
+ * 32(sp) - w (number of pixels to produce; 0 returns immediately)
+ */
+
+ lw t0, 32(sp) /* t0 = w, used only for the zero-width check (reloaded into ra below) */
+ beqz t0, 1f /* w == 0: skip straight to the return at 1: */
+ nop
+
+ SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+ /* stack arguments are now addressed 32 bytes higher (16(sp) -> 48(sp)); ra is saved because it is reused as the loop counter */
+ lw s0, 48(sp) /* s0 = wt */
+ lw s1, 52(sp) /* s1 = wb */
+ lw s2, 56(sp) /* s2 = vx */
+ lw s3, 60(sp) /* s3 = unit_x */
+ lw ra, 64(sp) /* ra = w */
+ li v0, 0x00ff00ff /* v0 = value passed as the maskLSR argument of MIPS_UN8x4_MUL_UN8 */
+ li v1, 0x07e007e0 /* v1 = bits 5..10 of each halfword, passed to CONVERT_2x0565_TO_2x8888 */
+ li s8, 0x001f001f /* s8 = bits 0..4 of each halfword, passed to CONVERT_2x0565_TO_2x8888 */
+0: /* per-pixel loop; ra counts the remaining pixels down to 0 */
+ andi t4, s2, 0xffff /* t4 = vx & 0xffff (low 16 bits of vx) */
+ srl t4, t4, 8 /* t4 = (vx & 0xffff) >> 8, horizontal weight in 0..255 */
+ xori t5, t4, 0xff /* t5 = 255 - t4 (t4 fits in 8 bits) */
+ addiu t5, t5, 1 /* t5 = ( 256 - (vx>>8)) */
+
+ mul s4, s0, t5 /* s4 = wt * (256 - t4) */
+ mul s5, s0, t4 /* s5 = wt * t4 */
+ mul s6, s1, t5 /* s6 = wb * (256 - t4) */
+ mul s7, s1, t4 /* s7 = wb * t4 */
+
+ sra t9, s2, 16 /* t9 = vx >> 16, index of the left source pixel */
+ sll t9, t9, 1 /* t9 = byte offset of the left pixel (2 bytes per pixel) */
+ addiu t8, t9, 2 /* t8 = byte offset of the pixel to its right */
+ lhx t0, t9(a2) /* t0 = tl */
+ lhx t1, t8(a2) /* t1 = tr */
+ andi t1, t1, 0xffff /* keep only the low 16 bits of tr */
+ addiu ra, ra, -1 /* one pixel fewer left to produce */
+ lhx t2, t9(a3) /* t2 = bl */
+ lhx t3, t8(a3) /* t3 = br */
+ andi t3, t3, 0xffff /* keep only the low 16 bits of br */
+ /* NOTE(review): only the second pixel of each pair is masked; presumably CONVERT_2x0565_TO_2x8888 discards the upper half of its first input - confirm against the macro. t4/t5 are reusable here because the weights already live in s4..s7. */
+ CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+ CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+ BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 /* presumably leaves the blended pixel in t0 (source of the multiply below) */
+ lbu t1, 0(a1) /* t1 = mask */
+ addiu a1, a1, 1 /* advance to the next mask byte */
+ MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 /* t0 = t0 scaled by mask t1 (args: src, mask, out, maskLSR, 3 scratch) */
+
+ addu s2, s2, s3 /* vx += unit_x; */
+ sw t0, 0(a0) /* store the finished 32-bit pixel */
+ bnez ra, 0b /* loop while pixels remain */
+ addiu a0, a0, 4 /* dst += 4 bytes; runs in the branch delay slot (assumes .set noreorder - TODO confirm in LEAF_MIPS_DSPR2) */
+
+ RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+ j ra
+ nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+/* Bilinear-scaled SRC scanline: 16-bit source rows, 8-bit mask, 16-bit destination.
+ * a0 - *dst (one 16-bit pixel is stored per iteration)
+ * a1 - *mask (one byte is consumed per iteration)
+ * a2 - *src_top (row the tl/tr samples are loaded from)
+ * a3 - *src_bottom (row the bl/br samples are loaded from)
+ * 16(sp) - wt (multiplied with both horizontal weights to form s4/s5)
+ * 20(sp) - wb (multiplied with both horizontal weights to form s6/s7)
+ * 24(sp) - vx (bits 31..16 select the source pixel, bits 15..8 are the horizontal weight)
+ * 28(sp) - unit_x (added to vx after every pixel)
+ * 32(sp) - w (number of pixels to produce; 0 returns immediately)
+ */
+
+ lw t0, 32(sp) /* t0 = w, used only for the zero-width check (reloaded into ra below) */
+ beqz t0, 1f /* w == 0: skip straight to the return at 1: */
+ nop
+
+ SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+ /* stack arguments are now addressed 32 bytes higher (16(sp) -> 48(sp)); ra is saved because it is reused as the loop counter */
+ lw s0, 48(sp) /* s0 = wt */
+ lw s1, 52(sp) /* s1 = wb */
+ lw s2, 56(sp) /* s2 = vx */
+ lw s3, 60(sp) /* s3 = unit_x */
+ lw ra, 64(sp) /* ra = w */
+ li v0, 0x00ff00ff /* v0 = value passed as the maskLSR argument of MIPS_UN8x4_MUL_UN8 */
+ li v1, 0x07e007e0 /* v1 = bits 5..10 of each halfword, passed to CONVERT_2x0565_TO_2x8888 */
+ li s8, 0x001f001f /* s8 = bits 0..4 of each halfword, passed to CONVERT_2x0565_TO_2x8888 */
+0: /* per-pixel loop; ra counts the remaining pixels down to 0 */
+ andi t4, s2, 0xffff /* t4 = vx & 0xffff (low 16 bits of vx) */
+ srl t4, t4, 8 /* t4 = (vx & 0xffff) >> 8, horizontal weight in 0..255 */
+ xori t5, t4, 0xff /* t5 = 255 - t4 (t4 fits in 8 bits) */
+ addiu t5, t5, 1 /* t5 = ( 256 - (vx>>8)) */
+
+ mul s4, s0, t5 /* s4 = wt * (256 - t4) */
+ mul s5, s0, t4 /* s5 = wt * t4 */
+ mul s6, s1, t5 /* s6 = wb * (256 - t4) */
+ mul s7, s1, t4 /* s7 = wb * t4 */
+
+ sra t9, s2, 16 /* t9 = vx >> 16, index of the left source pixel */
+ sll t9, t9, 1 /* t9 = byte offset of the left pixel (2 bytes per pixel) */
+ addiu t8, t9, 2 /* t8 = byte offset of the pixel to its right */
+ lhx t0, t9(a2) /* t0 = tl */
+ lhx t1, t8(a2) /* t1 = tr */
+ andi t1, t1, 0xffff /* keep only the low 16 bits of tr */
+ addiu ra, ra, -1 /* one pixel fewer left to produce */
+ lhx t2, t9(a3) /* t2 = bl */
+ lhx t3, t8(a3) /* t3 = br */
+ andi t3, t3, 0xffff /* keep only the low 16 bits of br */
+ /* NOTE(review): only the second pixel of each pair is masked; presumably CONVERT_2x0565_TO_2x8888 discards the upper half of its first input - confirm against the macro. t4/t5 are reusable here because the weights already live in s4..s7. */
+ CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+ CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+ BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 /* presumably leaves the blended pixel in t0 (source of the multiply below) */
+ lbu t1, 0(a1) /* t1 = mask */
+ addiu a1, a1, 1 /* advance to the next mask byte */
+ MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 /* t0 = t0 scaled by mask t1 (args: src, mask, out, maskLSR, 3 scratch) */
+ CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 /* pack t0 into a 16-bit pixel; the value stored below is taken from t1 */
+
+ addu s2, s2, s3 /* vx += unit_x; */
+ sh t1, 0(a0) /* store the finished 16-bit pixel */
+ bnez ra, 0b /* loop while pixels remain */
+ addiu a0, a0, 2 /* dst += 2 bytes; runs in the branch delay slot (assumes .set noreorder - TODO confirm in LEAF_MIPS_DSPR2) */
+
+ RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+ j ra
+ nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
/*
* a0 - dst (a8r8g8b8)
@@ -810,3 +1070,65 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
nop
END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
+/* Bilinear-scaled ADD scanline: 32-bit source rows, 8-bit mask, 32-bit destination (read, combined, written back).
+ * a0 - *dst (one 32-bit pixel is loaded and stored per iteration)
+ * a1 - *mask (one byte is consumed per iteration)
+ * a2 - *src_top (row the tl/tr samples are loaded from)
+ * a3 - *src_bottom (row the bl/br samples are loaded from)
+ * 16(sp) - wt (multiplied with both horizontal weights to form s4/s5)
+ * 20(sp) - wb (multiplied with both horizontal weights to form s6/s7)
+ * 24(sp) - vx (bits 31..16 select the source pixel, bits 15..8 are the horizontal weight)
+ * 28(sp) - unit_x (added to vx after every pixel)
+ * 32(sp) - w (number of pixels to produce; 0 returns immediately)
+ */
+
+ lw v1, 32(sp) /* v1 = w, read before sp is adjusted */
+ beqz v1, 1f /* w == 0: skip straight to the return at 1: */
+ nop
+
+ SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ /* from here on the stack arguments are addressed 28 bytes higher (16(sp) -> 44(sp)) */
+ lw s0, 44(sp) /* s0 = wt */
+ lw s1, 48(sp) /* s1 = wb */
+ lw s2, 52(sp) /* s2 = vx */
+ lw s3, 56(sp) /* s3 = unit_x */
+ li v0, 256 /* v0 = 256, used for the "256 - weight" subtraction below */
+ li s8, 0x00ff00ff /* s8 = value passed as the maskLSR argument of MIPS_UN8x4_MUL_UN8_ADD_UN8x4 */
+0: /* per-pixel loop; v1 counts the remaining pixels down to 0 */
+ andi t4, s2, 0xffff /* t4 = vx & 0xffff (low 16 bits of vx) */
+ srl t4, t4, 8 /* t4 = (vx & 0xffff) >> 8, horizontal weight in 0..255 */
+ subu t5, v0, t4 /* t5 = 256 - t4 */
+
+ mul s4, s0, t5 /* s4 = wt * (256 - t4) */
+ mul s5, s0, t4 /* s5 = wt * t4 */
+ mul s6, s1, t5 /* s6 = wb * (256 - t4) */
+ mul s7, s1, t4 /* s7 = wb * t4 */
+
+ sra t9, s2, 16 /* t9 = vx >> 16, index of the left source pixel */
+ sll t9, t9, 2 /* t9 = byte offset of the left pixel (4 bytes per pixel) */
+ addiu t8, t9, 4 /* t8 = byte offset of the pixel to its right */
+ lwx t0, t9(a2) /* t0 = tl */
+ lwx t1, t8(a2) /* t1 = tr */
+ addiu v1, v1, -1 /* one pixel fewer left to produce */
+ lwx t2, t9(a3) /* t2 = bl */
+ lwx t3, t8(a3) /* t3 = br */
+ /* blend the four samples with s4..s7; the result is presumably left in t0 (it is the source operand of the macro below) - confirm against the macro */
+ BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+ lbu t1, 0(a1) /* t1 = mask */
+ lw t2, 0(a0) /* t2 = dst */
+ addiu a1, a1, 1 /* advance to the next mask byte */
+ MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5 /* t0 = (t0 scaled by mask t1) + dst t2; t3..t5 are the scratch arguments */
+
+ addu s2, s2, s3 /* vx += unit_x; */
+ sw t0, 0(a0) /* write the combined pixel back to dst */
+ bnez v1, 0b /* loop while pixels remain */
+ addiu a0, a0, 4 /* dst += 4 bytes; runs in the branch delay slot (assumes .set noreorder - TODO confirm in LEAF_MIPS_DSPR2) */
+
+ RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+ j ra
+ nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 7cf3281..24b049e 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -566,6 +566,19 @@ LEAF_MIPS32R2(symbol) \
addu_s.qb \out2_8888, \d2_8888, \scratch2
.endm
+.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888, \
+ m_8, \
+ d_8888, \
+ out_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3
+ MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+ \out_8888, \maskLSR, \
+ \scratch1, \scratch2, \scratch3
+ /* out_8888 now holds the MIPS_UN8x4_MUL_UN8 result for (s_8888, m_8). d_8888 is read only after that step, so it must not be the same register as out_8888 or any scratch argument. */
+ addu_s.qb \out_8888, \out_8888, \d_8888 /* out_8888 += d_8888, per byte; addu_s.qb is the saturating form in the MIPS DSP ASE */
+.endm
+
.macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br, \
scratch1, scratch2, \
alpha, red, green, blue \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 06d4335..66c0e5d 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -58,8 +58,18 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC,
+ uint32_t, uint32_t) /* presumably binds pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips; types look like (src pixel, dst pixel) - confirm against the macro */
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC,
+ uint32_t, uint16_t) /* 32-bit source, 16-bit destination */
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC,
+ uint16_t, uint32_t) /* 16-bit source, 32-bit destination */
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC,
+ uint16_t, uint16_t) /* 16-bit source, 16-bit destination */
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER,
uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD,
+ uint32_t, uint32_t) /* ADD passes SKIP_ZERO_SRC like OVER above, while the SRC bindings pass 0 as the first argument */
static pixman_bool_t
pixman_fill_mips (uint32_t *bits,
@@ -209,9 +219,21 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_8_0565),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8_x888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_8_0565),
+
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
{ PIXMAN_OP_NONE },
};
--
1.7.3
More information about the Pixman
mailing list