[Pixman] [PATCH] MIPS: DSPr2: Added several nearest neighbor fast paths: - over_8888_8888 - over_8888_0565 - src_0565_8888
Nemanja Lukic
nlukic at mips.com
Fri Mar 1 10:32:16 PST 2013
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench results
Reference (before):
over_8888_8888 = L1: 19.47 L2: 16.30 M: 11.24 ( 59.69%) HT: 9.54 VT: 9.29 R: 9.47 RT: 6.24 ( 37Kops/s)
over_8888_0565 = L1: 13.22 L2: 12.02 M: 9.77 ( 38.92%) HT: 8.58 VT: 8.35 R: 8.38 RT: 5.78 ( 35Kops/s)
src_0565_8888 = L1: 20.70 L2: 19.22 M: 12.50 ( 49.79%) HT: 10.45 VT: 10.18 R: 9.99 RT: 5.31 ( 31Kops/s)
Optimized:
over_8888_8888 = L1: 43.67 L2: 33.30 M: 16.32 ( 86.65%) HT: 14.10 VT: 13.78 R: 12.96 RT: 7.85 ( 39Kops/s)
over_8888_0565 = L1: 26.20 L2: 22.97 M: 15.92 ( 63.40%) HT: 13.33 VT: 13.13 R: 12.72 RT: 7.65 ( 39Kops/s)
src_0565_8888 = L1: 62.98 L2: 53.44 M: 23.07 ( 91.87%) HT: 19.85 VT: 19.15 R: 17.70 RT: 9.68 ( 43Kops/s)
---
pixman/pixman-mips-dspr2-asm.S | 186 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2-asm.h | 51 +++++++++---
pixman/pixman-mips-dspr2.c | 23 +++++
pixman/pixman-mips-dspr2.h | 42 +++++++++
4 files changed, 291 insertions(+), 11 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index fb612d9..b94e66f 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -3069,6 +3069,192 @@ LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
END(pixman_composite_in_n_8_asm_mips)
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+/* Nearest-neighbour scaled scanline: OVER of an a8r8g8b8 source onto an a8r8g8b8 dst.
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - w (pixel count: handled two at a time, then one trailing pixel if odd)
+ * a3 - vx (fixed-point source x; the source pixel index is vx >> 16)
+ * 16(sp) - unit_x (added to vx after each source fetch)
+ */
+ /* s0-s3 are saved/restored here; the two-pixel OVER below uses them as scratch */
+ SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+ lw t8, 16(sp) /* t8 = unit_x */
+ li t6, 0x00ff00ff /* t6 = maskLSR value required by OVER_2x8888_2x8888 */
+ beqz a2, 3f /* w == 0: nothing to do */
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: skip the two-pixel loop */
+ nop
+1:
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+
+ sra t1, a3, 16 /* t1 = vx >> 16 */
+ sll t1, t1, 2 /* t1 = t1 * 4 (a8r8g8b8) */
+ addu t1, a1, t1
+ lw t1, 0(t1) /* t1 = source (a8r8g8b8) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+
+ OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3
+
+ sw t4, 0(a0) /* store both OVER results */
+ sw t5, 4(a0)
+ addiu a2, a2, -2 /* w -= 2 */
+ addiu t1, a2, -1
+ bgtz t1, 1b /* loop while at least two pixels remain */
+ addiu a0, a0, 8 /* dst += 2 pixels; presumably in the branch delay slot - confirm noreorder */
+2:
+ beqz a2, 3f /* no trailing pixel */
+ nop
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+
+ OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7
+
+ sw t2, 0(a0) /* store the last pixel */
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+ j ra
+ nop
+
+END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+/* Nearest-neighbour scaled scanline: OVER of an a8r8g8b8 source onto an r5g6b5 dst.
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - w (pixel count: handled two at a time, then one trailing pixel if odd)
+ * a3 - vx (fixed-point source x; the source pixel index is vx >> 16)
+ * 16(sp) - unit_x (offset on entry; added to vx after each source fetch)
+ */
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1
+ lw t8, 40(sp) /* t8 = unit_x; 16(sp) on entry, presumably +24 after SAVE_REGS_ON_STACK 24 - confirm */
+ li t4, 0x00ff00ff /* t4 = maskLSR value required by OVER_2x8888_2x8888 */
+ li t5, 0xf800f800 /* t5 = maskR for CONVERT_2x8888_TO_2x0565 */
+ li t6, 0x07e007e0 /* t6 = maskG */
+ li t7, 0x001F001F /* t7 = maskB */
+ beqz a2, 3f /* w == 0: nothing to do */
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: skip the two-pixel loop */
+ nop
+1:
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+ sra t1, a3, 16 /* t1 = vx >> 16 */
+ sll t1, t1, 2 /* t1 = t1 * 4 (a8r8g8b8) */
+ addu t1, a1, t1
+ lw t1, 0(t1) /* t1 = source (a8r8g8b8) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+ lhu t2, 0(a0) /* t2 = destination (r5g6b5) */
+ lhu t3, 2(a0) /* t3 = destination (r5g6b5) */
+ /* per the macro names: dst 0565 -> 8888 (v0, v1), OVER into t2/t3, back to 0565 in v0/v1 */
+ CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3
+ OVER_2x8888_2x8888 t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4
+ CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2
+
+ sh v0, 0(a0) /* store both r5g6b5 results */
+ sh v1, 2(a0)
+ addiu a2, a2, -2 /* w -= 2 */
+ addiu t1, a2, -1
+ bgtz t1, 1b /* loop while at least two pixels remain */
+ addiu a0, a0, 4 /* dst += 2 pixels; presumably in the branch delay slot - confirm noreorder */
+2:
+ beqz a2, 3f /* no trailing pixel */
+ nop
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ lhu t1, 0(a0) /* t1 = destination (r5g6b5) */
+ addu a3, a3, t8 /* a3 = vx + unit_x */
+ /* NOTE(review): t5/t6 held masks; passed below presumably as scratch, fine only on this last pixel - confirm */
+ CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6
+ OVER_8888_8888 t0, t2, t1, t4, t3, t5, t6, t7
+ CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6
+
+ sh t2, 0(a0) /* store the last pixel */
+3:
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1
+ j ra
+ nop
+
+END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+/* Nearest-neighbour scaled scanline: SRC copy of an r5g6b5 source into an a8r8g8b8 dst.
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (r5g6b5)
+ * a2 - w (pixel count: handled two at a time, then one trailing pixel if odd)
+ * a3 - vx (fixed-point source x; the source pixel index is vx >> 16)
+ * 16(sp) - unit_x (added to vx after each source fetch in the main loop)
+ */
+
+ SAVE_REGS_ON_STACK 0, v0
+ beqz a2, 3f /* w == 0: nothing to do */
+ nop
+
+ lw v0, 16(sp) /* v0 = unit_x */
+ addiu t1, a2, -1
+ beqz t1, 2f /* w == 1: skip the two-pixel loop (and the mask loads below) */
+ nop
+
+ li t4, 0x07e007e0 /* t4, t5: masks passed to CONVERT_2x0565_TO_2x8888 */
+ li t5, 0x001F001F
+1:
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */
+ addu t0, a1, t0
+ lhu t0, 0(t0) /* t0 = source (r5g6b5) */
+ addu a3, a3, v0 /* a3 = vx + unit_x */
+ sra t1, a3, 16 /* t1 = vx >> 16 */
+ sll t1, t1, 1 /* t1 = t1 * 2 (r5g6b5) */
+ addu t1, a1, t1
+ lhu t1, 0(t1) /* t1 = source (r5g6b5) */
+ addu a3, a3, v0 /* a3 = vx + unit_x */
+ addiu a2, a2, -2 /* w -= 2 */
+
+ CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+ sw t2, 0(a0) /* store both converted pixels */
+ sw t3, 4(a0)
+
+ addiu t2, a2, -1 /* t2 is free again after the store */
+ bgtz t2, 1b /* loop while at least two pixels remain */
+ addiu a0, a0, 8 /* dst += 2 pixels; presumably in the branch delay slot - confirm noreorder */
+2:
+ beqz a2, 3f /* no trailing pixel */
+ nop
+ sra t0, a3, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */
+ addu t0, a1, t0
+ lhu t0, 0(t0) /* t0 = source (r5g6b5) */
+ /* last pixel: vx is not advanced because no further fetch follows */
+ CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+ sw t1, 0(a0) /* store the last pixel */
+3:
+ RESTORE_REGS_FROM_STACK 0, v0
+ j ra
+ nop
+
+END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
/*
* a0 - dst (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index b330c0f..cab122d 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -354,17 +354,16 @@ LEAF_MIPS32R2(symbol) \
out1_565, out2_565, \
maskR, maskG, maskB, \
scratch1, scratch2
- precrq.ph.w \scratch1, \in2_8888, \in1_8888
- precr_sra.ph.w \in2_8888, \in1_8888, 0
- shll.ph \scratch1, \scratch1, 8
- srl \in2_8888, \in2_8888, 3
- and \scratch2, \in2_8888, \maskB
- and \scratch1, \scratch1, \maskR
- srl \in2_8888, \in2_8888, 2
- and \out2_565, \in2_8888, \maskG
- or \out2_565, \out2_565, \scratch2
- or \out1_565, \out2_565, \scratch1
- srl \out2_565, \out1_565, 16
+ precr.qb.ph \scratch1, \in2_8888, \in1_8888
+ precrq.qb.ph \in2_8888, \in2_8888, \in1_8888
+ and \out1_565, \scratch1, \maskR
+ shrl.ph \scratch1, \scratch1, 3
+ shll.ph \in2_8888, \in2_8888, 3
+ and \scratch1, \scratch1, \maskB
+ or \out1_565, \out1_565, \scratch1
+ and \in2_8888, \in2_8888, \maskG
+ or \out1_565, \out1_565, \in2_8888
+ srl \out2_565, \out1_565, 16
.endm
/*
@@ -587,6 +586,36 @@ LEAF_MIPS32R2(symbol) \
addu_s.qb \out_8888, \out_8888, \s_8888
.endm
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888); the results are written
+ * to out1_8888 and out2_8888. maskLSR is needed for the rounding step and must
+ * be loaded beforehand with: li maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8888 s1_8888, \
+ s2_8888, \
+ d1_8888, \
+ d2_8888, \
+ out1_8888, \
+ out2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ not \scratch1, \s1_8888
+ srl \scratch1, \scratch1, 24 /* scratch1 = 255 - alpha(s1) */
+ not \scratch2, \s2_8888
+ srl \scratch2, \scratch2, 24 /* scratch2 = 255 - alpha(s2) */
+ MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \
+ \scratch1, \scratch2, \
+ \out1_8888, \out2_8888, \
+ \maskLSR, \
+ \scratch3, \scratch4, \scratch5, \
+ \scratch6, \d1_8888, \d2_8888
+ /* NOTE(review): d1/d2 are also the helper's last two arguments (presumably scratch) - confirm they may be clobbered */
+ addu_s.qb \out1_8888, \out1_8888, \s1_8888 /* out1 += s1, per-byte saturating add */
+ addu_s.qb \out2_8888, \out2_8888, \s2_8888 /* out2 += s2, per-byte saturating add */
+.endm
+
.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888, \
m_8, \
d_8888, \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 1ea2445..1949921 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -121,6 +121,13 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER, /* name, op */
+ uint32_t, uint32_t) /* src_type, dst_type */
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER, /* binds ..._8888_0565_OVER_asm_mips */
+ uint32_t, uint16_t) /* 32-bit source, 16-bit destination */
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC, /* binds ..._0565_8888_SRC_asm_mips */
+ uint16_t, uint32_t) /* 16-bit source, 32-bit destination */
+
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
uint32_t, uint32_t)
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC,
@@ -357,6 +364,22 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (IN, solid, null, a8, mips_composite_in_n_8),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
+
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
+
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888),
+ PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+ /* Note: NONE repeat is not supported yet */
+ SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index 4ac9ff9..955ed70 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -246,6 +246,48 @@ mips_composite_##name (pixman_implementation_t *imp, \
} \
}
+/****************************************************************************/
+/* Declare the name/op asm scanline routine, wrap it (max_vx and zero_src are not forwarded) and instantiate FAST_NEAREST_MAINLOOP for COVER, NONE and PAD. */
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips ( \
+ dst_type * dst, \
+ const src_type * src, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x); \
+ \
+static force_inline void \
+scaled_nearest_scanline_mips_##name##_##op (dst_type * pd, \
+ const src_type * ps, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w, \
+ vx, unit_x); \
+} \
+ \
+FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, dst_type, COVER) \
+FAST_NEAREST_MAINLOOP (mips_##name##_none_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, dst_type, NONE) \
+FAST_NEAREST_MAINLOOP (mips_##name##_pad_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, dst_type, PAD)
+
+/* Expand to the COVER, NONE and PAD fast path table entries for one (op, s, d, func) combination */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+
/*****************************************************************************/
#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op, \
--
1.7.3
More information about the Pixman
mailing list