[Pixman] [PATCH 1/3] MIPS: DSPr2: Added several nearest neighbor fast paths with a8 mask:
Nemanja Lukic
nlukic at mips.com
Sun Nov 4 13:58:19 PST 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Performance numbers before/after on MIPS-74kc @ 1GHz:
lowlevel-blt-bench -n
Referent (before):
over_8888_8_0565 = L1: 9.62 L2: 8.85 M: 7.40 ( 39.27%) HT: 5.67 VT: 5.61 R: 5.45 RT: 2.98 ( 22Kops/s)
over_0565_8_0565 = L1: 7.90 L2: 7.49 M: 6.72 ( 26.75%) HT: 5.24 VT: 5.20 R: 5.06 RT: 2.90 ( 22Kops/s)
Optimized:
over_8888_8_0565 = L1: 18.51 L2: 16.82 M: 12.13 ( 64.43%) HT: 10.06 VT: 9.88 R: 9.54 RT: 5.63 ( 31Kops/s)
over_0565_8_0565 = L1: 14.82 L2: 13.94 M: 11.34 ( 45.20%) HT: 9.45 VT: 9.35 R: 9.03 RT: 5.50 ( 31Kops/s)
---
pixman/pixman-mips-dspr2-asm.S | 157 ++++++++++++++++++++++++++++++++++++++++
pixman/pixman-mips-dspr2.c | 11 +++
pixman/pixman-mips-dspr2.h | 46 ++++++++++++
3 files changed, 214 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index b5cae16..9b9b449 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1833,6 +1833,163 @@ LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips)
END(pixman_composite_add_8888_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w
+ * 16(sp) - vx
+ * 20(sp) - unit_x
+ */
+ beqz a3, 4f
+ nop
+
+ SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+ lw v0, 36(sp) /* v0 = vx */
+ lw v1, 40(sp) /* v1 = unit_x */
+ li t6, 0x00ff00ff
+ li t7, 0xf800f800
+ li t8, 0x07e007e0
+ li t9, 0x001F001F
+
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ sra t0, v0, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ addu v0, v0, v1 /* v0 = vx + unit_x */
+ sra t1, v0, 16 /* t1 = vx >> 16 */
+ sll t1, t1, 2 /* t1 = t1 * 4 (a8r8g8b8) */
+ addu t1, a1, t1
+ lw t1, 0(t1) /* t1 = source (a8r8g8b8) */
+ addu v0, v0, v1 /* v0 = vx + unit_x */
+ lbu t2, 0(a2) /* t2 = mask (a8) */
+ lbu t3, 1(a2) /* t3 = mask (a8) */
+ lhu t4, 0(a0) /* t4 = destination (r5g6b5) */
+ lhu t5, 2(a0) /* t5 = destination (r5g6b5) */
+ addiu a2, a2, 2
+
+ CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5
+ OVER_2x8888_2x8_2x8888 t0, t1, \
+ t2, t3, \
+ s0, s1, \
+ t4, t5, \
+ t6, s2, s3, s4, s5, t2, t3
+ CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3
+
+ sh s0, 0(a0)
+ sh s1, 2(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 4
+2:
+ beqz a3, 3f
+ nop
+ sra t0, v0, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 2 /* t0 = t0 * 4 (a8r8g8b8) */
+ addu t0, a1, t0
+ lw t0, 0(t0) /* t0 = source (a8r8g8b8) */
+ lbu t1, 0(a2) /* t1 = mask (a8) */
+ lhu t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+ CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5
+ OVER_8888_8_8888 t0, t1, t3, t2, t6, t4, t5, t7, t8
+ CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+ sh t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+4:
+ j ra
+ nop
+
+END(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (r5g6b5)
+ * a2 - mask (a8)
+ * a3 - w
+ * 16(sp) - vx
+ * 20(sp) - unit_x
+ */
+
+ beqz a3, 4f
+ nop
+ SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+ lw v0, 36(sp) /* v0 = vx */
+ lw v1, 40(sp) /* v1 = unit_x */
+ li t4, 0xf800f800
+ li t5, 0x07e007e0
+ li t6, 0x001F001F
+ li t7, 0x00ff00ff
+
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ sra t0, v0, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */
+ addu t0, a1, t0
+ lhu t0, 0(t0) /* t0 = source (r5g6b5) */
+ addu v0, v0, v1 /* v0 = vx + unit_x */
+ sra t1, v0, 16 /* t1 = vx >> 16 */
+ sll t1, t1, 1 /* t1 = t1 * 2 (r5g6b5) */
+ addu t1, a1, t1
+ lhu t1, 0(t1) /* t1 = source (r5g6b5) */
+ addu v0, v0, v1 /* v0 = vx + unit_x */
+ lbu t2, 0(a2) /* t2 = mask (a8) */
+ lbu t3, 1(a2) /* t3 = mask (a8) */
+ lhu t8, 0(a0) /* t8 = destination (r5g6b5) */
+ lhu t9, 2(a0) /* t9 = destination (r5g6b5) */
+ addiu a2, a2, 2
+
+ CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
+ CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1
+ OVER_2x8888_2x8_2x8888 s0, s1, \
+ t2, t3, \
+ s2, s3, \
+ t0, t1, \
+ t7, t8, t9, s4, s5, s0, s1
+ CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
+
+ sh s0, 0(a0)
+ sh s1, 2(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 4
+2:
+ beqz a3, 3f
+ nop
+ sra t0, v0, 16 /* t0 = vx >> 16 */
+ sll t0, t0, 1 /* t0 = t0 * 2 (r5g6b5) */
+ addu t0, a1, t0
+
+ lhu t0, 0(t0) /* t0 = source (r5g6b5) */
+ lbu t1, 0(a2) /* t1 = mask (a8) */
+ lhu t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+ CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
+ CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
+ OVER_8888_8_8888 t3, t1, t4, t0, t7, t2, t5, t6, t8
+ CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
+
+ sh t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+4:
+ j ra
+ nop
+
+END(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
/*
* a0 - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 9da636d..bd828df 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -107,6 +107,11 @@ PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, OVER,
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, ADD,
uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_0565,
+ OVER, uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 0565_8_0565,
+ OVER, uint16_t, uint16_t)
+
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC,
uint32_t, uint32_t)
PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC,
@@ -304,6 +309,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mips_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mips_composite_add_8888_8888),
+ PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+ PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
+
+ PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
+ PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
+
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8888),
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index bddcfd8..3766850 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -210,6 +210,52 @@ mips_composite_##name (pixman_implementation_t *imp, \
} \
}
+/*****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op, \
+ src_type, dst_type) \
+void \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips ( \
+ dst_type * dst, \
+ const src_type * src, \
+ const uint8_t * mask, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x); \
+ \
+static force_inline void \
+scaled_nearest_scanline_mips_##name##_##op (const uint8_t * mask, \
+ dst_type * pd, \
+ const src_type * ps, \
+ int32_t w, \
+ pixman_fixed_t vx, \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
+ pixman_bool_t zero_src) \
+{ \
+ if ((flags & SKIP_ZERO_SRC) && zero_src) \
+ return; \
+ pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, \
+ mask, w, \
+ vx, unit_x); \
+} \
+ \
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_cover_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_none_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op, \
+ scaled_nearest_scanline_mips_##name##_##op, \
+ src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
/****************************************************************************/
#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op, \
--
1.7.3
More information about the Pixman
mailing list