[Pixman] [PATCH 23/32] armv6: Add optimised scanline fetchers and writeback for r5g6b5 and a8
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:19 PDT 2014
This supports r5g6b5 source and destination images, and a8 source images.
lowlevel-blt-bench results for example operations which use these because
they lack a dedicated fast path at the time of writing:
in_reverse_8_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 30.0 0.3 37.0 0.3 100.0% +23.2%
L2 23.3 0.3 29.4 0.4 100.0% +26.1%
M 24.0 0.0 31.3 0.1 100.0% +30.5%
HT 12.8 0.1 16.1 0.1 100.0% +25.8%
VT 11.9 0.1 14.8 0.1 100.0% +24.6%
R 11.7 0.1 14.6 0.1 100.0% +24.5%
RT 5.1 0.1 6.2 0.1 100.0% +20.2%
in_0565_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 22.0 0.1 28.3 0.2 100.0% +28.4%
L2 16.6 0.2 23.6 0.3 100.0% +42.2%
M 16.5 0.0 24.7 0.1 100.0% +49.5%
HT 11.0 0.1 13.7 0.1 100.0% +24.4%
VT 10.7 0.0 13.1 0.1 100.0% +22.0%
R 10.3 0.0 12.6 0.1 100.0% +22.5%
RT 5.3 0.1 5.7 0.1 100.0% +9.0%
in_reverse_8888_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 16.6 0.1 20.9 0.1 100.0% +25.5%
L2 13.1 0.1 17.7 0.3 100.0% +35.3%
M 13.2 0.0 19.2 0.0 100.0% +45.3%
HT 9.6 0.0 11.7 0.1 100.0% +21.8%
VT 9.3 0.0 11.4 0.1 100.0% +22.4%
R 9.0 0.0 10.9 0.1 100.0% +21.1%
RT 4.7 0.1 5.2 0.1 100.0% +8.7%
---
pixman/pixman-arm-common.h | 31 ++++++++++++++
pixman/pixman-arm-simd-asm.S | 94 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 45 ++++++++++++++++++++
3 files changed, 170 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 3558c15..f4632b2 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -453,4 +453,35 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
pixman_composite_scanline_##name##_asm_##cputype (width, dest, src); \
}
+/*****************************************************************************/
+/*
+ * Bind an assembly scanline fetcher to the iterator interface.
+ *
+ * Declares pixman_get_scanline_NAME_asm_CPUTYPE (w, dst, src) and defines
+ * the wrapper CPUTYPE_get_scanline_NAME (iter, mask).  The wrapper calls
+ * the assembly routine with iter->width, iter->buffer and iter->bits,
+ * then advances iter->bits by iter->stride and returns iter->buffer.
+ * The mask argument is not used by the wrapper.
+ */
+#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
+void \
+pixman_get_scanline_##name##_asm_##cputype (int32_t w, \
+ uint32_t *dst, \
+ const uint32_t *src); \
+ \
+uint32_t * \
+cputype##_get_scanline_##name (pixman_iter_t *iter, const uint32_t *mask) \
+{ \
+ pixman_get_scanline_##name##_asm_##cputype (iter->width, iter->buffer, \
+ (uint32_t *) iter->bits); \
+ iter->bits += iter->stride; \
+ return iter->buffer; \
+}
+/*
+ * Bind an assembly write-back routine to the iterator interface.
+ *
+ * Declares pixman_write_back_NAME_asm_CPUTYPE (w, dst, src) and defines
+ * the wrapper CPUTYPE_write_back_NAME (iter).  The wrapper passes
+ * iter->width, uses iter->buffer as the source and writes to
+ * iter->bits - iter->stride.  The step back by one stride matches the
+ * wrapper generated by PIXMAN_ARM_BIND_GET_SCANLINE, which advances
+ * iter->bits by iter->stride after each fetch.
+ */
+#define PIXMAN_ARM_BIND_WRITE_BACK(cputype, name) \
+void \
+pixman_write_back_##name##_asm_##cputype (int32_t w, \
+ uint32_t *dst, \
+ const uint32_t *src); \
+ \
+void \
+cputype##_write_back_##name (pixman_iter_t *iter) \
+{ \
+ pixman_write_back_##name##_asm_##cputype (iter->width, \
+ (uint32_t *)(iter->bits - iter->stride), \
+ iter->buffer); \
+}
+
#endif
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index f61b715..b251187 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -388,6 +388,16 @@ generate_composite_function \
src_0565_8888_process_head, \
src_0565_8888_process_tail
+generate_composite_function_single_scanline \
+ pixman_get_scanline_r5g6b5_asm_armv6, 16, 0, 32, /* 16, 0, 32: presumably src, mask, dst bpp - TODO confirm */ \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, /* head and tail macros are shared with the composite function above */ \
+ src_0565_8888_process_tail
+
/******************************************************************************/
.macro src_x888_0565_init
@@ -465,6 +475,90 @@ generate_composite_function \
src_x888_0565_process_head, \
src_x888_0565_process_tail
+generate_composite_function_single_scanline \
+ pixman_write_back_r5g6b5_asm_armv6, 32, 0, 16, /* 32, 0, 16: presumably src, mask, dst bpp - TODO confirm */ \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+ 3, /* prefetch distance */ \
+ src_x888_0565_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_x888_0565_process_head, /* head and tail macros are shared with the composite function above */ \
+ src_x888_0565_process_tail
+
+/******************************************************************************/
+/*
+ * Initialisation for the a8 scanline fetcher: loads MASK with 0xff000000.
+ * The src_8_8888 pixel macros and inner loop AND shifted source words
+ * with MASK to keep only bits 31:24.
+ */
+.macro src_8_8888_init
+ mov MASK, #0xff000000
+.endm
+/*
+ * Head stage: issues a pixld of numbytes/4 from SRC into firstreg.
+ * Presumably numbytes counts 32bpp output bytes, so numbytes/4 is the
+ * matching count of 8bpp source bytes - TODO confirm against pixld.
+ * The unaligned_mask and preload parameters are not referenced.
+ */
+.macro src_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes/4, firstreg, SRC, unaligned_src
+.endm
+/*
+ * Expand one pixel in place: shift WK<d0> left by 24, so its low byte
+ * ends up in bits 31:24 and bits 23:0 are zero.
+ */
+.macro src_8_8888_1pixel cond, d0
+ mov&cond WK&d0, WK&d0, lsl #24
+.endm
+/*
+ * Expand two pixels packed in WK<d0>.
+ * WK<d1> receives bits 15:8 of WK<d0> moved to bits 31:24 (shift by 16,
+ * then AND with MASK).  WK<d0> is then shifted left by 24, leaving its
+ * bits 7:0 in bits 31:24.  WK<d1> must be computed first because the
+ * second instruction overwrites the packed source.
+ */
+.macro src_8_8888_2pixels cond, d0, d1
+ and&cond WK&d1, MASK, WK&d0, lsl #16
+ mov&cond WK&d0, WK&d0, lsl #24
+.endm
+/*
+ * Expand four pixels packed in WK<d0>.
+ * WK<d3>, WK<d2> and WK<d1> receive bits 31:24, 23:16 and 15:8 of WK<d0>
+ * respectively, each moved to bits 31:24 and isolated with MASK.
+ * WK<d0> is rewritten last (shifted left by 24) because it is also the
+ * packed source for the other three.
+ */
+.macro src_8_8888_4pixels cond, d0, d1, d2, d3
+ and&cond WK&d3, MASK, WK&d0
+ and&cond WK&d2, MASK, WK&d0, lsl #8
+ and&cond WK&d1, MASK, WK&d0, lsl #16
+ mov&cond WK&d0, WK&d0, lsl #24
+.endm
+/*
+ * Tail stage: selects the expansion macro from numbytes.
+ * 16 -> src_8_8888_4pixels on firstreg .. firstreg+3,
+ * 8 -> src_8_8888_2pixels on firstreg and firstreg+1,
+ * anything else (expected to be 4) -> src_8_8888_1pixel on firstreg.
+ */
+.macro src_8_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_8_8888_4pixels cond, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ src_8_8888_2pixels cond, %(firstreg+0), %(firstreg+1)
+ .else // numbytes == 4
+ src_8_8888_1pixel cond, %(firstreg+0)
+ .endif
+.endm
+/*
+ * Custom inner loop for the a8 fetcher; X is reduced by 32 per iteration.
+ *
+ * Each iteration issues eight word loads from SRC (post-incremented by 4)
+ * into STRIDE_M, which is used here as a temporary.  Every loaded word is
+ * split into WK0-WK3 with the same and/mov sequence as
+ * src_8_8888_4pixels and written with "pixst , 16, 0, DST" (presumably a
+ * 16-byte store starting at WK0 - TODO confirm against pixst).  The load
+ * of the following word is placed ahead of the store of the current one.
+ * One pld at [SRC, SCRATCH] is issued per iteration, and the loop repeats
+ * while the subtraction from X does not borrow (bhs).
+ *
+ * None of the macro's parameters are referenced in the body.
+ */
+.macro src_8_8888_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110: /* Length of inner loop is set to allow one preload per 32 source pixels */
+ ldr STRIDE_M, [SRC], #4
+ and WK3, MASK, STRIDE_M
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ ldr STRIDE_M, [SRC], #4
+ .rept 6
+ pixst , 16, 0, DST
+ and WK3, MASK, STRIDE_M
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ ldr STRIDE_M, [SRC], #4
+ .endr
+ pld [SRC, SCRATCH]
+ pixst , 16, 0, DST
+ and WK3, MASK, STRIDE_M
+ and WK2, MASK, STRIDE_M, lsl #8
+ and WK1, MASK, STRIDE_M, lsl #16
+ mov WK0, STRIDE_M, lsl #24
+ pixst , 16, 0, DST
+ subs X, X, #32
+ bhs 110b
+.endm
+/*
+ * Instantiates pixman_get_scanline_a8_asm_armv6 from the src_8_8888
+ * macros.  Unlike the r5g6b5 instantiations in this file, it passes an
+ * extra final argument, src_8_8888_inner_loop.  The values 8, 0, 32 are
+ * presumably the source, mask and destination bpp - TODO confirm.
+ */
+generate_composite_function_single_scanline \
+ pixman_get_scanline_a8_asm_armv6, 8, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ src_8_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_8_8888_process_head, \
+ src_8_8888_process_tail, \
+ src_8_8888_inner_loop
+
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index f938342..514231a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -109,6 +109,10 @@ PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5) /* defines armv6_get_scanline_r5g6b5 */
+PIXMAN_ARM_BIND_WRITE_BACK (armv6, r5g6b5) /* defines armv6_write_back_r5g6b5 */
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8) /* defines armv6_get_scanline_a8 */
+
void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
@@ -328,6 +332,46 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
{ PIXMAN_OP_NONE },
};
+static const pixman_iter_info_t arm_simd_iters[] = /* iterator table for the ARMv6 scanline routines */
+{
+ { PIXMAN_r5g6b5, /* r5g6b5 as a source (ITER_SRC) */
+ (FAST_PATH_STANDARD_FLAGS |
+ FAST_PATH_ID_TRANSFORM |
+ FAST_PATH_NEAREST_FILTER |
+ FAST_PATH_SAMPLES_COVER_CLIP_NEAREST |
+ FAST_PATH_BITS_IMAGE),
+ ITER_NARROW | ITER_SRC,
+ _pixman_iter_init_bits_stride,
+ armv6_get_scanline_r5g6b5,
+ NULL /* no write-back routine for this source entry */
+ },
+/* r5g6b5 as a destination (ITER_DEST): same fetcher plus a write-back; no SAMPLES_COVER_CLIP_NEAREST flag */
+ { PIXMAN_r5g6b5,
+ (FAST_PATH_STANDARD_FLAGS |
+ FAST_PATH_ID_TRANSFORM |
+ FAST_PATH_NEAREST_FILTER |
+ FAST_PATH_BITS_IMAGE),
+ ITER_NARROW | ITER_DEST,
+ _pixman_iter_init_bits_stride,
+ armv6_get_scanline_r5g6b5,
+ armv6_write_back_r5g6b5
+ },
+/* a8 as a source (ITER_SRC); there is no a8 destination entry */
+ { PIXMAN_a8,
+ (FAST_PATH_STANDARD_FLAGS |
+ FAST_PATH_ID_TRANSFORM |
+ FAST_PATH_NEAREST_FILTER |
+ FAST_PATH_SAMPLES_COVER_CLIP_NEAREST |
+ FAST_PATH_BITS_IMAGE),
+ ITER_NARROW | ITER_SRC,
+ _pixman_iter_init_bits_stride,
+ armv6_get_scanline_a8,
+ NULL /* no write-back routine for this source entry */
+ },
+/* presumably the end-of-table sentinel - TODO confirm */
+ { PIXMAN_null },
+};
+
pixman_implementation_t *
_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
@@ -342,6 +386,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
+ imp->iter_info = arm_simd_iters;
imp->blt = arm_simd_blt;
imp->fill = arm_simd_fill;
--
1.7.5.4
More information about the Pixman
mailing list