[Pixman] [PATCH 2/5] ARMv6: New fill routines
Ben Avison
bavison at riscosopen.org
Sat Jan 19 08:16:50 PST 2013
Note that this also effectively accelerates src_n_8888, src_n_0565 and
src_n_8 composite types, because of the fast paths in
pixman-fast-path.c implemented by fast_composite_solid_fill(), which
end up dispatching these platform-specific fill routines.
src_n_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 157.3 1.1 574.2 8.7 100.0% +265.0%
L2 94.2 0.5 364.8 4.2 100.0% +287.3%
M 92.7 0.4 358.7 1.1 100.0% +287.1%
HT 68.5 0.9 133.6 4.0 100.0% +95.2%
VT 61.3 0.8 111.8 2.6 100.0% +82.4%
R 61.1 0.9 108.7 2.8 100.0% +78.1%
RT 24.6 1.0 28.6 1.6 100.0% +16.0%
src_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 157.4 1.0 983.1 38.5 100.0% +524.6%
L2 93.6 0.5 696.0 14.3 100.0% +643.4%
M 92.7 0.4 680.5 1.0 100.0% +634.0%
HT 68.3 0.9 160.3 6.6 100.0% +134.6%
VT 61.1 0.8 130.1 3.4 100.0% +112.9%
R 61.0 0.8 125.4 4.1 100.0% +105.7%
RT 24.9 1.3 29.5 1.5 100.0% +18.2%
src_n_8
Before After
Mean StdDev Mean StdDev Confidence Change
L1 154.7 1.0 1324.4 48.5 100.0% +756.3%
L2 92.4 0.4 1178.4 10.9 100.0% +1175.6%
M 92.9 0.4 1275.7 2.1 100.0% +1273.5%
HT 68.2 1.0 169.8 5.5 100.0% +149.0%
VT 61.2 1.0 138.5 3.6 100.0% +126.3%
R 61.3 0.9 130.1 3.8 100.0% +112.4%
RT 25.5 1.3 29.2 1.9 100.0% +14.6%
---
pixman/pixman-arm-simd-asm.S | 121 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 68 +++++++++++++++++++++++
2 files changed, 189 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index e69de29..64588a1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison at riscosopen.org)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+#include "pixman-arm-simd-asm.h"
+
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ * cond ARM condition code for code block
+ * numbytes Number of output bytes that should be generated this time
+ * firstreg First WK register in which to place output
+ * unaligned_src Whether to use non-wordaligned loads of source image
+ * unaligned_mask Whether to use non-wordaligned loads of mask image
+ * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+.macro src_n_8888_init /* load the 32-bit fill value and copy it into four registers */
+ ldr SRC, [sp, #ARGS_STACK_OFFSET] /* word read from the stack; presumably the 5th C argument (src) -- confirm ARGS_STACK_OFFSET in pixman-arm-simd-asm.h */
+ mov STRIDE_S, SRC /* SRC, STRIDE_S, MASK and STRIDE_M are the registers fill_process_tail aliases as WK4-WK7 */
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init /* load the 16-bit fill value, replicate it to 32 bits, copy it into four registers */
+ ldrh SRC, [sp, #ARGS_STACK_OFFSET] /* halfword read from the stack; presumably the 5th C argument (src) -- confirm ARGS_STACK_OFFSET in pixman-arm-simd-asm.h */
+ orr SRC, SRC, lsl #16 /* SRC |= SRC << 16: the same 16-bit pixel in both halves of the word */
+ mov STRIDE_S, SRC /* SRC, STRIDE_S, MASK and STRIDE_M are the registers fill_process_tail aliases as WK4-WK7 */
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init /* load the 8-bit fill value, replicate it to 32 bits, copy it into four registers */
+ ldrb SRC, [sp, #ARGS_STACK_OFFSET] /* byte read from the stack; presumably the 5th C argument (src) -- confirm ARGS_STACK_OFFSET in pixman-arm-simd-asm.h */
+ orr SRC, SRC, lsl #8 /* SRC |= SRC << 8: the byte now fills the low 16 bits */
+ orr SRC, SRC, lsl #16 /* SRC |= SRC << 16: the byte now fills all four bytes of the word */
+ mov STRIDE_S, SRC /* SRC, STRIDE_S, MASK and STRIDE_M are the registers fill_process_tail aliases as WK4-WK7 */
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail cond, numbytes, firstreg /* store step shared by all three fills; firstreg is not referenced in the body */
+ WK4 .req SRC /* name the four registers that the init macros in this file loaded with the fill value as WK4-WK7 */
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+ pixst cond, numbytes, 4, DST /* pixst is not defined in this file; presumably stores numbytes bytes starting from WK4 at DST under condition cond -- confirm in pixman-arm-simd-asm.h */
+ .unreq WK4 /* remove the four aliases again at the end of every expansion */
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function /* instantiates the 32 bpp solid-fill routine that pixman-arm-simd.c declares and calls */ \
+ pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, /* 0, 0, 32: presumably source, mask and destination bpp -- confirm against the macro definition */ \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8888_init /* init: loads the fill value into SRC, STRIDE_S, MASK and STRIDE_M */ \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function /* instantiates the 16 bpp solid-fill routine that pixman-arm-simd.c declares and calls */ \
+ pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, /* 0, 0, 16: presumably source, mask and destination bpp -- confirm against the macro definition */ \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_0565_init /* init: loads the replicated 16-bit fill value into SRC, STRIDE_S, MASK and STRIDE_M */ \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function /* instantiates the 8 bpp solid-fill routine that pixman-arm-simd.c declares and calls */ \
+ pixman_composite_src_n_8_asm_armv6, 0, 0, 8, /* 0, 0, 8: presumably source, mask and destination bpp -- confirm against the macro definition */ \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8_init /* init: loads the replicated 8-bit fill value into SRC, STRIDE_S, MASK and STRIDE_M */ \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 94f9a0c..f313df3 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -392,6 +392,72 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
uint32_t, uint32_t)
+void /* 32 bpp solid fill; generated in pixman-arm-simd-asm.S */
+pixman_composite_src_n_8888_asm_armv6 (int32_t w,
+ int32_t h,
+ uint32_t *dst, /* arm_simd_fill passes the address of the first pixel to write */
+ int32_t dst_stride, /* arm_simd_fill passes the row stride in 32-bit pixels (byte stride / 4) */
+ uint32_t src); /* fill value */
+
+void /* 16 bpp solid fill; generated in pixman-arm-simd-asm.S */
+pixman_composite_src_n_0565_asm_armv6 (int32_t w,
+ int32_t h,
+ uint16_t *dst, /* arm_simd_fill passes the address of the first pixel to write */
+ int32_t dst_stride, /* arm_simd_fill passes the row stride in 16-bit pixels (byte stride / 2) */
+ uint16_t src); /* fill value */
+
+void /* 8 bpp solid fill; generated in pixman-arm-simd-asm.S */
+pixman_composite_src_n_8_asm_armv6 (int32_t w,
+ int32_t h,
+ uint8_t *dst, /* arm_simd_fill passes the address of the first pixel to write */
+ int32_t dst_stride, /* arm_simd_fill passes the row stride in bytes */
+ uint8_t src); /* fill value */
+
+/* Solid-fill the width x height rectangle at (x, y) of 'bits' with the low
+ * 'bpp' bits of _xor using the ARMv6 fill routines.  Returns TRUE for bpp of
+ * 8, 16 or 32 and FALSE (nothing written) for any other depth; imp is unused. */
+static pixman_bool_t
+arm_simd_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride, /* in 32-bit words */
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 _xor)
+{
+    /* stride is always multiple of 32bit units in pixman */
+    int32_t byte_stride = stride * (int32_t) sizeof (uint32_t); /* signed: keeps y * byte_stride and byte_stride / N correct for a negative stride */
+
+    switch (bpp)
+    {
+    case 8:
+        pixman_composite_src_n_8_asm_armv6 (
+            width, height,
+            (uint8_t *)(((char *) bits) + y * byte_stride + x),
+            byte_stride,
+            _xor & 0xff);
+        return TRUE;
+    case 16:
+        pixman_composite_src_n_0565_asm_armv6 (
+            width, height,
+            (uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+            byte_stride / 2, /* stride in 16-bit pixels */
+            _xor & 0xffff);
+        return TRUE;
+    case 32:
+        pixman_composite_src_n_8888_asm_armv6 (
+            width, height,
+            (uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+            byte_stride / 4, /* stride in 32-bit pixels */
+            _xor);
+        return TRUE;
+    default:
+        return FALSE;
+    }
+}
+
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -428,5 +494,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+ imp->fill = arm_simd_fill;
+
return imp;
}
--
1.7.5.4
More information about the Pixman
mailing list