[Pixman] [PATCH 2/5] ARMv6: New fill routines

Ben Avison bavison at riscosopen.org
Sat Jan 19 08:16:50 PST 2013


Note that this also effectively accelerates the src_n_8888, src_n_0565 and
src_n_8 composite types, because the fast paths in pixman-fast-path.c
implemented by fast_composite_solid_fill() end up dispatching to these
platform-specific fill routines.

src_n_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  157.3  1.1      574.2  8.7     100.0%      +265.0%
L2  94.2   0.5      364.8  4.2     100.0%      +287.3%
M   92.7   0.4      358.7  1.1     100.0%      +287.1%
HT  68.5   0.9      133.6  4.0     100.0%      +95.2%
VT  61.3   0.8      111.8  2.6     100.0%      +82.4%
R   61.1   0.9      108.7  2.8     100.0%      +78.1%
RT  24.6   1.0      28.6   1.6     100.0%      +16.0%

src_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  157.4  1.0      983.1  38.5    100.0%      +524.6%
L2  93.6   0.5      696.0  14.3    100.0%      +643.4%
M   92.7   0.4      680.5  1.0     100.0%      +634.0%
HT  68.3   0.9      160.3  6.6     100.0%      +134.6%
VT  61.1   0.8      130.1  3.4     100.0%      +112.9%
R   61.0   0.8      125.4  4.1     100.0%      +105.7%
RT  24.9   1.3      29.5   1.5     100.0%      +18.2%

src_n_8

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  154.7  1.0      1324.4 48.5    100.0%      +756.3%
L2  92.4   0.4      1178.4 10.9    100.0%      +1175.6%
M   92.9   0.4      1275.7 2.1     100.0%      +1273.5%
HT  68.2   1.0      169.8  5.5     100.0%      +149.0%
VT  61.2   1.0      138.5  3.6     100.0%      +126.3%
R   61.3   0.9      130.1  3.8     100.0%      +112.4%
RT  25.5   1.3      29.2   1.9     100.0%      +14.6%

---
 pixman/pixman-arm-simd-asm.S |  121 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |   68 +++++++++++++++++++++++
 2 files changed, 189 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index e69de29..64588a1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ben Avison (bavison at riscosopen.org)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+	.p2align 2
+
+#include "pixman-arm-simd-asm.h"
+
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ *   cond           ARM condition code for code block
+ *   numbytes       Number of output bytes that should be generated this time
+ *   firstreg       First WK register in which to place output
+ *   unaligned_src  Whether to use non-wordaligned loads of source image
+ *   unaligned_mask Whether to use non-wordaligned loads of mask image
+ *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+.macro src_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]   /* 32-bit word from the stack; presumably the src argument */
+        mov     STRIDE_S, SRC                   /* copy it into the three other registers that */
+        mov     MASK, SRC                       /* fill_process_tail names WK5-WK7, so that */
+        mov     STRIDE_M, SRC                   /* WK4-WK7 all hold the same word */
+.endm
+
+.macro src_n_0565_init
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]   /* zero-extended halfword from the stack; presumably the src argument */
+        orr     SRC, SRC, lsl #16               /* SRC |= SRC << 16: the same 16-bit value in both halves of the word */
+        mov     STRIDE_S, SRC                   /* copy the word into the three other registers that */
+        mov     MASK, SRC                       /* fill_process_tail names WK5-WK7, so that */
+        mov     STRIDE_M, SRC                   /* WK4-WK7 all hold the same word */
+.endm
+
+.macro src_n_8_init
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]   /* zero-extended byte from the stack; presumably the src argument */
+        orr     SRC, SRC, lsl #8                /* SRC |= SRC << 8: the byte now fills the low halfword */
+        orr     SRC, SRC, lsl #16               /* SRC |= SRC << 16: the byte now fills all four byte lanes */
+        mov     STRIDE_S, SRC                   /* copy the word into the three other registers that */
+        mov     MASK, SRC                       /* fill_process_tail names WK5-WK7, so that */
+        mov     STRIDE_M, SRC                   /* WK4-WK7 all hold the same word */
+.endm
+
+.macro fill_process_tail  cond, numbytes, firstreg
+    WK4     .req    SRC        /* give the four registers loaded by the *_init */
+    WK5     .req    STRIDE_S   /* macros the names WK4-WK7 for the duration of */
+    WK6     .req    MASK       /* the store below */
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST   /* firstreg is not used; the literal 4 presumably makes pixst (defined elsewhere) store from WK4 onwards */
+    .unreq  WK4                /* remove the temporary names again */
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, /* function name; 32 is presumably the destination bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8888_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail /* process tail */
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, /* function name; 16 is presumably the destination bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_0565_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail /* process tail */
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, /* function name; 8 is presumably the destination bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail /* process tail */
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 94f9a0c..f313df3 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -392,6 +392,72 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
+void
+pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint32_t *dst,        /* first pixel to write */
+                                       int32_t   dst_stride, /* arm_simd_fill passes bytes / 4 */
+                                       uint32_t  src);       /* arm_simd_fill passes _xor unmasked */
+
+void
+pixman_composite_src_n_0565_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint16_t *dst,        /* first pixel to write */
+                                       int32_t   dst_stride, /* arm_simd_fill passes bytes / 2 */
+                                       uint16_t  src);       /* arm_simd_fill passes _xor & 0xffff */
+
+void
+pixman_composite_src_n_8_asm_armv6 (int32_t   w,
+                                    int32_t   h,
+                                    uint8_t  *dst,        /* first pixel to write */
+                                    int32_t   dst_stride, /* arm_simd_fill passes the stride in bytes */
+                                    uint8_t  src);        /* arm_simd_fill passes _xor & 0xff */
+
+/* Fill a width x height rectangle whose top-left pixel is (x, y) with _xor,
+ * dispatching to the ARMv6 routine for the image depth.  Returns TRUE for
+ * 8, 16 and 32 bpp; any other depth returns FALSE without writing anything.
+ */
+static pixman_bool_t
+arm_simd_fill (pixman_implementation_t *imp,    /* not referenced here */
+               uint32_t *               bits,   /* origin that (x, y) is relative to */
+               int                      stride, /* in 32-bit words */
+               int                      bpp,    /* bits per pixel */
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 _xor)   /* fill value; low bpp bits used */
+{
+    /* Signed on purpose: as an unsigned value, a negative stride only gave
+     * the right row address and per-pixel stride via 32-bit wraparound.
+     * It is a whole number of words, so the divisions below are exact.
+     */
+    int32_t  byte_stride = stride * (int32_t) sizeof (uint32_t);
+    uint8_t *row = (uint8_t *) bits + y * byte_stride;
+
+    switch (bpp)
+    {
+    case 8:
+	pixman_composite_src_n_8_asm_armv6 (
+		width, height, row + x, byte_stride, _xor & 0xff);
+	return TRUE;
+    case 16:
+	/* stride is handed over in uint16_t units */
+	pixman_composite_src_n_0565_asm_armv6 (
+		width, height, (uint16_t *) (row + x * 2),
+		byte_stride / 2, _xor & 0xffff);
+	return TRUE;
+    case 32:
+	/* stride is handed over in uint32_t units */
+	pixman_composite_src_n_8888_asm_armv6 (
+		width, height, (uint32_t *) (row + x * 4),
+		byte_stride / 4, _xor);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -428,5 +494,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
+    imp->fill = arm_simd_fill;
+
     return imp;
 }
-- 
1.7.5.4



More information about the Pixman mailing list