[Pixman] [PATCH 16/32] armv6: Add ability to generate single-scanline fast paths
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:12 PDT 2014
These are suitable for use (save for a small shim function to switch
between masked and maskless versions) in the combine_32 array of functions,
which are used for a given operation if no fast path with specific input
and output pixel formats has been found.
---
pixman/pixman-arm-simd-asm.h | 104 ++++++++++++++++++++++++++++++++++++------
1 files changed, 89 insertions(+), 15 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index da153c3..8e65981 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -547,6 +547,11 @@
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
+ .if SINGLE_SCANLINE
+ .ifc "last_one",""
+ b 198f
+ .endif
+ .else
.if vars_spilled
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
@@ -580,21 +585,22 @@
b 198f
.endif
.endif
+ .endif
.endm
-.macro generate_composite_function fname, \
- src_bpp_, \
- mask_bpp_, \
- dst_w_bpp_, \
- flags_, \
- prefetch_distance_, \
- init, \
- newline, \
- cleanup, \
- process_head, \
- process_tail, \
- process_inner_loop
+.macro generate_composite_function_common fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags_, \
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
pixman_asm_function fname
@@ -688,12 +694,28 @@
* The following arguments are unused for non-mask operations
* [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
* [sp,#12] = 0 or mask stride (pixels)
+ *
+ * or in the single-scanline case:
+ * r0 = width (pixels)
+ * r1 = pointer to top-left pixel of destination
+ * r2 = pointer to top-left pixel of source
+ * The following argument is unused for non-mask operations
+ * r3 = pointer to top-left pixel of mask
*/
/*
* Assign symbolic names to registers
*/
X .req r0 /* pixels to go on this line */
+ .if SINGLE_SCANLINE
+ DST .req r1 /* destination pixel pointer */
+ SRC .req r2 /* source pixel pointer */
+ MASK .req r3 /* mask pixel pointer (if applicable) */
+ Y .req r4 /* temporary */
+ STRIDE_D .req r5 /* temporary */
+ STRIDE_S .req r6 /* temporary */
+ STRIDE_M .req r7 /* temporary */
+ .else
Y .req r1 /* lines to go */
DST .req r2 /* destination pixel pointer */
STRIDE_D .req r3 /* destination stride (bytes, minus width) */
@@ -701,6 +723,7 @@
STRIDE_S .req r5 /* source stride (bytes, minus width) */
MASK .req r6 /* mask pixel pointer (if applicable) */
STRIDE_M .req r7 /* mask stride (bytes, minus width) */
+ .endif
WK0 .req r8 /* pixel data registers */
WK1 .req r9
WK2 .req r10
@@ -710,13 +733,16 @@
push {r4-r11, lr} /* save all registers */
+ .if !SINGLE_SCANLINE
subs Y, Y, #1
blo 199f
+ .endif
#ifdef DEBUG_PARAMS
sub sp, sp, #9*4
#endif
+ .if !SINGLE_SCANLINE
.if src_bpp > 0
ldr SRC, [sp, #ARGS_STACK_OFFSET]
ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
@@ -725,6 +751,7 @@
ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
.endif
+ .endif
#ifdef DEBUG_PARAMS
add Y, Y, #1
@@ -741,6 +768,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
.endif
+ .if !SINGLE_SCANLINE
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
.if src_bpp > 0
@@ -751,6 +779,7 @@
lsl STRIDE_M, #mask_bpp_shift
sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
.endif
+ .endif
/* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
cmp X, #2*16*8/dst_w_bpp - 1
@@ -765,6 +794,7 @@
* inner loop termination. We want it to stop when there are
* (prefetch_distance+1) complete blocks to go. */
sub X, X, #(prefetch_distance+2)*pix_per_block
+ .if !SINGLE_SCANLINE
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
@@ -772,6 +802,7 @@
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
+ .endif
151: /* New line */
newline
preload_leading_step1 src_bpp, WK1, SRC
@@ -808,7 +839,7 @@
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
- .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif
@@ -817,6 +848,7 @@
.ltorg
160: /* Medium case */
+ .if !SINGLE_SCANLINE
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
@@ -824,6 +856,7 @@
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
+ .endif
161: /* New line */
newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
@@ -848,6 +881,7 @@
.ltorg
170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if !SINGLE_SCANLINE
.if dst_w_bpp < 32
mov ORIG_W, X
.endif
@@ -855,6 +889,7 @@
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
+ .endif
171: /* New line */
newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
@@ -892,13 +927,13 @@
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
- .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif
197:
- .if (flags) & FLAG_SPILL_LINE_VARS
+ .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
add sp, sp, #LINE_SAVED_REG_COUNT*4
.endif
198:
@@ -935,6 +970,42 @@
.endfunc
.endm
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags_, \
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
+ .set SINGLE_SCANLINE, 0
+generate_composite_function_common \
+ fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
+ init, newline, cleanup, process_head, process_tail, process_inner_loop
+.endm
+
+.macro generate_composite_function_single_scanline fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags_, \
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
+ .set SINGLE_SCANLINE, 1
+generate_composite_function_common \
+ fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
+ init, newline, cleanup, process_head, process_tail, process_inner_loop
+.endm
+
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
@@ -960,6 +1031,9 @@
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.endr
+ .if SINGLE_SCANLINE
+ .set LINE_SAVED_REG_COUNT, 0
+ .endif
.endm
.macro nop_macro x:vararg
--
1.7.5.4
More information about the Pixman
mailing list