[Pixman] [PATCH 03/10] ARM: nearest scaling support for NEON scanline compositing functions
Siarhei Siamashka
siarhei.siamashka at gmail.com
Wed Nov 3 16:22:18 PDT 2010
From: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Now it is possible to generate scanline processing functions
for the case when the source image is scaled with NEAREST filter.
Only 16bpp and 32bpp pixel formats are supported for now, but
others can also be added later when needed. All the existing NEON
fast path functions should be quite easy to reuse for implementing
fast paths which can work with scaled source images.
---
pixman/pixman-arm-neon-asm.h | 176 ++++++++++++++++++++++++++++++++++++++---
1 files changed, 163 insertions(+), 13 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index aa5e9bd..d3b506d 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -205,6 +205,100 @@
.endif
.endm
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16 /* four 16-bit pixels: fills lanes 0-3 of D register number reg1 */
+ mov TMP1, VX, asr #16 /* TMP1 = source pixel index (VX >> 16) */
+ add VX, VX, UNIT_X /* step VX by UNIT_X for the next pixel */
+ add TMP1, mem_operand, TMP1, asl #1 /* TMP1 = mem_operand + index * 2 bytes */
+ mov TMP2, VX, asr #16 /* same address computation for the second pixel, in TMP2 */
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[0]}, [TMP1, :16] /* lane 0 <- first pixel; TMP1 is free for reuse after this */
+ mov TMP1, VX, asr #16 /* third pixel address, computed between the loads */
+ add VX, VX, UNIT_X
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d&reg1&[1]}, [TMP2, :16] /* lane 1 <- second pixel; TMP2 is free for reuse after this */
+ mov TMP2, VX, asr #16 /* fourth pixel address */
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d&reg1&[2]}, [TMP1, :16] /* lane 2 <- third pixel */
+ vld1.16 {d&reg1&[3]}, [TMP2, :16] /* lane 3 <- fourth pixel */
+.elseif elem_size == 32 /* two 32-bit pixels: fills lanes 0-1 of D register number reg1 */
+ mov TMP1, VX, asr #16 /* TMP1 = source pixel index (VX >> 16) */
+ add VX, VX, UNIT_X /* step VX by UNIT_X for the next pixel */
+ add TMP1, mem_operand, TMP1, asl #2 /* TMP1 = mem_operand + index * 4 bytes */
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d&reg1&[0]}, [TMP1, :32] /* lane 0 <- first pixel */
+ vld1.32 {d&reg1&[1]}, [TMP2, :32] /* lane 1 <- second pixel */
+.else
+ .error "unsupported" /* only 16-bit and 32-bit elements are handled by this macro */
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16 /* load one 16-bit pixel into lane idx of D register number reg1 */
+ mov TMP1, VX, asr #16 /* TMP1 = source pixel index (VX >> 16) */
+ add VX, VX, UNIT_X /* step VX by UNIT_X for the next pixel */
+ add TMP1, mem_operand, TMP1, asl #1 /* TMP1 = mem_operand + index * 2 bytes */
+ vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32 /* load one 32-bit pixel into lane idx of D register number reg1 */
+ mov TMP1, VX, asr #16 /* TMP1 = source pixel index (VX >> 16) */
+ add VX, VX, UNIT_X /* step VX by UNIT_X for the next pixel */
+ add TMP1, mem_operand, TMP1, asl #2 /* TMP1 = mem_operand + index * 4 bytes */
+ vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif /* any other elem_size expands to nothing: no load is emitted and VX is not advanced */
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32 /* 32 bytes: four D registers, basereg+4 .. basereg+7 */
+ pixld1_s elem_size, %(basereg+4), mem_operand
+ pixld1_s elem_size, %(basereg+5), mem_operand
+ pixld1_s elem_size, %(basereg+6), mem_operand
+ pixld1_s elem_size, %(basereg+7), mem_operand
+ pixdeinterleave elem_size, %(basereg+4) /* defined elsewhere in the file; applied only to the full 32-byte block */
+.elseif numbytes == 16 /* 16 bytes: two D registers, basereg+2 and basereg+3 */
+ pixld1_s elem_size, %(basereg+2), mem_operand
+ pixld1_s elem_size, %(basereg+3), mem_operand
+.elseif numbytes == 8 /* 8 bytes: one D register, basereg+1 */
+ pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4 /* 4 bytes: upper lanes of D register basereg+0 */
+ .if elem_size == 32
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand /* one 32-bit pixel -> lane 1 */
+ .elseif elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand /* two 16-bit pixels -> lanes 2 and 3 */
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .else /* NOTE(review): pixld0_s only handles 16 and 32, so these four calls expand to nothing */
+ pixld0_s elem_size, %(basereg+0), 4, mem_operand
+ pixld0_s elem_size, %(basereg+0), 5, mem_operand
+ pixld0_s elem_size, %(basereg+0), 6, mem_operand
+ pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ .endif
+.elseif numbytes == 2 /* 2 bytes: low lanes of D register basereg+0 */
+ .if elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand /* one 16-bit pixel -> lane 1 */
+ .else /* NOTE(review): other element sizes expand to nothing in pixld0_s */
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .endif
+.elseif numbytes == 1 /* 1 byte: lane 1 of D register basereg+0 */
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+ .error "unsupported size: numbytes" /* assembly fails for any other byte count */
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0 /* emit nothing when bpp is not positive */
+ pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand /* numpix pixels of bpp bits -> byte count; bpp is passed as the element size */
+.endif
+.endm
+
.macro vuzp8 reg1, reg2
vuzp.8 d&reg1, d&reg2
.endm
@@ -792,7 +886,8 @@ fname:
* A simplified variant of function generation template for a single
* scanline processing (for implementing pixman combine functions)
*/
-.macro generate_composite_function_single_scanline fname, \
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
src_bpp_, \
mask_bpp_, \
dst_w_bpp_, \
@@ -830,23 +925,44 @@ fname:
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
+.if use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req r0
+ DST_W .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ MASK .req lr
+ TMP1 .req r4
+ TMP2 .req r5
+ DST_R .req r6
+
.macro pixld_src x:vararg
- pixld x
- .endm
- .macro fetch_src_pixblock
- pixld_src pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
+ pixld_s x
.endm
-/*
- * Assign symbolic names to registers
- */
+ ldr UNIT_X, [sp]
+ push {r4-r6, lr}
+ .if mask_bpp != 0
+ ldr MASK, [sp, #(16 + 4)]
+ .endif
+.else
+ /*
+ * Assign symbolic names to registers
+ */
W .req r0 /* width (is updated during processing) */
DST_W .req r1 /* destination buffer pointer for writes */
SRC .req r2 /* source buffer pointer */
DST_R .req ip /* destination buffer pointer for reads */
MASK .req r3 /* mask pointer */
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+.endif
+
.if (((flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
@@ -858,6 +974,11 @@ fname:
.set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
init
mov DST_R, DST_W
@@ -896,7 +1017,11 @@ fname:
process_pixblock_tail_head
cleanup
- bx lr /* exit */
+.if use_nearest_scaling != 0
+ pop {r4-r6, pc} /* exit */
+.else
+ bx lr /* exit */
+.endif
8:
/* Process the remaining trailing pixels in the scanline (dst unaligned) */
process_trailing_pixels 0, 0, \
@@ -905,19 +1030,44 @@ fname:
process_pixblock_tail_head
cleanup
- bx lr /* exit */
- .purgem fetch_src_pixblock
- .purgem pixld_src
+.if use_nearest_scaling != 0
+ pop {r4-r6, pc} /* exit */
+
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+
+.else
+ bx lr /* exit */
.unreq SRC
.unreq MASK
.unreq DST_R
.unreq DST_W
.unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
.endfunc
.endm
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, x /* use_nearest_scaling = 0; all other arguments are forwarded unchanged */
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, x /* use_nearest_scaling = 1: selects the pixld_s fetcher and the r4-r6/lr push/pop path */
+.endm
+
/* Default prologue/epilogue, nothing special needs to be done */
.macro default_init
--
1.7.2.2
More information about the Pixman
mailing list