[Pixman] [PATCH 02/10] ARM: NEON: source image pixel fetcher can be overridden now
Siarhei Siamashka
siarhei.siamashka at gmail.com
Wed Nov 3 16:22:17 PDT 2010
From: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Added a special macro 'pixld_src' which is now responsible for fetching
pixels from the source image. Right now it just passes all its arguments
directly to the 'pixld' macro, but it can be used in the future to provide
a special pixel fetcher for implementing nearest-neighbor scaling.
The 'pixld_src' macro takes a lot of arguments which define its behavior. But
for each particular fast path implementation, we already know the NEON
register allocation and how many pixels are processed in a single block.
That's why a higher level macro 'fetch_src_pixblock' is also introduced
(it's easier to use because it has no arguments) and used everywhere
in 'pixman-arm-neon-asm.S' instead of the VLD instructions that load
source pixels.
This patch does not introduce any functional changes and the resulting code
in the compiled object file is exactly the same.
---
pixman/pixman-arm-neon-asm.S | 50 +++++++++++++++++++++---------------------
pixman/pixman-arm-neon-asm.h | 35 ++++++++++++++++++++++------
2 files changed, 52 insertions(+), 33 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 87b8045..029709b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -253,7 +253,7 @@
vld1.16 {d4, d5}, [DST_R, :128]!
vqadd.u8 q9, q0, q11
vshrn.u16 d6, q2, #8
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
@@ -295,7 +295,7 @@
pixman_composite_over_8888_0565_process_pixblock_tail
vst1.16 {d28, d29}, [DST_W, :128]!
vld1.16 {d4, d5}, [DST_R, :128]!
- vld4.32 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
pixman_composite_over_8888_0565_process_pixblock_head
cache_preload 8, 8
.endm
@@ -433,7 +433,7 @@ generate_composite_function \
vsri.u16 q14, q8, #5
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
@@ -478,7 +478,7 @@ generate_composite_function \
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
pixman_composite_src_0565_8888_process_pixblock_tail
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld1.16 {d0, d1}, [SRC]!
+ fetch_src_pixblock
pixman_composite_src_0565_8888_process_pixblock_head
cache_preload 8, 8
.endm
@@ -505,7 +505,7 @@ generate_composite_function \
.endm
.macro pixman_composite_add_8_8_process_pixblock_tail_head
- vld1.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
PF add PF_X, PF_X, #32
PF tst PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
@@ -537,7 +537,7 @@ generate_composite_function \
/******************************************************************************/
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
- vld1.32 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
@@ -613,7 +613,7 @@ generate_composite_function_single_scanline \
PF cmp PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
@@ -667,7 +667,7 @@ generate_composite_function_single_scanline \
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
@@ -887,7 +887,7 @@ generate_composite_function \
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
vld1.16 {d4, d5}, [DST_R, :128]!
pixman_composite_over_n_8_0565_process_pixblock_tail
- vld4.8 {d8, d9, d10, d11}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
vld1.8 {d24}, [MASK]!
pixman_composite_over_n_8_0565_process_pixblock_head
@@ -919,7 +919,7 @@ generate_composite_function \
.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
- vld1.16 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 16, 16
.endm
@@ -1065,7 +1065,7 @@ generate_composite_function \
.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- vld1.32 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
.endm
@@ -1096,7 +1096,7 @@ generate_composite_function \
.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
- vld1.32 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
vorr q0, q0, q2
vorr q1, q1, q2
cache_preload 8, 8
@@ -1395,7 +1395,7 @@ generate_composite_function \
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
vld1.8 {d24, d25, d26, d27}, [MASK]!
- vld1.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 32, 32
pixman_composite_add_8_8_8_process_pixblock_head
.endm
@@ -1448,7 +1448,7 @@ generate_composite_function \
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vld4.8 {d24, d25, d26, d27}, [MASK]!
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
pixman_composite_add_8888_8888_8888_process_pixblock_head
.endm
@@ -1517,7 +1517,7 @@ generate_composite_function_single_scanline \
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
vld4.8 {d12, d13, d14, d15}, [MASK]!
pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
@@ -1554,7 +1554,7 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
pixman_composite_over_8888_n_8888_process_pixblock_tail
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
pixman_composite_over_8888_n_8888_process_pixblock_head
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
@@ -1588,7 +1588,7 @@ generate_composite_function \
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
pixman_composite_over_8888_n_8888_process_pixblock_tail
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
vld4.8 {d12, d13, d14, d15}, [MASK]!
pixman_composite_over_8888_n_8888_process_pixblock_head
@@ -1630,7 +1630,7 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
pixman_composite_over_8888_n_8888_process_pixblock_tail
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
vld1.8 {d15}, [MASK]!
pixman_composite_over_8888_n_8888_process_pixblock_head
@@ -1662,7 +1662,7 @@ generate_composite_function \
.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
vst3.8 {d0, d1, d2}, [DST_W]!
- vld3.8 {d0, d1, d2}, [SRC]!
+ fetch_src_pixblock
cache_preload 8, 8
.endm
@@ -1692,7 +1692,7 @@ generate_composite_function \
.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
vst4.8 {d0, d1, d2, d3}, [DST_W]!
- vld3.8 {d0, d1, d2}, [SRC]!
+ fetch_src_pixblock
vswp d0, d2
cache_preload 8, 8
.endm
@@ -1731,7 +1731,7 @@ generate_composite_function \
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
vshll.u8 q14, d0, #8
- vld3.8 {d0, d1, d2}, [SRC]!
+ fetch_src_pixblock
vsri.u16 q14, q8, #5
vsri.u16 q14, q9, #11
vshll.u8 q8, d1, #8
@@ -1777,7 +1777,7 @@ generate_composite_function \
vswp d3, d31
vrshr.u16 q12, q9, #8
vrshr.u16 q13, q10, #8
- vld4.8 {d0, d1, d2, d3}, [SRC]!
+ fetch_src_pixblock
vraddhn.u16 d30, q11, q8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
@@ -1851,7 +1851,7 @@ generate_composite_function \
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
vld1.8 {d15}, [MASK]!
pixman_composite_over_0565_8_0565_process_pixblock_tail
- vld1.16 {d8, d9}, [SRC]!
+ fetch_src_pixblock
vld1.16 {d10, d11}, [DST_R, :128]!
cache_preload 8, 8
pixman_composite_over_0565_8_0565_process_pixblock_head
@@ -1903,7 +1903,7 @@ generate_composite_function \
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
vld1.8 {d15}, [MASK]!
pixman_composite_add_0565_8_0565_process_pixblock_tail
- vld1.16 {d8, d9}, [SRC]!
+ fetch_src_pixblock
vld1.16 {d10, d11}, [DST_R, :128]!
cache_preload 8, 8
pixman_composite_add_0565_8_0565_process_pixblock_head
@@ -1951,7 +1951,7 @@ generate_composite_function \
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
- vld1.8 {d15}, [SRC]!
+ fetch_src_pixblock
pixman_composite_out_reverse_8_0565_process_pixblock_tail
vld1.16 {d10, d11}, [DST_R, :128]!
cache_preload 8, 8
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index dec73d7..aa5e9bd 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -335,7 +335,7 @@ local skip1
tst DST_R, #lowbit
beq 1f
.endif
- pixld (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
@@ -397,7 +397,7 @@ local skip1
.if pixblock_size > chunk_size
tst W, #chunk_size
beq 1f
- pixld chunk_size, src_bpp, src_basereg, SRC
+ pixld_src chunk_size, src_bpp, src_basereg, SRC
pixld chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
@@ -531,6 +531,13 @@ fname:
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
/*
* Assign symbolic names to registers
*/
@@ -696,8 +703,7 @@ fname:
/* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
pixld_a pixblock_size, dst_r_bpp, \
(dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- pixld pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
+ fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
PF add PF_X, PF_X, #pixblock_size
@@ -739,8 +745,7 @@ fname:
beq 1f
pixld pixblock_size, dst_r_bpp, \
(dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- pixld pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
+ fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
process_pixblock_head
@@ -761,6 +766,9 @@ fname:
cleanup
pop {r4-r12, pc} /* exit */
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
.unreq SRC
.unreq MASK
.unreq DST_R
@@ -821,6 +829,15 @@ fname:
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
+
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
/*
* Assign symbolic names to registers
*/
@@ -857,8 +874,7 @@ fname:
/* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
pixld_a pixblock_size, dst_r_bpp, \
(dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- pixld pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
+ fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
process_pixblock_head
@@ -891,6 +907,9 @@ fname:
cleanup
bx lr /* exit */
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
.unreq SRC
.unreq MASK
.unreq DST_R
--
1.7.2.2
More information about the Pixman
mailing list