[Pixman] [PATCH 4/4] armv6: Add nearest-scaled-cover src_0565_0565 fast path
Ben Avison
bavison at riscosopen.org
Tue Aug 25 16:23:26 PDT 2015
This is adapted from the nearest scaled cover scanline fetcher, modified to
pack output data in 16-bit units. This fetcher out-performs both the fast
path defined using PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST in pixman-arm-simd.c
and the fast path in pixman-fast-path.c.
Since the two preceding patches no longer define a macroised C wrapper we
can use, and general_composite_rect is of no use to us here (we don't want to
do pixel format conversion twice), the C wrapper has been written out longhand.
Unsurprisingly, the results are similar to last year's version of the patch:
lowlevel-blt-bench -n src_0565_0565
      Before          Old patch       New patch       Change
      Mean    StdDev  Mean    StdDev  Mean    StdDev  Old     New
L1    118.6   3.12    71.0    1.32    73.5    1.18    -40.1%  -38.0%
L2    42.1    0.73    52.6    2.44    52.1    2.00    +25.1%  +23.7%
M     42.1    0.15    69.3    0.10    69.3    0.15    +64.9%  +64.8%
HT    24.4    0.35    29.2    0.33    29.5    0.24    +19.4%  +20.9%
VT    23.0    0.24    27.4    0.29    27.7    0.35    +19.3%  +20.6%
R     20.8    0.20    25.3    0.32    25.7    0.18    +21.4%  +23.2%
RT    9.1     0.25    9.3     0.24    9.7     0.24    +1.7%   +6.7%
---
pixman/pixman-arm-common.h | 9 +++++
pixman/pixman-arm-simd-asm-scaled.S | 4 ++
pixman/pixman-arm-simd-asm-scaled.h | 69 ++++++++++++++++++++++++++++++-----
pixman/pixman-arm-simd.c | 35 ++++++++++++++++++
4 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index f970868..59190f0 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -494,6 +494,15 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
FAST_PATH_X_UNIT_POSITIVE | \
FAST_PATH_Y_UNIT_ZERO)
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH(cputype,op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ cputype ## _composite_nearest_scaled_cover_ ## func \
+ }
+
#define PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER(op,s,d,func) \
{ PIXMAN_OP_ ## op, \
PIXMAN_ ## s, \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 0116889..24c1a27 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -170,6 +170,10 @@ generate_nearest_scaled_cover_function \
pixman_get_scanline_nearest_scaled_cover_x8r8g8b8_asm_armv6, 32, \
2, 3 /* prefetch distances */, nop_macro, convert_x888_8888
+generate_nearest_scaled_cover_function \
+ pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6, 16, \
+ 2, 0 /* prefetch distances */, nop_macro, nop_macro, 16
+
.macro init_ge
msr CPSR_s, #0x50000
.endm
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
index 660797d..e642e7f 100644
--- a/pixman/pixman-arm-simd-asm-scaled.h
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -94,7 +94,12 @@
.macro nearest_scaled_cover_enlarge_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store
adds ACCUM, ACCUM, UX
+ .if PIXEL_MERGE_OFFSET == 0
mov \reg, PIXEL
+ .else
+ orr \reg, \reg, PIXEL, lsl #PIXEL_MERGE_OFFSET
+ .endif
+ .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31
\store
branch cc, \exit_label, 1203f
.ifnc "\may_be_final",""
@@ -158,10 +163,20 @@
mov TMP, XHI
adds XLO, XLO, UX, lsl #16
adc XHI, XHI, UX, lsr #16
+ .if PIXEL_MERGE_OFFSET == 0
ldrx \bpp,, <\reg, [PTR]>
+ .else
+ ldrx \bpp,, <PIXEL2, [PTR]>
+ .endif
eor TMP, TMP, XHI
bics TMP, TMP, #255/\bpp
+ .if PIXEL_MERGE_OFFSET == 0
\convert \reg, TMP
+ .else
+ \convert PIXEL2, TMP
+ orr \reg, \reg, PIXEL2, lsl #PIXEL_MERGE_OFFSET
+ .endif
+ .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31
\store
branch eq, \exit_label, 1403f
subs PLDS, PLDS, #32
@@ -185,7 +200,14 @@
\inner_loop \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4>
b 1503f
.endif
+ .set PIXEL_MERGE_OFFSET, 0
+ .if out_bpp == 32
1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4>
+ .elseif out_bpp == 16
+1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <strh WK0, [DST], #2>
+ .else
+ .error "Output bits per pixel not supported"
+ .endif
1503:
.endm
@@ -206,15 +228,26 @@
\inner_loop \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4>
b 1602f
.endif
-1601: \inner_loop \bpp, WK0, \convert
+1601:
+ .set PIXEL_MERGE_OFFSET, 0
+ .rept 32 / out_bpp
+ \inner_loop \bpp, WK0, \convert
+ .endr
+ .rept 32 / out_bpp
\inner_loop \bpp, WK1, \convert
+ .endr
+ .rept 32 / out_bpp
\inner_loop \bpp, WK2, \convert
+ .endr
+ .rept 32 / out_bpp - 1
+ \inner_loop \bpp, WK3, \convert
+ .endr
\inner_loop \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}>
1602:
.endm
.macro process bpp, has_mask, inner_loop, convert
- cmp COUNT, #2 * 4 - 1 - 1 @ guaranteed at least one aligned half-cacheline output?
+ cmp COUNT, #2 * 128 / out_bpp - 1 - 1 @ guaranteed at least one aligned half-cacheline output?
blo 1706f
tst DST, #15
beq 1702f
@@ -222,16 +255,21 @@
sub COUNT, COUNT, #1
tst DST, #15
bne 1701b
-1702: sub COUNT, COUNT, #4 - 1
+1702: sub COUNT, COUNT, #128 / out_bpp - 1
+ .if \has_mask
tst MASK, #16
beq 1704f
-1703: process4 \bpp, \has_mask, 0, \inner_loop, \convert
- subs COUNT, COUNT, #4
+ .endif
+1703:
+.if \has_mask
+ process4 \bpp, \has_mask, 0, \inner_loop, \convert
+ subs COUNT, COUNT, #128 / out_bpp
bcc 1705f
+ .endif
1704: process4 \bpp, \has_mask, 1, \inner_loop, \convert
- subs COUNT, COUNT, #4
+ subs COUNT, COUNT, #128 / out_bpp
bcs 1703b
-1705: adds COUNT, COUNT, #4 - 1
+1705: adds COUNT, COUNT, #128 / out_bpp - 1
bcc 1707f
@ drop through...
1706: process1 \bpp, \has_mask, 1, \inner_loop, \convert
@@ -245,7 +283,8 @@
prefetch_distance_src_, \
prefetch_distance_mask_, \
init, \
- convert
+ convert, \
+ out_bpp_
/* void fname(uint32_t width,
* pixman_fixed_t x,
@@ -262,6 +301,11 @@ pixman_asm_function fname
*/
.set prefetch_distance_src, prefetch_distance_src_
.set prefetch_distance_mask, prefetch_distance_mask_
+ .ifc "out_bpp_",""
+ .set out_bpp, 32
+ .else
+ .set out_bpp, out_bpp_
+ .endif
/*
* Assign symbolic names to registers
@@ -273,7 +317,8 @@ XLO .req a2 @ reduce only
UX .req a3
DST .req a4
SRC .req v1
-MASK .req v2
+MASK .req v2 @ only when outputting 32bpp
+PIXEL2 .req v2 @ only when outputting <32bpp and reducing
PLDS .req v3
PIXEL .req v4 @ enlarge only
XHI .req v4 @ reduce only
@@ -292,6 +337,7 @@ TMP .req lr
blo 1807f-4
\init
mla WK2, COUNT, UX, X
+ .if out_bpp == 32
bics WK0, MASK, #31
beq 1801f
@ Use a simplified preload process for the mask,
@@ -302,6 +348,7 @@ TMP .req lr
.set OFFSET, OFFSET + 32
.endr
1801:
+ .endif
add WK0, SRC, X, lsr #16 - (log2_\bpp - 3)
bic WK0, WK0, #31
pld [WK0]
@@ -323,11 +370,13 @@ TMP .req lr
mov ACCUM, X, lsl #16
mov UX, UX, lsl #16
bic SRC, SRC, #(\bpp-1)/8
+ .if out_bpp == 32
teq MASK, #0
beq 1804f
mov VALID, #0
process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert
1804:
+ .endif
ldrx \bpp,, <PIXEL, [SRC]>
\convert PIXEL, TMP
process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert
@@ -338,9 +387,11 @@ TMP .req lr
mov XHI, X, lsr #16
mov XLO, X, lsl #16
add XHI, XHI, TMP, lsr #log2_\bpp - 3
+ .if out_bpp == 32
teq MASK, #0
beq 1806f
process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert
+ .endif
1806: process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert
1807:
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 1c6c1e9..f21bb8f 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -186,6 +186,38 @@ PIXMAN_ARM_BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, r5g6b5, uint16_t)
PIXMAN_ARM_BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, a8, uint8_t)
void
+pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6(uint32_t width,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ uint16_t *dest,
+ const uint16_t *source);
+
+static void
+armv6_composite_nearest_scaled_cover_src_0565_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst;
+ uint16_t *src_bits, *src;
+ int dst_stride, src_stride;
+ pixman_fixed_t x, y, uxx, uxy, uyy;
+
+ PIXMAN_ARM_IMAGE_GET_SCALED (src_image, src_x, src_y, uint16_t, src_stride, src_bits, x, y, uxx, uxy, uyy);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_bits + src_stride * pixman_fixed_to_int (y - pixman_fixed_e);
+ pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6 (
+ width, x - pixman_fixed_e, uxx, (uint16_t *) dst, (uint16_t *) src);
+ x += uxy;
+ y += uyy;
+ }
+}
+
+void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
uint32_t *dst,
@@ -399,6 +431,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, r5g6b5, r5g6b5, src_0565_0565),
+ PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, b5g6r5, b5g6r5, src_0565_0565),
+
PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, a8r8g8b8, r5g6b5, src_8888_0565),
PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, x8r8g8b8, r5g6b5, src_8888_0565),
PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, a8b8g8r8, b5g6r5, src_8888_0565),
--
1.7.5.4
More information about the Pixman
mailing list