[Pixman] [PATCH 33/37] armv6: Add fetcher for a8r8g8b8 bilinear-interpolation scaled images
Ben Avison
bavison at riscosopen.org
Tue Sep 9 11:51:41 PDT 2014
This fetcher is constrained to support X increments in the positive X
direction only, and it doesn't attempt to support any form of image repeat.
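
As a rough guide to what the two passes compute for each output pixel,
here is a C sketch (it mirrors the factor8 C fallback and the pass-2
weighting in the patch; the helper names are illustrative only, and
BILINEAR_INTERPOLATION_BITS is 7 in a default pixman build):

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7

    /* Pass 1: horizontal interpolation of one source scanline. Each
     * a8r8g8b8 pixel is split into AG and RB halves so that two 8-bit
     * channels are blended per multiply; results are left in
     * 8.BILINEAR_INTERPOLATION_BITS fixed point for pass 2. */
    static void
    pass1_pixel (const uint32_t *src, uint32_t x, /* x: 16.16 fixed point */
                 uint32_t *ag, uint32_t *rb)
    {
        uint32_t l   = src[x >> 16];
        uint32_t r   = src[(x >> 16) + 1];
        uint32_t lag = (l >> 8) & 0xff00ff, rag = (r >> 8) & 0xff00ff;
        uint32_t lrb = l & 0xff00ff,        rrb = r & 0xff00ff;
        uint32_t dx  = (x & 0xffff) >> (16 - BILINEAR_INTERPOLATION_BITS);

        *ag = (lag << BILINEAR_INTERPOLATION_BITS) + dx * (rag - lag);
        *rb = (lrb << BILINEAR_INTERPOLATION_BITS) + dx * (rrb - lrb);
    }

    /* Pass 2: vertical interpolation of one channel between the two
     * pass-1 scanlines. dist_y weights the second line and is
     * pre-scaled so that the two weights sum to
     * 1 << (16 - BILINEAR_INTERPOLATION_BITS), making the final
     * narrowing a plain >> 16. */
    static uint8_t
    pass2_channel (int top, int bottom, int dist_y)
    {
        int wt = (1 << (16 - BILINEAR_INTERPOLATION_BITS)) - dist_y;

        return (uint8_t) ((top * wt + bottom * dist_y) >> 16);
    }

Keeping the pass-1 output in fixed point is what lets pass 2 finish with
a single multiply-accumulate (SMUAD) and one shift per channel.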
Here are some affine-bench results for a variety of horizontal and vertical
scaling factors.
Before:
             x increment
              0.5    0.75   1.0    1.5    2.0
y increment
 0.5          7.1    6.9    6.8    6.6    6.3
 0.75         6.4    6.2    6.1    5.8    5.5
 1.0          5.9    5.7           5.2    4.9
 1.5          5.0    4.8    4.6    4.3    4.0
 2.0          4.4    4.2    4.0    3.7    3.4
After:
             x increment
              0.5    0.75   1.0    1.5    2.0
y increment
 0.5          21.0   19.6   19.2   20.2   18.9
 0.75         18.0   16.6   16.1   17.1   15.9
 1.0          21.8   18.9          19.9   17.7
 1.5          12.8   11.3   10.9   11.8   10.7
 2.0          10.7   9.3    8.9    9.8    8.8
Improvement:
             x increment
              0.5      0.75     1.0      1.5      2.0
y increment
 0.5          +196.7%  +183.6%  +181.8%  +206.6%  +198.4%
 0.75         +182.2%  +166.2%  +164.0%  +194.8%  +185.8%
 1.0          +271.7%  +234.4%           +282.7%  +257.9%
 1.5          +154.6%  +135.3%  +134.3%  +173.3%  +164.8%
 2.0          +144.1%  +124.2%  +123.3%  +165.6%  +155.5%
---
pixman/pixman-arm-common.h | 14 ++
pixman/pixman-arm-simd-asm-scaled.S | 352 +++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd-asm-scaled.h | 339 +++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 229 +++++++++++++++++++++++
4 files changed, 934 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 2ddcbbc..9b5cefb 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -478,6 +478,20 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
cputype ## _composite_nearest_scaled_cover_ ## func \
}
+#define PIXMAN_ARM_BILINEAR_AFFINE_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT | \
+ FAST_PATH_BILINEAR_FILTER | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM)
+
+#define PIXMAN_ARM_BILINEAR_SCALED_COVER_FLAGS \
+ (PIXMAN_ARM_BILINEAR_AFFINE_FLAGS | \
+ FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR | \
+ FAST_PATH_X_UNIT_POSITIVE | \
+ FAST_PATH_SCALE_TRANSFORM) /* implies FAST_PATH_Y_UNIT_ZERO */
+
#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
void \
pixman_get_scanline_##name##_asm_##cputype (int32_t w, \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 2606e64..85ca212 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -203,3 +203,355 @@ generate_nearest_scaled_cover_function \
2, 3 /* prefetch distances */, nop_macro, convert_8_8888
/******************************************************************************/
+
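+/* Split a 32-bit ARGB pixel into two registers of halfword-packed
+ * channels: UXTB16 extracts bytes 0 and 2 (blue and red) into \rb,
+ * and with ror #8 extracts bytes 1 and 3 (green and alpha) into
+ * \in_ag. */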
+.macro convert_8888_08080808 in_ag, rb
+ uxtb16 \rb, \in_ag
+ uxtb16 \in_ag, \in_ag, ror #8
+.endm
+
+generate_bilinear_scaled_cover_functions 32, a8r8g8b8, 3, 3, 3, 3, 3, 3, 3, 3, nop_macro, convert_8888_08080808
+
+/******************************************************************************/
+
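+/* Vertically interpolate one output pixel. \t0/\t1 hold one line's
+ * AG/RB halfword pairs and \b0/\b1 the other line's, as produced by
+ * pass 1. \mul packs the second line's weight in its low halfword and
+ * the complementary weight in its high halfword, so each SMUAD forms
+ * the weighted sum for one channel; the PKHTB/PKHBT shuffles first
+ * pair each channel with its counterpart from the other line, and the
+ * final SEL repacks the four 8-bit results into \d. */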
+.macro pass2_1pixel_internal t0, t1, b0, b1, tmp, mul, d
+ pkhtb \tmp, \t0, \b0, asr #16
+ pkhbt \t0, \b0, \t0, lsl #16
+ pkhtb \b0, \t1, \b1, asr #16
+ pkhbt \t1, \b1, \t1, lsl #16
+ smuad \t0, \t0, \mul
+ smuad \tmp, \tmp, \mul
+ smuad \t1, \t1, \mul
+ smuad \b0, \b0, \mul
+ mov \tmp, \tmp, lsl #8
+ pkhtb \t0, \tmp, \t0, asr #8
+ pkhtb \t1, \b0, \t1, asr #16
+ sel \d, \t1, \t0
+.endm
+
+.macro pass2_1pixel check_src_thresh
+ ldrd v3, v4, [SRC, #32]
+ ldrd v1, v2, [SRC], #8
+ .if \check_src_thresh
+ tst SRC, #31
+ addeq SRC, SRC, #32
+ .endif
+ pass2_1pixel_internal v1, v2, v3, v4, ip, DIST, v1
+ str v1, [DST], #4
+.endm
+
+.macro pass2_preload n, do_preload, pos
+ .ifnc "\do_preload",""
+ /* Minimum distance is 32 because we read the bottom-row pixels from
+ * the cacheline after the one being pointed at */
+ .set dist, 32 + prefetch_distance*64
+ .if \n == 1 || \n == 2
+ /* Stretch one cacheline further ahead in these cases to ensure
+ * we stop preloading at boundary between groups of 4 output pixels */
+ .set dist, dist + 32
+ .endif
+ .if \pos == \n
+ /* Source pointer has just skipped "bottom" cacheline */
+ pld [SRC, #dist]
+ .elseif ((\pos) & 1) == ((\n) & 1)
+ /* Time-equidistant between skips, but pointer is half-way through cacheline */
+ pld [SRC, #16 + dist]
+ .endif
+ .endif
+.endm
+
+.macro pass2_4pixels n, do_preload
+ ldrd v3, v4, [SRC, #32]
+ ldrd v1, v2, [SRC], #8 + 32*-(\n==1)
+ pass2_preload \n, \do_preload, 1
+ ldrd v7, v8, [SRC, #32]
+ ldrd v5, v6, [SRC], #8 + 32*-(\n==2)
+ pass2_preload \n, \do_preload, 2
+ pass2_1pixel_internal v1, v2, v3, v4, ip, DIST, OUT0
+ pass2_1pixel_internal v5, v6, v7, v8, ip, DIST, OUT1
+ ldrd v5, v6, [SRC, #32]
+ ldrd v3, v4, [SRC], #8 + 32*-(\n==3)
+ pass2_preload \n, \do_preload, 3
+ ldrd v7, v8, [SRC, #32]
+ ldr lr, [SRC], #4
+ pass2_1pixel_internal v3, v4, v5, v6, ip, DIST, OUT2
+ ldr v6, [SRC], #4 + 32*-(\n==0)
+ pass2_preload \n, \do_preload, 0
+ pass2_1pixel_internal lr, v6, v7, v8, ip, DIST, OUT3
+ stmia DST!, {OUT0, OUT1, OUT2, OUT3}
+.endm
+
+.macro pass2 n
+ .if \n == 1 || \n == 2
+ cmp ip, #(prefetch_distance+1)*4
+ movcs ip, #(prefetch_distance+1)*4
+ .else
+ cmp ip, #prefetch_distance*4
+ movcs ip, #prefetch_distance*4
+ .endif
+ subs ip, ip, #4
+ bcc 2f
+1: pld [v1]
+ pld [v1, #32]
+ add v1, v1, #64
+ subs ip, ip, #4
+ bcs 1b
+2: sub DIST, DIST, DIST, lsl #16
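+ @ GE flags = 0101, so SEL takes bytes 0 and 2 from Rn, bytes 1 and 3 from Rm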
+ msr cpsr_s, #0x50000
+ add DIST, DIST, #1 << (32-BILINEAR_INTERPOLATION_BITS)
+ @ top half of DIST now holds complementary weight
+ .if \n != 0
+3: pass2_1pixel 0
+ subs COUNT, COUNT, #1
+ bmi 99f
+ tst DST, #15
+ bne 3b
+ .endif
+ subs COUNT, COUNT, #4-1
+ bmi 8f
+ @ pixels_remaining_minus_4 = COUNT
+ @ pixels_done = (4-n)&3
+ @ preloads_done = at most, (prefetch_distance + 1 + (n==2 | n==3)) * 2
+ @ total_preloads = (1 + (pixels - 1) / 4) * 2
+ @ so if n is 0 or 3, then
+ @ preloads_to_do_minus_1 = (COUNT + pixels_done - 1 - prefetch_distance*4) / 4 * 2
+ @ or if n is 1 or 2, then 2 fewer than that
+ .set adjust, ((4-\n)&3) - 1 - (prefetch_distance - (\n==1) - (\n==2)) * 4
+ adds COUNT, COUNT, #adjust
+ bmi 6f
+5: pass2_4pixels \n, do_preload
+ subs COUNT, COUNT, #4
+ bpl 5b
+6:
+ .if adjust > -4
+ subs COUNT, COUNT, #adjust
+ bmi 8f // have to handle the possibility there are no groups of 4 without preloads
+ .else
+ sub COUNT, COUNT, #adjust
+ .endif
+7: pass2_4pixels \n
+ subs COUNT, COUNT, #4
+ bpl 7b
+8: adds COUNT, COUNT, #4-1
+ bmi 99f
+9: pass2_1pixel (\n != 0)
+ subs COUNT, COUNT, #1
+ bpl 9b
+99: pop {v1-v8,pc}
+.endm
+
+/* void
+ * pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6 (
+ * uint32_t width,
+ * int16_t dist_y,
+ * uint32_t *dest,
+ * int16_t *source)
+ *
+ * This version is used when the output scanline falls between two
+ * different input scanlines
+ */
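+/* The two pass-1 line buffers are interleaved in 32-byte (4-pixel)
+ * blocks: a block of one line at SRC has the matching block of the
+ * other line at SRC+32. Hence pixels are fetched in pairs 32 bytes
+ * apart, and SRC skips an extra 32 bytes whenever it crosses a
+ * cacheline boundary. */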
+pixman_asm_function pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6
+COUNT .req a1
+DIST .req a2
+DST .req a3
+SRC .req a4
+OUT0 .req v1
+OUT1 .req v2
+OUT2 .req v3
+OUT3 .req v5 @ avoid register-lock of last STM register against following LDM
+.set prefetch_distance, 2
+ push {v1-v8,lr}
+ subs COUNT, COUNT, #1
+ bmi 99f
+ movs ip, DST, lsl #29
+ add v1, SRC, #64
+ mov ip, COUNT
+ pld [SRC] @ already cacheline-aligned
+ pld [SRC, #32]
+ @ total_preloads = (1 + (pixels - 1) / 4) * 2
+ @ initial_preloads = at most, (prefetch_distance + 1 + (n==2 | n==3)) * 2
+ bhi 13f
+ bcs 12f
+ bne 11f
+10: pass2 0
+11: pass2 1
+12: pass2 2
+13: pass2 3
+
+.unreq COUNT
+.unreq DST
+.unreq DIST
+.unreq SRC
+.unreq OUT0
+.unreq OUT1
+.unreq OUT2
+.unreq OUT3
+.endfunc
+
+/******************************************************************************/
+
+.macro pass2a_1pixel check_src_thresh
+ ldmia SRC!, {AG0, RB0}
+ .if \check_src_thresh
+ tst SRC, #31
+ .if prefetch_distance == 0
+ bne 20f
+ teq COUNT, #0
+ beq 20f
+ add SRC, SRC, #32
+ pld [SRC]
+20:
+ .else
+ addeq SRC, SRC, #32
+ .endif
+ .endif
+ mov AG0, AG0, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB0, RB0, lsr #BILINEAR_INTERPOLATION_BITS
+ sel OUT0, RB0, AG0
+ str OUT0, [DST], #4
+.endm
+
+.macro pass2a_4pixels n, do_preload
+ .if \n == 0
+ ldmia SRC!, {AG0, RB0, AG1, RB1, AG2, RB2, AG3, RB3}
+ .elseif \n == 1
+ ldmia SRC!, {AG0, RB0}
+ .elseif \n == 2
+ ldmia SRC!, {AG0, RB0, AG1, RB1}
+ .else // \n == 3
+ ldmia SRC!, {AG0, RB0, AG1, RB1, AG2, RB2}
+ .endif
+ add SRC, SRC, #32
+ .ifnc "\do_preload",""
+ pld [SRC, #prefetch_distance*64]
+ .endif
+ .if \n == 1
+ ldmia SRC!, {AG1, RB1, AG2, RB2, AG3, RB3}
+ .elseif \n == 2
+ ldmia SRC!, {AG2, RB2, AG3, RB3}
+ .elseif \n == 3
+ ldmia SRC!, {AG3, RB3}
+ .endif
+ mov AG0, AG0, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB0, RB0, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG1, AG1, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB1, RB1, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG2, AG2, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB2, RB2, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG3, AG3, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB3, RB3, lsr #BILINEAR_INTERPOLATION_BITS
+ sel OUT0, RB0, AG0
+ sel OUT1, RB1, AG1
+ sel OUT2, RB2, AG2
+ sel OUT3, RB3, AG3
+ stmia DST!, {OUT0, OUT1, OUT2, OUT3}
+.endm
+
+.macro pass2a n
+ .if \n != 0
+3: pass2a_1pixel 0
+ subs COUNT, COUNT, #1
+ bmi 99f
+ tst DST, #15
+ bne 3b
+ .endif
+ subs COUNT, COUNT, #4-1
+ bmi 8f
+ @ pixels_remaining_minus_4 = COUNT
+ @ pixels_done = (4-n)&3
+ @ preloads_done = at most, prefetch_distance + 1
+ @ total_preloads = 1 + (pixels - 1) / 4
+ @ so preloads_to_do_minus_1 = (COUNT + pixels_done - 1 - prefetch_distance*4) / 4
+ .set adjust, ((4-\n)&3) - 1 - prefetch_distance*4
+ adds COUNT, COUNT, #adjust
+ bmi 6f
+5: pass2a_4pixels \n, do_preload
+ subs COUNT, COUNT, #4
+ bpl 5b
+6:
+ .if adjust > -4
+ subs COUNT, COUNT, #adjust
+ bmi 8f // have to handle the possibility there are no groups of 4 without preloads
+ .else
+ sub COUNT, COUNT, #adjust
+ .endif
+7: pass2a_4pixels \n
+ subs COUNT, COUNT, #4
+ bpl 7b
+8: adds COUNT, COUNT, #4-1
+ bmi 99f
+9: pass2a_1pixel (\n != 0)
+ subs COUNT, COUNT, #1
+ bpl 9b
+99: pop {v1-v6,pc}
+.endm
+
+/* void
+ * pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6 (
+ * uint32_t width,
+ * uint32_t *dest,
+ * int16_t *source)
+ *
+ * This version is used when the output scanline coincides
+ * exactly with an input scanline
+ */
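+/* Only one of the interleaved line buffers is read here, and each
+ * 8.BILINEAR_INTERPOLATION_BITS intermediate merely needs narrowing
+ * back to 8 bits per channel: the AG lanes are shifted up and the RB
+ * lanes down so the wanted bytes line up, then SEL (GE = 0101) merges
+ * them into one ARGB word. */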
+pixman_asm_function pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6
+COUNT .req a1
+DST .req a2
+SRC .req a3
+AG0 .req a4
+RB0 .req v1
+AG1 .req v2
+RB1 .req v3
+AG2 .req v4
+RB2 .req v5
+AG3 .req v6
+RB3 .req ip
+OUT0 .req a4
+OUT1 .req v2
+OUT2 .req v4
+OUT3 .req lr @ avoid register-lock of last STM register against following LDM
+.set prefetch_distance, 2
+ push {v1-v6,lr}
+ subs COUNT, COUNT, #1
+ bcc 99f
+ @ total_preloads = 1 + (pixels - 1) / 4
+ @ initial_preloads = at most, prefetch_distance + 1
+ mov ip, #prefetch_distance*4
+ cmp COUNT, #prefetch_distance*4
+ pld [SRC] @ already cacheline-aligned
+ movcc ip, COUNT
+ add a4, SRC, #64
+ subs ip, ip, #4
+ bcc 2f
+1: pld [a4]
+ add a4, a4, #64
+ subs ip, ip, #4
+ bcs 1b
+2: msr cpsr_s, #0x50000
+ movs ip, DST, lsl #29
+ bhi 13f
+ bcs 12f
+ bne 11f
+10: pass2a 0
+11: pass2a 1
+12: pass2a 2
+13: pass2a 3
+
+.unreq COUNT
+.unreq DST
+.unreq SRC
+.unreq AG0
+.unreq RB0
+.unreq AG1
+.unreq RB1
+.unreq AG2
+.unreq RB2
+.unreq AG3
+.unreq RB3
+.unreq OUT0
+.unreq OUT1
+.unreq OUT2
+.unreq OUT3
+.endfunc
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
index 66d2e12..3ed6c55 100644
--- a/pixman/pixman-arm-simd-asm-scaled.h
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -23,6 +23,8 @@
* Author: Ben Avison (bavison at riscosopen.org)
*/
+#include "pixman-private.h" // for BILINEAR_INTERPOLATION_BITS
+
.set log2_32, 5
.set log2_16, 4
.set log2_8, 3
@@ -414,5 +416,342 @@ TMP .req lr
.endfunc
.endm
+/******************************************************************************/
+
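+@ Horizontal linear blend between adjacent samples \in and \nin:
+@ out = (in << BILINEAR_INTERPOLATION_BITS) + dist_x * (nin - in),
+@ where dist_x is the top BILINEAR_INTERPOLATION_BITS bits of the
+@ fractional-x accumulator. UX is reloaded from the stack because it
+@ shares a register with DIST/TMP; the final ADDS advances the
+@ accumulator, and the carry out tells the caller whether to step one
+@ extra source pixel before producing the next output.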
+.macro bilinear_scaled_cover_process_horizontal format, factor, in, nin, out, size, exit
+ mov DIST, ACCUM, lsr #32 - BILINEAR_INTERPOLATION_BITS
+ sub AG_OUT\out, AG_IN\nin, AG_IN\in
+ sub RB_OUT\out, RB_IN\nin, RB_IN\in
+ mul AG_OUT\out, AG_OUT\out, DIST
+ mul RB_OUT\out, RB_OUT\out, DIST
+ ldr UX, [sp]
+ add AG_OUT\out, AG_OUT\out, AG_IN\in, lsl #BILINEAR_INTERPOLATION_BITS
+ add RB_OUT\out, RB_OUT\out, RB_IN\in, lsl #BILINEAR_INTERPOLATION_BITS
+ .if \size == 1
+ stmia DST!, {AG_OUT0, RB_OUT0}
+ subs COUNT, COUNT, #1
+ bmi .L\format\()_factor\factor\()_\exit
+ .elseif \out
+ stmia DST!, {AG_OUT0, RB_OUT0, AG_OUT1, RB_OUT1}
+ tst DST, #16
+ addeq DST, DST, #32
+ subeqs COUNT, COUNT, #4
+ bmi .L\format\()_factor\factor\()_\exit
+ .endif
+ adds ACCUM, ACCUM, UX
+.endm
+
+.macro bilinear_scaled_cover_innerloop format, factor, convert, in, nin, out, nout, size, exit, dropthrough
+ .if \factor == 0
+
+ .L\format\()_factor\factor\()_\in\out\()_cs:
+ .if bpp == 32
+ ldr AG_IN\nin, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN\nin, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN\nin, [SRC, #1]!
+ .endif
+ tst SRC, #31
+ bne .L\format\()_factor\factor\()_\in\out\()_cs_skip
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_\in\out\()_cs_skip
+ pld [SRC, #prefetch_distance*32]
+ .L\format\()_factor\factor\()_\in\out\()_cs_skip:
+ \convert AG_IN\nin, RB_IN\nin
+ .L\format\()_factor\factor\()_\in\out\()_cc:
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ bcc .L\format\()_factor\factor\()_\in\nout\()_cc
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_\nin\nout\()_cs
+ .endif
+
+ .elseif \factor == 1
+
+ .L\format\()_factor\factor\()_\in\out\()_cc:
+ .if bpp == 32
+ ldr AG_IN\nin, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN\nin, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN\nin, [SRC, #1]!
+ .endif
+ tst SRC, #31
+ .if \in == 1
+ bne .L\format\()_factor\factor\()_\in\out\()_cc_skip
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_\in\out\()_cc_skip
+ pld [SRC, #prefetch_distance*32]
+ .L\format\()_factor\factor\()_\in\out\()_cc_skip:
+ \convert AG_IN\nin, RB_IN\nin
+ .else
+ bne .L\format\()_factor\factor\()_0\out\()_converge
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_converge
+ pld [SRC, #prefetch_distance*32]
+ b .L\format\()_factor\factor\()_0\out\()_converge
+ .L\format\()_factor\factor\()_0\out\()_cs:
+ orr TMP, SRC, #31
+ .if bpp == 32
+ ldmib SRC!, {AG_IN0, AG_IN1}
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC, #2]!
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC, #1]!
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ cmp SRC, TMP
+ subgts PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_cs_skip
+ pld [TMP, #1+prefetch_distance*32]
+ .L\format\()_factor\factor\()_0\out\()_cs_skip:
+ \convert AG_IN0, RB_IN0
+ .L\format\()_factor\factor\()_0\out\()_converge:
+ \convert AG_IN1, RB_IN1
+ .endif
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ bcs .L\format\()_factor\factor\()_0\nout\()_cs
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_\nin\nout\()_cc
+ .endif
+
+ .else // \factor >= 2
+
+ .L\format\()_factor\factor\()_0\out\():
+ orr TMP, SRC, #31
+ .if bpp == 32
+ ldrcs AG_IN0, [SRC, #\factor*4]!
+ ldrcc AG_IN0, [SRC, #(\factor-1)*4]!
+ ldr AG_IN1, [SRC, #4]!
+ .elseif bpp == 16
+ ldrcsh AG_IN0, [SRC, #\factor*2]!
+ ldrcch AG_IN0, [SRC, #(\factor-1)*2]!
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrcsb AG_IN0, [SRC, #\factor]!
+ ldrccb AG_IN0, [SRC, #(\factor-1)]!
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ cmp SRC, TMP
+ subgts PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_skip
+ pld [TMP, #1+prefetch_distance*32]
+ .L\format\()_factor\factor\()_0\out\()_skip:
+ \convert AG_IN0, RB_IN0
+ \convert AG_IN1, RB_IN1
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_0\nout\()
+ .endif
+
+ .endif
+.endm
+
+.macro generate_bilinear_scaled_cover_function fname, \
+ bpp_, \
+ format, \
+ factor, \
+ prefetch_distance_, \
+ init, \
+ convert
+
+/* void fname(uint32_t width,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * uint32_t *dest,
+ * const void *source);
+ */
+pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set bpp, \bpp_
+ .set prefetch_distance, \prefetch_distance_
+
+/*
+ * Assign symbolic names to registers
+ */
+COUNT .req a1
+X .req a2
+ACCUM .req a2
+UX .req a3
+DIST .req a3
+TMP .req a3
+DST .req a4
+AG_IN0 .req v1
+RB_IN0 .req v2
+AG_IN1 .req v3
+RB_IN1 .req v4
+AG_OUT .req v5
+RB_OUT .req v6
+AG_OUT0 .req v5
+RB_OUT0 .req v6
+AG_OUT1 .req v7
+RB_OUT1 .req v8
+SRC .req ip
+PLDS .req lr
+
+ push {v1-v8,lr}
+ subs COUNT, COUNT, #1
+ bmi 99f
+
+ mla v1, UX, COUNT, X @ v1 = X for final pixel (memory pipeline still busy with push)
+ ldr v2, [sp, #9*4] @ get source from stack
+ add SRC, v2, X, lsr #16 - (log2_\bpp_ - 3)
+ bic v3, SRC, #31
+ add v2, v2, v1, lsr #16 - (log2_\bpp_ - 3)
+ pld [v3]
+ bic SRC, SRC, #bpp/8-1
+ .if \factor >= 2
+ @ In these cases we point at the 2nd input sample when we enter the main loop
+ add v1, SRC, #bpp/8
+ bic v1, v1, #31
+ add v1, v1, #prefetch_distance*32
+ .else
+ add v1, v3, #prefetch_distance*32
+ .endif
+ add v2, v2, #bpp/8 @ v2 -> second input sample for final pixel
+ subs PLDS, v2, v1
+ movcc v1, v2
+2: add v3, v3, #32
+ cmp v3, v1
+ bhi 3f
+ pld [v3]
+ b 2b
+3: @ Add 1 to PLDS so that subs PLDS,PLDS,#32 sets GT whenever a preload is to be done
+ add PLDS, PLDS, #1
+ mov UX, UX, lsl #16
+ mov ACCUM, X, lsl #16
+ push {UX}
+ \init
+
+ .if \factor >= 2
+ .if bpp == 32
+ ldr AG_IN0, [SRC]
+ ldr AG_IN1, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC]
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC]
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ \convert AG_IN0, RB_IN0
+ \convert AG_IN1, RB_IN1
+ subs COUNT, COUNT, #4-1
+ bmi .L\format\()_factor\factor\()_narrow
+ bilinear_scaled_cover_process_horizontal \format, \factor, 0, 1, 0, 4, unused
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 1, 0, 4, trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 0, 1, 4, trailing
+ .L\format\()_factor\factor\()_narrow:
+ add COUNT, COUNT, #4-1
+ bilinear_scaled_cover_process_horizontal \format, \factor, 0, 1, , 1, done
+ b .L\format\()_factor\factor\()_0
+ .L\format\()_factor\factor\()_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, , , 1, done
+ .else
+ .if bpp == 32
+ ldr AG_IN0, [SRC]
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC]
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC]
+ .endif
+ \convert AG_IN0, RB_IN0
+ subs COUNT, COUNT, #4-1
+ addmi COUNT, COUNT, #4-1
+ .if \factor == 0
+ bmi .L\format\()_factor\factor\()_0_cs
+ .elseif \factor == 1
+ bmi .L\format\()_factor\factor\()_0_cc
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 0, 1, 4, 0_trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, 1, 0, 4, 1_trailing
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 1, 0, 4, 0_trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, 0, 1, 4, 1_trailing
+ .L\format\()_factor\factor\()_0_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ .if \factor == 0
+ bcc .L\format\()_factor\factor\()_0_cc
+ .elseif \factor == 1
+ bcs .L\format\()_factor\factor\()_0_cs
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, , , 1, done
+ .L\format\()_factor\factor\()_1_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ .if \factor == 0
+ bcc .L\format\()_factor\factor\()_1_cc
+ .elseif \factor == 1
+ bcs .L\format\()_factor\factor\()_0_cs
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, , , 1, done
+ .endif
+ .L\format\()_factor\factor\()_done:
+98: pop {UX,v1-v8,pc}
+99: pop {v1-v8,pc}
+
+.unreq COUNT
+.unreq X
+.unreq ACCUM
+.unreq UX
+.unreq DIST
+.unreq TMP
+.unreq DST
+.unreq AG_IN0
+.unreq RB_IN0
+.unreq AG_IN1
+.unreq RB_IN1
+.unreq AG_OUT0
+.unreq RB_OUT0
+.unreq AG_OUT1
+.unreq RB_OUT1
+.unreq SRC
+.unreq PLDS
+.endfunc
+.endm
+
+.macro generate_bilinear_scaled_cover_functions bpp, \
+ format, \
+ pd0, pd1, pd2, pd3, pd4, pd5, pd6, pd7, \
+ init, \
+ convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor0_asm_armv6, \
+ \bpp, \format, 0, \pd0, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor1_asm_armv6, \
+ \bpp, \format, 1, \pd1, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor2_asm_armv6, \
+ \bpp, \format, 2, \pd2, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor3_asm_armv6, \
+ \bpp, \format, 3, \pd3, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor4_asm_armv6, \
+ \bpp, \format, 4, \pd4, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor5_asm_armv6, \
+ \bpp, \format, 5, \pd5, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor6_asm_armv6, \
+ \bpp, \format, 6, \pd6, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor7_asm_armv6, \
+ \bpp, \format, 7, \pd7, \init, \convert
+.endm
+
+/******************************************************************************/
+
.macro nop_macro x:vararg
.endm
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index e69216a..f783c17 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -37,6 +37,9 @@
#define ALIGN(addr) \
((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+#define ALIGN32(addr) \
+ ((uint8_t *)((((uintptr_t)(addr)) + 31) & (~31)))
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
@@ -191,6 +194,224 @@ BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, x8r8g8b8, uint32_t)
BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, r5g6b5, uint16_t)
BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8, uint8_t)
+typedef void (*bilinear_pass1_t) (uint32_t width,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int16_t *dest,
+ const void *source);
+
+#ifndef __STDC_VERSION__
+#define FLEXIBLE 1
+#else
+#if __STDC_VERSION__ >= 199901L /* struct hack is illegal in C99, use flexible array member */
+#define FLEXIBLE
+#else
+#define FLEXIBLE 1
+#endif
+#endif
+
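+/* Iterator state: line_buffer holds pass-1 intermediates for up to two
+ * source scanlines, interleaved in 32-byte blocks; line_y[] records
+ * which source line currently occupies each half (-1 if none). */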
+typedef struct
+{
+ bilinear_pass1_t pass1;
+ int line_y[2];
+ int16_t *line_buffer;
+ pixman_fixed_t x;
+ pixman_fixed_t y;
+ int stride;
+ uint8_t data[FLEXIBLE];
+} bilinear_info_t;
+
+static void
+armv6_get_scanline_bilinear_fini (pixman_iter_t *iter)
+{
+ free (iter->data);
+}
+
+void
+pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6 (
+ uint32_t width,
+ int16_t dist_y,
+ uint32_t *dest,
+ int16_t *source);
+
+void
+pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6 (
+ uint32_t width,
+ uint32_t *dest,
+ int16_t *source);
+
+#define BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(cputype, name, type) \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor0_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor1_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor2_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor3_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor4_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor5_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor6_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor7_asm_##cputype (); \
+ \
+static void \
+cputype##_get_scanline_bilinear_scaled_cover_pass1_##name##_factor8 ( \
+ uint32_t width, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ int16_t *dest, \
+ const void *source) \
+{ \
+ /* The preload scheme used by the assembly version relies on the \
+ * reduction factor being less than 8x. Fall back to C. */ \
+ while (width--) \
+ { \
+ uint32_t lag, rag, lrb, rrb, dist_x, ag, rb; \
+ cputype##_convert_adjacent_##name (source, pixman_fixed_to_int (x), \
+ &lag, &rag, &lrb, &rrb); \
+ dist_x = (x & 0xFFFF) >> (16 - BILINEAR_INTERPOLATION_BITS); \
+ ag = (lag << BILINEAR_INTERPOLATION_BITS) + dist_x * (rag - lag); \
+ rb = (lrb << BILINEAR_INTERPOLATION_BITS) + dist_x * (rrb - lrb); \
+ *(uint32_t *)(dest+0) = ag; \
+ *(uint32_t *)(dest+2) = rb; \
+ dest += 4; \
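+        /* The two line buffers are interleaved in 32-byte blocks, \
+         * so on reaching a block boundary skip the other line's \
+         * block */ \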
+ if (((uintptr_t) dest & 31) == 0) \
+ dest += 4*4; \
+ x += ux; \
+ } \
+} \
+ \
+static void \
+cputype##_get_scanline_bilinear_init_##name (pixman_iter_t *iter, \
+ const pixman_iter_info_t *iter_info) \
+{ \
+ int width = iter->width; \
+ bilinear_info_t *info; \
+ int stride; \
+ type *bits; \
+ pixman_fixed_t x, y, uxx, uxy, uyy; \
+ \
+ PIXMAN_IMAGE_GET_SCALED (iter->image, iter->x, iter->y, type, \
+ stride, bits, x, y, uxx, uxy, uyy); \
+ (void) bits; \
+ (void) uxy; \
+ (void) uyy; \
+ \
+ info = malloc (offsetof(bilinear_info_t, data) + 31 + \
+ (width + 3) / 4 * sizeof (int16_t)*4*4 * 2); \
+ if (!info) \
+ { \
+ /* In this case, we don't guarantee any particular rendering. */ \
+ _pixman_log_error ( \
+ FUNC, "Allocation failure, skipping rendering\n"); \
+ \
+ iter->get_scanline = _pixman_iter_get_scanline_noop; \
+ iter->fini = NULL; \
+ iter->data = NULL; \
+ } \
+ else \
+ { \
+ static const bilinear_pass1_t routines[9] = { \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor0_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor1_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor2_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor3_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor4_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor5_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor6_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor7_asm_##cputype, \
+ cputype##_get_scanline_bilinear_scaled_cover_pass1_##name##_factor8 \
+ }; \
+ uxx >>= 16; \
+ if (uxx >= 8) \
+ uxx = 8; \
+ info->pass1 = routines[uxx]; \
+ \
+ /* It is safe to set the y coordinates to -1 initially \
+ * because COVER_CLIP_BILINEAR ensures that we will only \
+ * be asked to fetch lines in the [0, height) interval \
+ */ \
+ info->line_y[0] = -1; \
+ info->line_y[1] = -1; \
+ \
+ info->line_buffer = (int16_t *) ALIGN32 (info->data); \
+ info->x = x - pixman_fixed_1 / 2; \
+ info->y = y - pixman_fixed_1 / 2; \
+ info->stride = stride; \
+ \
+ iter->fini = cputype##_get_scanline_bilinear_fini; \
+ iter->data = info; \
+ } \
+} \
+ \
+static uint32_t * \
+cputype##_get_scanline_bilinear_scaled_cover_##name (pixman_iter_t *iter, \
+ const uint32_t *mask) \
+{ \
+ bilinear_info_t *info = iter->data; \
+ int y0 = pixman_fixed_to_int (info->y); \
+ int y1 = y0 + 1; \
+ int i = y0 & 1; \
+ int width = iter->width; \
+ pixman_fixed_t fx = info->x; \
+ pixman_fixed_t ux = iter->image->common.transform->matrix[0][0]; \
+ int16_t *buffer = info->line_buffer; \
+ type *bits = (type *)iter->image->bits.bits; \
+ int stride = info->stride; \
+ uint32_t *out = iter->buffer; \
+ int32_t dist_y; \
+ \
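+    /* Extract the fractional part of y as the weight of the second \
+     * line buffer, pre-scaled so that the weight and its complement \
+     * sum to 1 << (16 - BILINEAR_INTERPOLATION_BITS), as the pass-2 \
+     * SMUAD weighting expects */ \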
+ dist_y = (info->y >> BILINEAR_INTERPOLATION_BITS) & \
+ ((0x10000 >> BILINEAR_INTERPOLATION_BITS) - \
+ (0x10000 >> (2 * BILINEAR_INTERPOLATION_BITS))); \
+ if (i) \
+ { \
+ /* Invert weight if upper scanline is in second buffer */ \
+ dist_y = (0x10000 >> BILINEAR_INTERPOLATION_BITS) - dist_y; \
+ } \
+ info->y += iter->image->common.transform->matrix[1][1]; \
+ \
+ if (info->line_y[i] != y0) \
+ { \
+ info->pass1 (width, fx, ux, buffer + 4*4*i, bits + stride * y0); \
+ info->line_y[i] = y0; \
+ } \
+ \
+ if (dist_y & ((0x10000 >> BILINEAR_INTERPOLATION_BITS) - \
+ (0x10000 >> (2 * BILINEAR_INTERPOLATION_BITS)))) \
+ { \
+ if (info->line_y[!i] != y1) \
+ { \
+ info->pass1 (width, fx, ux, buffer + 4*4*!i, bits + stride * y1); \
+ info->line_y[!i] = y1; \
+ } \
+ \
+ pixman_get_scanline_bilinear_scaled_cover_pass2_asm_##cputype ( \
+ width, dist_y, out, buffer); \
+ } \
+ else \
+ { \
+ pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_##cputype ( \
+ width, out, buffer + 4*4*i); \
+ } \
+ \
+ return out; \
+}
+
+static inline void armv6_convert_adjacent_a8r8g8b8 (const void *void_source,
+ int x,
+ uint32_t *lag,
+ uint32_t *rag,
+ uint32_t *lrb,
+ uint32_t *rrb)
+{
+ const uint32_t *source = void_source;
+ uint32_t left = source[x];
+ uint32_t right = source[x+1];
+ *lag = (left & 0xff00ff00) >> 8;
+ *rag = (right & 0xff00ff00) >> 8;
+ *lrb = (left & 0x00ff00ff);
+ *rrb = (right & 0x00ff00ff);
+}
+
+BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, a8r8g8b8, uint32_t)
+
#define NEAREST_SCALED_COVER_USES_SRC_BUFFER(op, src_format, dst_format) \
(PIXMAN_OP_##op != PIXMAN_OP_SRC || \
(PIXMAN_##dst_format != PIXMAN_a8r8g8b8 && \
@@ -537,6 +758,14 @@ static const pixman_iter_info_t arm_simd_iters[] =
NULL
},
+ { PIXMAN_a8r8g8b8,
+ PIXMAN_ARM_BILINEAR_SCALED_COVER_FLAGS,
+ ITER_NARROW | ITER_SRC,
+ armv6_get_scanline_bilinear_init_a8r8g8b8,
+ armv6_get_scanline_bilinear_scaled_cover_a8r8g8b8,
+ NULL
+ },
+
{ PIXMAN_x8r8g8b8,
PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS,
ITER_NARROW | ITER_SRC,
--
1.7.5.4