[Pixman] [PATCH 33/37] armv6: Add fetcher for a8r8g8b8 bilinear-interpolation scaled images
Ben Avison
bavison at riscosopen.org
Tue Sep 9 11:51:41 PDT 2014
This fetcher is constrained to support X increments in the positive X
direction only, and it doesn't attempt to support any form of image repeat.
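
As a rough guide to what the two passes compute for each output pixel,
here is a C sketch (it mirrors the factor8 C fallback and the pass-2
weighting in the patch; the helper names are illustrative only, and
BILINEAR_INTERPOLATION_BITS is 7 in a default pixman build):

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7

    /* Pass 1: horizontal interpolation of one source scanline. Each
     * a8r8g8b8 pixel is split into AG and RB halves so that two 8-bit
     * channels are blended per multiply; results are left in
     * 8.BILINEAR_INTERPOLATION_BITS fixed point for pass 2. */
    static void
    pass1_pixel (const uint32_t *src, uint32_t x, /* x: 16.16 fixed point */
                 uint32_t *ag, uint32_t *rb)
    {
        uint32_t l   = src[x >> 16];
        uint32_t r   = src[(x >> 16) + 1];
        uint32_t lag = (l >> 8) & 0xff00ff, rag = (r >> 8) & 0xff00ff;
        uint32_t lrb = l & 0xff00ff,        rrb = r & 0xff00ff;
        uint32_t dx  = (x & 0xffff) >> (16 - BILINEAR_INTERPOLATION_BITS);

        *ag = (lag << BILINEAR_INTERPOLATION_BITS) + dx * (rag - lag);
        *rb = (lrb << BILINEAR_INTERPOLATION_BITS) + dx * (rrb - lrb);
    }

    /* Pass 2: vertical interpolation of one channel between the two
     * pass-1 scanlines. dist_y weights the second line and is
     * pre-scaled so that the two weights sum to
     * 1 << (16 - BILINEAR_INTERPOLATION_BITS), making the final
     * narrowing a plain >> 16. */
    static uint8_t
    pass2_channel (int top, int bottom, int dist_y)
    {
        int wt = (1 << (16 - BILINEAR_INTERPOLATION_BITS)) - dist_y;

        return (uint8_t) ((top * wt + bottom * dist_y) >> 16);
    }

Keeping the pass-1 output in fixed point is what lets pass 2 finish with
a single multiply-accumulate (SMUAD) and one shift per channel.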
Here are some affine-bench results for a variety of horizontal and vertical
scaling factors.
Before:
             x increment
              0.5    0.75   1.0    1.5    2.0
y increment
 0.5          7.1    6.9    6.8    6.6    6.3
 0.75         6.4    6.2    6.1    5.8    5.5
 1.0          5.9    5.7           5.2    4.9
 1.5          5.0    4.8    4.6    4.3    4.0
 2.0          4.4    4.2    4.0    3.7    3.4
After:
             x increment
              0.5    0.75   1.0    1.5    2.0
y increment
 0.5          21.0   19.6   19.2   20.2   18.9
 0.75         18.0   16.6   16.1   17.1   15.9
 1.0          21.8   18.9          19.9   17.7
 1.5          12.8   11.3   10.9   11.8   10.7
 2.0          10.7   9.3    8.9    9.8    8.8
Improvement:
             x increment
              0.5      0.75     1.0      1.5      2.0
y increment
 0.5          +196.7%  +183.6%  +181.8%  +206.6%  +198.4%
 0.75         +182.2%  +166.2%  +164.0%  +194.8%  +185.8%
 1.0          +271.7%  +234.4%           +282.7%  +257.9%
 1.5          +154.6%  +135.3%  +134.3%  +173.3%  +164.8%
 2.0          +144.1%  +124.2%  +123.3%  +165.6%  +155.5%
---
pixman/pixman-arm-common.h | 14 ++
pixman/pixman-arm-simd-asm-scaled.S | 352 +++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd-asm-scaled.h | 339 +++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 229 +++++++++++++++++++++++
4 files changed, 934 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 2ddcbbc..9b5cefb 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -478,6 +478,20 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
cputype ## _composite_nearest_scaled_cover_ ## func \
}
+#define PIXMAN_ARM_BILINEAR_AFFINE_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT | \
+ FAST_PATH_BILINEAR_FILTER | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM)
+
+#define PIXMAN_ARM_BILINEAR_SCALED_COVER_FLAGS \
+ (PIXMAN_ARM_BILINEAR_AFFINE_FLAGS | \
+ FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR | \
+ FAST_PATH_X_UNIT_POSITIVE | \
+ FAST_PATH_SCALE_TRANSFORM) /* implies FAST_PATH_Y_UNIT_ZERO */
+
#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
void \
pixman_get_scanline_##name##_asm_##cputype (int32_t w, \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 2606e64..85ca212 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -203,3 +203,355 @@ generate_nearest_scaled_cover_function \
2, 3 /* prefetch distances */, nop_macro, convert_8_8888
/******************************************************************************/
+
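+/* Split a 32-bit ARGB pixel into two registers of halfword-packed
+ * channels: UXTB16 extracts bytes 0 and 2 (blue and red) into \rb,
+ * and with ror #8 extracts bytes 1 and 3 (green and alpha) into
+ * \in_ag. */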
+.macro convert_8888_08080808 in_ag, rb
+ uxtb16 \rb, \in_ag
+ uxtb16 \in_ag, \in_ag, ror #8
+.endm
+
+generate_bilinear_scaled_cover_functions 32, a8r8g8b8, 3, 3, 3, 3, 3, 3, 3, 3, nop_macro, convert_8888_08080808
+
+/******************************************************************************/
+
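+/* Vertically interpolate one output pixel. \t0/\t1 hold one line's
+ * AG/RB halfword pairs and \b0/\b1 the other line's, as produced by
+ * pass 1. \mul packs the second line's weight in its low halfword and
+ * the complementary weight in its high halfword, so each SMUAD forms
+ * the weighted sum for one channel; the PKHTB/PKHBT shuffles first
+ * pair each channel with its counterpart from the other line, and the
+ * final SEL repacks the four 8-bit results into \d. */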
+.macro pass2_1pixel_internal t0, t1, b0, b1, tmp, mul, d
+ pkhtb \tmp, \t0, \b0, asr #16
+ pkhbt \t0, \b0, \t0, lsl #16
+ pkhtb \b0, \t1, \b1, asr #16
+ pkhbt \t1, \b1, \t1, lsl #16
+ smuad \t0, \t0, \mul
+ smuad \tmp, \tmp, \mul
+ smuad \t1, \t1, \mul
+ smuad \b0, \b0, \mul
+ mov \tmp, \tmp, lsl #8
+ pkhtb \t0, \tmp, \t0, asr #8
+ pkhtb \t1, \b0, \t1, asr #16
+ sel \d, \t1, \t0
+.endm
+
+.macro pass2_1pixel check_src_thresh
+ ldrd v3, v4, [SRC, #32]
+ ldrd v1, v2, [SRC], #8
+ .if \check_src_thresh
+ tst SRC, #31
+ addeq SRC, SRC, #32
+ .endif
+ pass2_1pixel_internal v1, v2, v3, v4, ip, DIST, v1
+ str v1, [DST], #4
+.endm
+
+.macro pass2_preload n, do_preload, pos
+ .ifnc "\do_preload",""
+ /* Minimum distance is 32 because we read the bottom-row pixels from
+ * the cacheline after the one being pointed at */
+ .set dist, 32 + prefetch_distance*64
+ .if \n == 1 || \n == 2
+ /* Stretch one cacheline further ahead in these cases to ensure
+ * we stop preloading at boundary between groups of 4 output pixels */
+ .set dist, dist + 32
+ .endif
+ .if \pos == \n
+ /* Source pointer has just skipped "bottom" cacheline */
+ pld [SRC, #dist]
+ .elseif ((\pos) & 1) == ((\n) & 1)
+ /* Time-equidistant between skips, but pointer is half-way through cacheline */
+ pld [SRC, #16 + dist]
+ .endif
+ .endif
+.endm
+
+.macro pass2_4pixels n, do_preload
+ ldrd v3, v4, [SRC, #32]
+ ldrd v1, v2, [SRC], #8 + 32*-(\n==1)
+ pass2_preload \n, \do_preload, 1
+ ldrd v7, v8, [SRC, #32]
+ ldrd v5, v6, [SRC], #8 + 32*-(\n==2)
+ pass2_preload \n, \do_preload, 2
+ pass2_1pixel_internal v1, v2, v3, v4, ip, DIST, OUT0
+ pass2_1pixel_internal v5, v6, v7, v8, ip, DIST, OUT1
+ ldrd v5, v6, [SRC, #32]
+ ldrd v3, v4, [SRC], #8 + 32*-(\n==3)
+ pass2_preload \n, \do_preload, 3
+ ldrd v7, v8, [SRC, #32]
+ ldr lr, [SRC], #4
+ pass2_1pixel_internal v3, v4, v5, v6, ip, DIST, OUT2
+ ldr v6, [SRC], #4 + 32*-(\n==0)
+ pass2_preload \n, \do_preload, 0
+ pass2_1pixel_internal lr, v6, v7, v8, ip, DIST, OUT3
+ stmia DST!, {OUT0, OUT1, OUT2, OUT3}
+.endm
+
+.macro pass2 n
+ .if \n == 1 || \n == 2
+ cmp ip, #(prefetch_distance+1)*4
+ movcs ip, #(prefetch_distance+1)*4
+ .else
+ cmp ip, #prefetch_distance*4
+ movcs ip, #prefetch_distance*4
+ .endif
+ subs ip, ip, #4
+ bcc 2f
+1: pld [v1]
+ pld [v1, #32]
+ add v1, v1, #64
+ subs ip, ip, #4
+ bcs 1b
+2: sub DIST, DIST, DIST, lsl #16
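+ @ GE flags = 0101, so SEL takes bytes 0 and 2 from Rn, bytes 1 and 3 from Rm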
+ msr cpsr_s, #0x50000
+ add DIST, DIST, #1 << (32-BILINEAR_INTERPOLATION_BITS)
+ @ top half of DIST now holds complementary weight
+ .if \n != 0
+3: pass2_1pixel 0
+ subs COUNT, COUNT, #1
+ bmi 99f
+ tst DST, #15
+ bne 3b
+ .endif
+ subs COUNT, COUNT, #4-1
+ bmi 8f
+ @ pixels_remaining_minus_4 = COUNT
+ @ pixels_done = (4-n)&3
+ @ preloads_done = at most, (prefetch_distance + 1 + (n==2 | n==3)) * 2
+ @ total_preloads = (1 + (pixels - 1) / 4) * 2
+ @ so if n is 0 or 3, then
+ @ preloads_to_do_minus_1 = (COUNT + pixels_done - 1 - prefetch_distance*4) / 4 * 2
+ @ or if n is 1 or 2, then 2 fewer than that
+ .set adjust, ((4-\n)&3) - 1 - (prefetch_distance - (\n==1) - (\n==2)) * 4
+ adds COUNT, COUNT, #adjust
+ bmi 6f
+5: pass2_4pixels \n, do_preload
+ subs COUNT, COUNT, #4
+ bpl 5b
+6:
+ .if adjust > -4
+ subs COUNT, COUNT, #adjust
+ bmi 8f // have to handle the possibility there are no groups of 4 without preloads
+ .else
+ sub COUNT, COUNT, #adjust
+ .endif
+7: pass2_4pixels \n
+ subs COUNT, COUNT, #4
+ bpl 7b
+8: adds COUNT, COUNT, #4-1
+ bmi 99f
+9: pass2_1pixel (\n != 0)
+ subs COUNT, COUNT, #1
+ bpl 9b
+99: pop {v1-v8,pc}
+.endm
+
+/* void
+ * pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6 (
+ * uint32_t width,
+ * int16_t dist_y,
+ * uint32_t *dest,
+ * int16_t *source)
+ *
+ * This version is used when the output scanline falls between two
+ * different input scanlines
+ */
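+/* The two pass-1 line buffers are interleaved in 32-byte (4-pixel)
+ * blocks: a block of one line at SRC has the matching block of the
+ * other line at SRC+32. Hence pixels are fetched in pairs 32 bytes
+ * apart, and SRC skips an extra 32 bytes whenever it crosses a
+ * cacheline boundary. */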
+pixman_asm_function pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6
+COUNT .req a1
+DIST .req a2
+DST .req a3
+SRC .req a4
+OUT0 .req v1
+OUT1 .req v2
+OUT2 .req v3
+OUT3 .req v5 @ avoid register-lock of last STM register against following LDM
+.set prefetch_distance, 2
+ push {v1-v8,lr}
+ subs COUNT, COUNT, #1
+ bmi 99f
+ movs ip, DST, lsl #29
+ add v1, SRC, #64
+ mov ip, COUNT
+ pld [SRC] @ already cacheline-aligned
+ pld [SRC, #32]
+ @ total_preloads = (1 + (pixels - 1) / 4) * 2
+ @ initial_preloads = at most, (prefetch_distance + 1 + (n==2 | n==3)) * 2
+ bhi 13f
+ bcs 12f
+ bne 11f
+10: pass2 0
+11: pass2 1
+12: pass2 2
+13: pass2 3
+
+.unreq COUNT
+.unreq DST
+.unreq DIST
+.unreq SRC
+.unreq OUT0
+.unreq OUT1
+.unreq OUT2
+.unreq OUT3
+.endfunc
+
+/******************************************************************************/
+
+.macro pass2a_1pixel check_src_thresh
+ ldmia SRC!, {AG0, RB0}
+ .if \check_src_thresh
+ tst SRC, #31
+ .if prefetch_distance == 0
+ bne 20f
+ teq COUNT, #0
+ beq 20f
+ add SRC, SRC, #32
+ pld [SRC]
+20:
+ .else
+ addeq SRC, SRC, #32
+ .endif
+ .endif
+ mov AG0, AG0, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB0, RB0, lsr #BILINEAR_INTERPOLATION_BITS
+ sel OUT0, RB0, AG0
+ str OUT0, [DST], #4
+.endm
+
+.macro pass2a_4pixels n, do_preload
+ .if \n == 0
+ ldmia SRC!, {AG0, RB0, AG1, RB1, AG2, RB2, AG3, RB3}
+ .elseif \n == 1
+ ldmia SRC!, {AG0, RB0}
+ .elseif \n == 2
+ ldmia SRC!, {AG0, RB0, AG1, RB1}
+ .else // \n == 3
+ ldmia SRC!, {AG0, RB0, AG1, RB1, AG2, RB2}
+ .endif
+ add SRC, SRC, #32
+ .ifnc "\do_preload",""
+ pld [SRC, #prefetch_distance*64]
+ .endif
+ .if \n == 1
+ ldmia SRC!, {AG1, RB1, AG2, RB2, AG3, RB3}
+ .elseif \n == 2
+ ldmia SRC!, {AG2, RB2, AG3, RB3}
+ .elseif \n == 3
+ ldmia SRC!, {AG3, RB3}
+ .endif
+ mov AG0, AG0, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB0, RB0, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG1, AG1, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB1, RB1, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG2, AG2, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB2, RB2, lsr #BILINEAR_INTERPOLATION_BITS
+ mov AG3, AG3, lsl #8-BILINEAR_INTERPOLATION_BITS
+ mov RB3, RB3, lsr #BILINEAR_INTERPOLATION_BITS
+ sel OUT0, RB0, AG0
+ sel OUT1, RB1, AG1
+ sel OUT2, RB2, AG2
+ sel OUT3, RB3, AG3
+ stmia DST!, {OUT0, OUT1, OUT2, OUT3}
+.endm
+
+.macro pass2a n
+ .if \n != 0
+3: pass2a_1pixel 0
+ subs COUNT, COUNT, #1
+ bmi 99f
+ tst DST, #15
+ bne 3b
+ .endif
+ subs COUNT, COUNT, #4-1
+ bmi 8f
+ @ pixels_remaining_minus_4 = COUNT
+ @ pixels_done = (4-n)&3
+ @ preloads_done = at most, prefetch_distance + 1
+ @ total_preloads = 1 + (pixels - 1) / 4
+ @ so preloads_to_do_minus_1 = (COUNT + pixels_done - 1 - prefetch_distance*4) / 4
+ .set adjust, ((4-\n)&3) - 1 - prefetch_distance*4
+ adds COUNT, COUNT, #adjust
+ bmi 6f
+5: pass2a_4pixels \n, do_preload
+ subs COUNT, COUNT, #4
+ bpl 5b
+6:
+ .if adjust > -4
+ subs COUNT, COUNT, #adjust
+ bmi 8f // have to handle the possibility there are no groups of 4 without preloads
+ .else
+ sub COUNT, COUNT, #adjust
+ .endif
+7: pass2a_4pixels \n
+ subs COUNT, COUNT, #4
+ bpl 7b
+8: adds COUNT, COUNT, #4-1
+ bmi 99f
+9: pass2a_1pixel (\n != 0)
+ subs COUNT, COUNT, #1
+ bpl 9b
+99: pop {v1-v6,pc}
+.endm
+
+/* void
+ * pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6 (
+ * uint32_t width,
+ * uint32_t *dest,
+ * int16_t *source)
+ *
+ * This version is used when the output scanline coincides
+ * exactly with an input scanline
+ */
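+/* Only one of the interleaved line buffers is read here, and each
+ * 8.BILINEAR_INTERPOLATION_BITS intermediate merely needs narrowing
+ * back to 8 bits per channel: the AG lanes are shifted up and the RB
+ * lanes down so the wanted bytes line up, then SEL (GE = 0101) merges
+ * them into one ARGB word. */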
+pixman_asm_function pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6
+COUNT .req a1
+DST .req a2
+SRC .req a3
+AG0 .req a4
+RB0 .req v1
+AG1 .req v2
+RB1 .req v3
+AG2 .req v4
+RB2 .req v5
+AG3 .req v6
+RB3 .req ip
+OUT0 .req a4
+OUT1 .req v2
+OUT2 .req v4
+OUT3 .req lr @ avoid register-lock of last STM register against following LDM
+.set prefetch_distance, 2
+ push {v1-v6,lr}
+ subs COUNT, COUNT, #1
+ bcc 99f
+ @ total_preloads = 1 + (pixels - 1) / 4
+ @ initial_preloads = at most, prefetch_distance + 1
+ mov ip, #prefetch_distance*4
+ cmp COUNT, #prefetch_distance*4
+ pld [SRC] @ already cacheline-aligned
+ movcc ip, COUNT
+ add a4, SRC, #64
+ subs ip, ip, #4
+ bcc 2f
+1: pld [a4]
+ add a4, a4, #64
+ subs ip, ip, #4
+ bcs 1b
+2: msr cpsr_s, #0x50000
+ movs ip, DST, lsl #29
+ bhi 13f
+ bcs 12f
+ bne 11f
+10: pass2a 0
+11: pass2a 1
+12: pass2a 2
+13: pass2a 3
+
+.unreq COUNT
+.unreq DST
+.unreq SRC
+.unreq AG0
+.unreq RB0
+.unreq AG1
+.unreq RB1
+.unreq AG2
+.unreq RB2
+.unreq AG3
+.unreq RB3
+.unreq OUT0
+.unreq OUT1
+.unreq OUT2
+.unreq OUT3
+.endfunc
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
index 66d2e12..3ed6c55 100644
--- a/pixman/pixman-arm-simd-asm-scaled.h
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -23,6 +23,8 @@
* Author: Ben Avison (bavison at riscosopen.org)
*/
+#include "pixman-private.h" // for BILINEAR_INTERPOLATION_BITS
+
.set log2_32, 5
.set log2_16, 4
.set log2_8, 3
@@ -414,5 +416,342 @@ TMP .req lr
.endfunc
.endm
+/******************************************************************************/
+
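+@ Horizontal linear blend between adjacent samples \in and \nin:
+@ out = (in << BILINEAR_INTERPOLATION_BITS) + dist_x * (nin - in),
+@ where dist_x is the top BILINEAR_INTERPOLATION_BITS bits of the
+@ fractional-x accumulator. UX is reloaded from the stack because it
+@ shares a register with DIST/TMP; the final ADDS advances the
+@ accumulator, and the carry out tells the caller whether to step one
+@ extra source pixel before producing the next output.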
+.macro bilinear_scaled_cover_process_horizontal format, factor, in, nin, out, size, exit
+ mov DIST, ACCUM, lsr #32 - BILINEAR_INTERPOLATION_BITS
+ sub AG_OUT\out, AG_IN\nin, AG_IN\in
+ sub RB_OUT\out, RB_IN\nin, RB_IN\in
+ mul AG_OUT\out, AG_OUT\out, DIST
+ mul RB_OUT\out, RB_OUT\out, DIST
+ ldr UX, [sp]
+ add AG_OUT\out, AG_OUT\out, AG_IN\in, lsl #BILINEAR_INTERPOLATION_BITS
+ add RB_OUT\out, RB_OUT\out, RB_IN\in, lsl #BILINEAR_INTERPOLATION_BITS
+ .if \size == 1
+ stmia DST!, {AG_OUT0, RB_OUT0}
+ subs COUNT, COUNT, #1
+ bmi .L\format\()_factor\factor\()_\exit
+ .elseif \out
+ stmia DST!, {AG_OUT0, RB_OUT0, AG_OUT1, RB_OUT1}
+ tst DST, #16
+ addeq DST, DST, #32
+ subeqs COUNT, COUNT, #4
+ bmi .L\format\()_factor\factor\()_\exit
+ .endif
+ adds ACCUM, ACCUM, UX
+.endm
+
+.macro bilinear_scaled_cover_innerloop format, factor, convert, in, nin, out, nout, size, exit, dropthrough
+ .if \factor == 0
+
+ .L\format\()_factor\factor\()_\in\out\()_cs:
+ .if bpp == 32
+ ldr AG_IN\nin, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN\nin, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN\nin, [SRC, #1]!
+ .endif
+ tst SRC, #31
+ bne .L\format\()_factor\factor\()_\in\out\()_cs_skip
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_\in\out\()_cs_skip
+ pld [SRC, #prefetch_distance*32]
+ .L\format\()_factor\factor\()_\in\out\()_cs_skip:
+ \convert AG_IN\nin, RB_IN\nin
+ .L\format\()_factor\factor\()_\in\out\()_cc:
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ bcc .L\format\()_factor\factor\()_\in\nout\()_cc
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_\nin\nout\()_cs
+ .endif
+
+ .elseif \factor == 1
+
+ .L\format\()_factor\factor\()_\in\out\()_cc:
+ .if bpp == 32
+ ldr AG_IN\nin, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN\nin, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN\nin, [SRC, #1]!
+ .endif
+ tst SRC, #31
+ .if \in == 1
+ bne .L\format\()_factor\factor\()_\in\out\()_cc_skip
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_\in\out\()_cc_skip
+ pld [SRC, #prefetch_distance*32]
+ .L\format\()_factor\factor\()_\in\out\()_cc_skip:
+ \convert AG_IN\nin, RB_IN\nin
+ .else
+ bne .L\format\()_factor\factor\()_0\out\()_converge
+ subs PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_converge
+ pld [SRC, #prefetch_distance*32]
+ b .L\format\()_factor\factor\()_0\out\()_converge
+ .L\format\()_factor\factor\()_0\out\()_cs:
+ orr TMP, SRC, #31
+ .if bpp == 32
+ ldmib SRC!, {AG_IN0, AG_IN1}
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC, #2]!
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC, #1]!
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ cmp SRC, TMP
+ subgts PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_cs_skip
+ pld [TMP, #1+prefetch_distance*32]
+ .L\format\()_factor\factor\()_0\out\()_cs_skip:
+ \convert AG_IN0, RB_IN0
+ .L\format\()_factor\factor\()_0\out\()_converge:
+ \convert AG_IN1, RB_IN1
+ .endif
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ bcs .L\format\()_factor\factor\()_0\nout\()_cs
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_\nin\nout\()_cc
+ .endif
+
+ .else // \factor >= 2
+
+ .L\format\()_factor\factor\()_0\out\():
+ orr TMP, SRC, #31
+ .if bpp == 32
+ ldrcs AG_IN0, [SRC, #\factor*4]!
+ ldrcc AG_IN0, [SRC, #(\factor-1)*4]!
+ ldr AG_IN1, [SRC, #4]!
+ .elseif bpp == 16
+ ldrcsh AG_IN0, [SRC, #\factor*2]!
+ ldrcch AG_IN0, [SRC, #(\factor-1)*2]!
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrcsb AG_IN0, [SRC, #\factor]!
+ ldrccb AG_IN0, [SRC, #(\factor-1)]!
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ cmp SRC, TMP
+ subgts PLDS, PLDS, #32
+ ble .L\format\()_factor\factor\()_0\out\()_skip
+ pld [TMP, #1+prefetch_distance*32]
+ .L\format\()_factor\factor\()_0\out\()_skip:
+ \convert AG_IN0, RB_IN0
+ \convert AG_IN1, RB_IN1
+ bilinear_scaled_cover_process_horizontal \format, \factor, \in, \nin, \out, \size, \exit
+ .ifc "\dropthrough",""
+ b .L\format\()_factor\factor\()_0\nout\()
+ .endif
+
+ .endif
+.endm
+
+.macro generate_bilinear_scaled_cover_function fname, \
+ bpp_, \
+ format, \
+ factor, \
+ prefetch_distance_, \
+ init, \
+ convert
+
+/* void fname(uint32_t width,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * uint32_t *dest,
+ * const void *source);
+ */
+pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set bpp, \bpp_
+ .set prefetch_distance, \prefetch_distance_
+
+/*
+ * Assign symbolic names to registers
+ */
+COUNT .req a1
+X .req a2
+ACCUM .req a2
+UX .req a3
+DIST .req a3
+TMP .req a3
+DST .req a4
+AG_IN0 .req v1
+RB_IN0 .req v2
+AG_IN1 .req v3
+RB_IN1 .req v4
+AG_OUT .req v5
+RB_OUT .req v6
+AG_OUT0 .req v5
+RB_OUT0 .req v6
+AG_OUT1 .req v7
+RB_OUT1 .req v8
+SRC .req ip
+PLDS .req lr
+
+ push {v1-v8,lr}
+ subs COUNT, COUNT, #1
+ bmi 99f
+
+ mla v1, UX, COUNT, X @ v1 = X for final pixel (memory pipeline still busy with push)
+ ldr v2, [sp, #9*4] @ get source from stack
+ add SRC, v2, X, lsr #16 - (log2_\bpp_ - 3)
+ bic v3, SRC, #31
+ add v2, v2, v1, lsr #16 - (log2_\bpp_ - 3)
+ pld [v3]
+ bic SRC, SRC, #bpp/8-1
+ .if \factor >= 2
+ @ In these cases we point at the 2nd input sample when we enter the main loop
+ add v1, SRC, #bpp/8
+ bic v1, v1, #31
+ add v1, v1, #prefetch_distance*32
+ .else
+ add v1, v3, #prefetch_distance*32
+ .endif
+ add v2, v2, #bpp/8 @ v2 -> second input sample for final pixel
+ subs PLDS, v2, v1
+ movcc v1, v2
+2: add v3, v3, #32
+ cmp v3, v1
+ bhi 3f
+ pld [v3]
+ b 2b
+3: @ Add 1 to PLDS so that subs PLDS,PLDS,#32 sets GT whenever a preload is to be done
+ add PLDS, PLDS, #1
+ mov UX, UX, lsl #16
+ mov ACCUM, X, lsl #16
+ push {UX}
+ \init
+
+ .if \factor >= 2
+ .if bpp == 32
+ ldr AG_IN0, [SRC]
+ ldr AG_IN1, [SRC, #4]!
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC]
+ ldrh AG_IN1, [SRC, #2]!
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC]
+ ldrb AG_IN1, [SRC, #1]!
+ .endif
+ \convert AG_IN0, RB_IN0
+ \convert AG_IN1, RB_IN1
+ subs COUNT, COUNT, #4-1
+ bmi .L\format\()_factor\factor\()_narrow
+ bilinear_scaled_cover_process_horizontal \format, \factor, 0, 1, 0, 4, unused
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 1, 0, 4, trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 0, 1, 4, trailing
+ .L\format\()_factor\factor\()_narrow:
+ add COUNT, COUNT, #4-1
+ bilinear_scaled_cover_process_horizontal \format, \factor, 0, 1, , 1, done
+ b .L\format\()_factor\factor\()_0
+ .L\format\()_factor\factor\()_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, , , 1, done
+ .else
+ .if bpp == 32
+ ldr AG_IN0, [SRC]
+ .elseif bpp == 16
+ ldrh AG_IN0, [SRC]
+ .else // bpp == 8
+ ldrb AG_IN0, [SRC]
+ .endif
+ \convert AG_IN0, RB_IN0
+ subs COUNT, COUNT, #4-1
+ addmi COUNT, COUNT, #4-1
+ .if \factor == 0
+ bmi .L\format\()_factor\factor\()_0_cs
+ .elseif \factor == 1
+ bmi .L\format\()_factor\factor\()_0_cc
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 0, 1, 4, 0_trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, 1, 0, 4, 1_trailing
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, 1, 0, 4, 0_trailing, dropthrough
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, 0, 1, 4, 1_trailing
+ .L\format\()_factor\factor\()_0_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ .if \factor == 0
+ bcc .L\format\()_factor\factor\()_0_cc
+ .elseif \factor == 1
+ bcs .L\format\()_factor\factor\()_0_cs
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 1, 0, , , 1, done
+ .L\format\()_factor\factor\()_1_trailing:
+ adds COUNT, COUNT, #4-1
+ bmi 98f
+ adds ACCUM, ACCUM, UX
+ .if \factor == 0
+ bcc .L\format\()_factor\factor\()_1_cc
+ .elseif \factor == 1
+ bcs .L\format\()_factor\factor\()_0_cs
+ .endif
+ bilinear_scaled_cover_innerloop \format, \factor, \convert, 0, 1, , , 1, done
+ .endif
+ .L\format\()_factor\factor\()_done:
+98: pop {UX,v1-v8,pc}
+99: pop {v1-v8,pc}
+
+.unreq COUNT
+.unreq X
+.unreq ACCUM
+.unreq UX
+.unreq DIST
+.unreq TMP
+.unreq DST
+.unreq AG_IN0
+.unreq RB_IN0
+.unreq AG_IN1
+.unreq RB_IN1
+.unreq AG_OUT0
+.unreq RB_OUT0
+.unreq AG_OUT1
+.unreq RB_OUT1
+.unreq SRC
+.unreq PLDS
+.endfunc
+.endm
+
+.macro generate_bilinear_scaled_cover_functions bpp, \
+ format, \
+ pd0, pd1, pd2, pd3, pd4, pd5, pd6, pd7, \
+ init, \
+ convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor0_asm_armv6, \
+ \bpp, \format, 0, \pd0, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor1_asm_armv6, \
+ \bpp, \format, 1, \pd1, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor2_asm_armv6, \
+ \bpp, \format, 2, \pd2, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor3_asm_armv6, \
+ \bpp, \format, 3, \pd3, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor4_asm_armv6, \
+ \bpp, \format, 4, \pd4, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor5_asm_armv6, \
+ \bpp, \format, 5, \pd5, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor6_asm_armv6, \
+ \bpp, \format, 6, \pd6, \init, \convert
+generate_bilinear_scaled_cover_function \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_\format\()_factor7_asm_armv6, \
+ \bpp, \format, 7, \pd7, \init, \convert
+.endm
+
+/******************************************************************************/
+
.macro nop_macro x:vararg
.endm
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index e69216a..f783c17 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -37,6 +37,9 @@
#define ALIGN(addr) \
((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+#define ALIGN32(addr) \
+ ((uint8_t *)((((uintptr_t)(addr)) + 31) & (~31)))
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
@@ -191,6 +194,224 @@ BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, x8r8g8b8, uint32_t)
BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, r5g6b5, uint16_t)
BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8, uint8_t)
+typedef void (*bilinear_pass1_t) (uint32_t width,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int16_t *dest,
+ const void *source);
+
+#ifndef __STDC_VERSION__
+#define FLEXIBLE 1
+#else
+#if __STDC_VERSION__ >= 199901L /* struct hack is illegal in C99, use flexible array member */
+#define FLEXIBLE
+#else
+#define FLEXIBLE 1
+#endif
+#endif
+
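+/* Iterator state: line_buffer holds pass-1 intermediates for up to two
+ * source scanlines, interleaved in 32-byte blocks; line_y[] records
+ * which source line currently occupies each half (-1 if none). */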
+typedef struct
+{
+ bilinear_pass1_t pass1;
+ int line_y[2];
+ int16_t *line_buffer;
+ pixman_fixed_t x;
+ pixman_fixed_t y;
+ int stride;
+ uint8_t data[FLEXIBLE];
+} bilinear_info_t;
+
+static void
+armv6_get_scanline_bilinear_fini (pixman_iter_t *iter)
+{
+ free (iter->data);
+}
+
+void
+pixman_get_scanline_bilinear_scaled_cover_pass2_asm_armv6 (
+ uint32_t width,
+ int16_t dist_y,
+ uint32_t *dest,
+ int16_t *source);
+
+void
+pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_armv6 (
+ uint32_t width,
+ uint32_t *dest,
+ int16_t *source);
+
+#define BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(cputype, name, type) \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor0_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor1_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor2_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor3_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor4_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor5_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor6_asm_##cputype (); \
+void pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor7_asm_##cputype (); \
+ \
+static void \
+cputype##_get_scanline_bilinear_scaled_cover_pass1_##name##_factor8 ( \
+ uint32_t width, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ int16_t *dest, \
+ const void *source) \
+{ \
+ /* The preload scheme used by the assembly version relies on the \
+ * reduction factor being less than 8x. Fall back to C. */ \
+ while (width--) \
+ { \
+ uint32_t lag, rag, lrb, rrb, dist_x, ag, rb; \
+ cputype##_convert_adjacent_##name (source, pixman_fixed_to_int (x), \
+ &lag, &rag, &lrb, &rrb); \
+ dist_x = (x & 0xFFFF) >> (16 - BILINEAR_INTERPOLATION_BITS); \
+ ag = (lag << BILINEAR_INTERPOLATION_BITS) + dist_x * (rag - lag); \
+ rb = (lrb << BILINEAR_INTERPOLATION_BITS) + dist_x * (rrb - lrb); \
+ *(uint32_t *)(dest+0) = ag; \
+ *(uint32_t *)(dest+2) = rb; \
+ dest += 4; \
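+        /* The two line buffers are interleaved in 32-byte blocks, \
+         * so on reaching a block boundary skip the other line's \
+         * block */ \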
+ if (((uintptr_t) dest & 31) == 0) \
+ dest += 4*4; \
+ x += ux; \
+ } \
+} \
+ \
+static void \
+cputype##_get_scanline_bilinear_init_##name (pixman_iter_t *iter, \
+ const pixman_iter_info_t *iter_info) \
+{ \
+ int width = iter->width; \
+ bilinear_info_t *info; \
+ int stride; \
+ type *bits; \
+ pixman_fixed_t x, y, uxx, uxy, uyy; \
+ \
+ PIXMAN_IMAGE_GET_SCALED (iter->image, iter->x, iter->y, type, \
+ stride, bits, x, y, uxx, uxy, uyy); \
+ (void) bits; \
+ (void) uxy; \
+ (void) uyy; \
+ \
+ info = malloc (offsetof(bilinear_info_t, data) + 31 + \
+ (width + 3) / 4 * sizeof (int16_t)*4*4 * 2); \
+ if (!info) \
+ { \
+ /* In this case, we don't guarantee any particular rendering. */ \
+ _pixman_log_error ( \
+ FUNC, "Allocation failure, skipping rendering\n"); \
+ \
+ iter->get_scanline = _pixman_iter_get_scanline_noop; \
+ iter->fini = NULL; \
+ iter->data = NULL; \
+ } \
+ else \
+ { \
+ static const bilinear_pass1_t routines[9] = { \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor0_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor1_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor2_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor3_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor4_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor5_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor6_asm_##cputype, \
+ pixman_get_scanline_bilinear_scaled_cover_pass1_##name##_factor7_asm_##cputype, \
+ cputype##_get_scanline_bilinear_scaled_cover_pass1_##name##_factor8 \
+ }; \
+ uxx >>= 16; \
+ if (uxx >= 8) \
+ uxx = 8; \
+ info->pass1 = routines[uxx]; \
+ \
+ /* It is safe to set the y coordinates to -1 initially \
+ * because COVER_CLIP_BILINEAR ensures that we will only \
+ * be asked to fetch lines in the [0, height) interval \
+ */ \
+ info->line_y[0] = -1; \
+ info->line_y[1] = -1; \
+ \
+ info->line_buffer = (int16_t *) ALIGN32 (info->data); \
+ info->x = x - pixman_fixed_1 / 2; \
+ info->y = y - pixman_fixed_1 / 2; \
+ info->stride = stride; \
+ \
+ iter->fini = cputype##_get_scanline_bilinear_fini; \
+ iter->data = info; \
+ } \
+} \
+ \
+static uint32_t * \
+cputype##_get_scanline_bilinear_scaled_cover_##name (pixman_iter_t *iter, \
+ const uint32_t *mask) \
+{ \
+ bilinear_info_t *info = iter->data; \
+ int y0 = pixman_fixed_to_int (info->y); \
+ int y1 = y0 + 1; \
+ int i = y0 & 1; \
+ int width = iter->width; \
+ pixman_fixed_t fx = info->x; \
+ pixman_fixed_t ux = iter->image->common.transform->matrix[0][0]; \
+ int16_t *buffer = info->line_buffer; \
+ type *bits = (type *)iter->image->bits.bits; \
+ int stride = info->stride; \
+ uint32_t *out = iter->buffer; \
+ int32_t dist_y; \
+ \
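+    /* Extract the fractional part of y as the weight of the second \
+     * line buffer, pre-scaled so that the weight and its complement \
+     * sum to 1 << (16 - BILINEAR_INTERPOLATION_BITS), as the pass-2 \
+     * SMUAD weighting expects */ \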
+ dist_y = (info->y >> BILINEAR_INTERPOLATION_BITS) & \
+ ((0x10000 >> BILINEAR_INTERPOLATION_BITS) - \
+ (0x10000 >> (2 * BILINEAR_INTERPOLATION_BITS))); \
+ if (i) \
+ { \
+ /* Invert weight if upper scanline is in second buffer */ \
+ dist_y = (0x10000 >> BILINEAR_INTERPOLATION_BITS) - dist_y; \
+ } \
+ info->y += iter->image->common.transform->matrix[1][1]; \
+ \
+ if (info->line_y[i] != y0) \
+ { \
+ info->pass1 (width, fx, ux, buffer + 4*4*i, bits + stride * y0); \
+ info->line_y[i] = y0; \
+ } \
+ \
+ if (dist_y & ((0x10000 >> BILINEAR_INTERPOLATION_BITS) - \
+ (0x10000 >> (2 * BILINEAR_INTERPOLATION_BITS)))) \
+ { \
+ if (info->line_y[!i] != y1) \
+ { \
+ info->pass1 (width, fx, ux, buffer + 4*4*!i, bits + stride * y1); \
+ info->line_y[!i] = y1; \
+ } \
+ \
+ pixman_get_scanline_bilinear_scaled_cover_pass2_asm_##cputype ( \
+ width, dist_y, out, buffer); \
+ } \
+ else \
+ { \
+ pixman_get_scanline_bilinear_scaled_cover_pass2a_asm_##cputype ( \
+ width, out, buffer + 4*4*i); \
+ } \
+ \
+ return out; \
+}
+
+static inline void armv6_convert_adjacent_a8r8g8b8 (const void *void_source,
+ int x,
+ uint32_t *lag,
+ uint32_t *rag,
+ uint32_t *lrb,
+ uint32_t *rrb)
+{
+ const uint32_t *source = void_source;
+ uint32_t left = source[x];
+ uint32_t right = source[x+1];
+ *lag = (left & 0xff00ff00) >> 8;
+ *rag = (right & 0xff00ff00) >> 8;
+ *lrb = (left & 0x00ff00ff);
+ *rrb = (right & 0x00ff00ff);
+}
+
+BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, a8r8g8b8, uint32_t)
+
#define NEAREST_SCALED_COVER_USES_SRC_BUFFER(op, src_format, dst_format) \
(PIXMAN_OP_##op != PIXMAN_OP_SRC || \
(PIXMAN_##dst_format != PIXMAN_a8r8g8b8 && \
@@ -537,6 +758,14 @@ static const pixman_iter_info_t arm_simd_iters[] =
NULL
},
+ { PIXMAN_a8r8g8b8,
+ PIXMAN_ARM_BILINEAR_SCALED_COVER_FLAGS,
+ ITER_NARROW | ITER_SRC,
+ armv6_get_scanline_bilinear_init_a8r8g8b8,
+ armv6_get_scanline_bilinear_scaled_cover_a8r8g8b8,
+ NULL
+ },
+
{ PIXMAN_x8r8g8b8,
PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS,
ITER_NARROW | ITER_SRC,
--
1.7.5.4