[Pixman] [PATCH 4/4] armv6: Add nearest-scaled-cover src_0565_0565 fast path

Tue Aug 25 16:23:26 PDT 2015

This is adapted from the nearest scaled cover scanline fetcher, modified to
pack output data in 16-bit units. This fetcher out-performs both the fast
path defined using PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST in pixman-arm-simd.c
and the fast path in pixman-fast-path.c.

Since the two preceding patches no longer define a macroised C wrapper we
can use, and general_composite_rect is no use to us here (we don't want to
do pixel format conversion twice), the C wrapper has been written out longhand.
Unsurprisingly, the results are similar to last year's version of the patch:

lowlevel-blt-bench -n src_0565_0565

       Before          Old patch      New patch       Change
      Mean StdDev     Mean StdDev    Mean StdDev    Old     New
L1   118.6   3.12     71.0   1.32    73.5   1.18   -40.1%  -38.0%
L2    42.1   0.73     52.6   2.44    52.1   2.00   +25.1%  +23.7%
M     42.1   0.15     69.3   0.10    69.3   0.15   +64.9%  +64.8%
HT    24.4   0.35     29.2   0.33    29.5   0.24   +19.4%  +20.9%
VT    23.0   0.24     27.4   0.29    27.7   0.35   +19.3%  +20.6%
R     20.8   0.20     25.3   0.32    25.7   0.18   +21.4%  +23.2%
RT     9.1   0.25      9.3   0.24     9.7   0.24    +1.7%   +6.7%
---
 pixman/pixman-arm-common.h          |    9 +++++
 pixman/pixman-arm-simd-asm-scaled.S |    4 ++
 pixman/pixman-arm-simd-asm-scaled.h |   69 ++++++++++++++++++++++++++++++-----
 pixman/pixman-arm-simd.c            |   35 ++++++++++++++++++
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index f970868..59190f0 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -494,6 +494,15 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp,                   \
      FAST_PATH_X_UNIT_POSITIVE            |                             \
      FAST_PATH_Y_UNIT_ZERO)
 
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH(cputype,op,s,d,func) \
+    {   PIXMAN_OP_ ## op,  /* table entry: operator PIXMAN_OP_<op> */          \
+        PIXMAN_ ## s,  /* source format PIXMAN_<s> */                          \
+        PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS,  /* required source flags */    \
+        PIXMAN_null, 0,  /* presumably mask format/flags: none */              \
+        PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,  /* dest format, flags */      \
+        cputype ## _composite_nearest_scaled_cover_ ## func  /* handler */     \
+    }
+
 #define PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER(op,s,d,func) \
     {   PIXMAN_OP_ ## op,                                                       \
         PIXMAN_ ## s,                                                           \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 0116889..24c1a27 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -170,6 +170,10 @@ generate_nearest_scaled_cover_function \
     pixman_get_scanline_nearest_scaled_cover_x8r8g8b8_asm_armv6, 32, \
     2, 3 /* prefetch distances */, nop_macro, convert_x888_8888
 
+generate_nearest_scaled_cover_function \
+    pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6, 16, \
+    2, 0 /* prefetch distances */, nop_macro, nop_macro, 16
+
 .macro init_ge
         msr     CPSR_s, #0x50000
 .endm
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
index 660797d..e642e7f 100644
--- a/pixman/pixman-arm-simd-asm-scaled.h
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -94,7 +94,12 @@
 
 .macro nearest_scaled_cover_enlarge_nomask_innerloop  bpp, reg, convert, mask_hint, may_be_final, exit_label, store
         adds    ACCUM, ACCUM, UX
+ .if PIXEL_MERGE_OFFSET == 0
         mov     \reg, PIXEL
+ .else
+        orr     \reg, \reg, PIXEL, lsl #PIXEL_MERGE_OFFSET
+ .endif
+ .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31
         \store
         branch  cc, \exit_label, 1203f
  .ifnc "\may_be_final",""
@@ -158,10 +163,20 @@
         mov     TMP, XHI
         adds    XLO, XLO, UX, lsl #16
         adc     XHI, XHI, UX, lsr #16
+ .if PIXEL_MERGE_OFFSET == 0
         ldrx    \bpp,, <\reg, [PTR]>
+ .else
+        ldrx    \bpp,, <PIXEL2, [PTR]>
+ .endif
         eor     TMP, TMP, XHI
         bics    TMP, TMP, #255/\bpp
+ .if PIXEL_MERGE_OFFSET == 0
         \convert \reg, TMP
+ .else
+        \convert PIXEL2, TMP
+        orr     \reg, \reg, PIXEL2, lsl #PIXEL_MERGE_OFFSET
+ .endif
+ .set PIXEL_MERGE_OFFSET, (PIXEL_MERGE_OFFSET + out_bpp) & 31
         \store
         branch  eq, \exit_label, 1403f
         subs    PLDS, PLDS, #32
@@ -185,7 +200,14 @@
         \inner_loop  \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4>
         b       1503f
  .endif
+ .set PIXEL_MERGE_OFFSET, 0
+ .if out_bpp == 32
 1502:   \inner_loop  \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4>
+ .elseif out_bpp == 16
+1502:   \inner_loop  \bpp, WK0, \convert, mask_is_non_0, 1,, <strh WK0, [DST], #2>
+ .else
+        .error  "Output bits per pixel not supported"
+ .endif
 1503:
 .endm
 
@@ -206,15 +228,26 @@
         \inner_loop  \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4>
         b       1602f
  .endif
-1601:   \inner_loop  \bpp, WK0, \convert
+1601:
+ .set PIXEL_MERGE_OFFSET, 0
+ .rept 32 / out_bpp
+        \inner_loop  \bpp, WK0, \convert
+ .endr
+ .rept 32 / out_bpp
         \inner_loop  \bpp, WK1, \convert
+ .endr
+ .rept 32 / out_bpp
         \inner_loop  \bpp, WK2, \convert
+ .endr
+ .rept 32 / out_bpp - 1
+        \inner_loop  \bpp, WK3, \convert
+ .endr
         \inner_loop  \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}>
 1602:
 .endm
 
 .macro process  bpp, has_mask, inner_loop, convert
-        cmp     COUNT, #2 * 4 - 1 - 1   @ guaranteed at least one aligned half-cacheline output?
+        cmp     COUNT, #2 * 128 / out_bpp - 1 - 1   @ guaranteed at least one aligned half-cacheline output?
         blo     1706f
         tst     DST, #15
         beq     1702f
@@ -222,16 +255,21 @@
         sub     COUNT, COUNT, #1
         tst     DST, #15
         bne     1701b
-1702:   sub     COUNT, COUNT, #4 - 1
+1702:   sub     COUNT, COUNT, #128 / out_bpp - 1
+ .if \has_mask
         tst     MASK, #16
         beq     1704f
-1703:   process4  \bpp, \has_mask, 0, \inner_loop, \convert
-        subs    COUNT, COUNT, #4
+ .endif
+1703:
+.if \has_mask
+        process4  \bpp, \has_mask, 0, \inner_loop, \convert
+        subs    COUNT, COUNT, #128 / out_bpp
         bcc     1705f
+ .endif
 1704:   process4  \bpp, \has_mask, 1, \inner_loop, \convert
-        subs    COUNT, COUNT, #4
+        subs    COUNT, COUNT, #128 / out_bpp
         bcs     1703b
-1705:   adds    COUNT, COUNT, #4 - 1
+1705:   adds    COUNT, COUNT, #128 / out_bpp - 1
         bcc     1707f
         @ drop through...
 1706:   process1 \bpp, \has_mask, 1, \inner_loop, \convert
@@ -245,7 +283,8 @@
                                               prefetch_distance_src_, \
                                               prefetch_distance_mask_, \
                                               init, \
-                                              convert
+                                              convert, \
+                                              out_bpp_
 
 /* void fname(uint32_t width,
  *            pixman_fixed_t x,
@@ -262,6 +301,11 @@ pixman_asm_function fname
  */
  .set prefetch_distance_src,  prefetch_distance_src_
  .set prefetch_distance_mask, prefetch_distance_mask_
+ .ifc "out_bpp_",""
+  .set out_bpp, 32
+ .else
+  .set out_bpp, out_bpp_
+ .endif
 
 /*
  * Assign symbolic names to registers
@@ -273,7 +317,8 @@ XLO     .req    a2  @ reduce only
 UX      .req    a3
 DST     .req    a4
 SRC     .req    v1
-MASK    .req    v2
+MASK    .req    v2  @ only when outputting 32bpp
+PIXEL2  .req    v2  @ only when outputting <32bpp and reducing
 PLDS    .req    v3
 PIXEL   .req    v4  @ enlarge only
 XHI     .req    v4  @ reduce only
@@ -292,6 +337,7 @@ TMP     .req    lr
         blo     1807f-4
         \init
         mla     WK2, COUNT, UX, X
+ .if out_bpp == 32
         bics    WK0, MASK, #31
         beq     1801f
         @ Use a simplified preload process for the mask,
@@ -302,6 +348,7 @@ TMP     .req    lr
    .set OFFSET, OFFSET + 32
   .endr
 1801:
+ .endif
         add     WK0, SRC, X, lsr #16 - (log2_\bpp - 3)
         bic     WK0, WK0, #31
         pld     [WK0]
@@ -323,11 +370,13 @@ TMP     .req    lr
         mov     ACCUM, X, lsl #16
         mov     UX, UX, lsl #16
         bic     SRC, SRC, #(\bpp-1)/8
+ .if out_bpp == 32
         teq     MASK, #0
         beq     1804f
         mov     VALID, #0
         process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert
 1804:
+ .endif
         ldrx    \bpp,, <PIXEL, [SRC]>
         \convert PIXEL, TMP
         process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert
@@ -338,9 +387,11 @@ TMP     .req    lr
         mov     XHI, X, lsr #16
         mov     XLO, X, lsl #16
         add     XHI, XHI, TMP, lsr #log2_\bpp - 3
+ .if out_bpp == 32
         teq     MASK, #0
         beq     1806f
         process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert
+ .endif
 1806:   process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert
 1807:
 
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 1c6c1e9..f21bb8f 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -186,6 +186,38 @@ PIXMAN_ARM_BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, r5g6b5, uint16_t)
 PIXMAN_ARM_BIND_GET_SCANLINE_BILINEAR_SCALED_COVER(armv6, a8, uint8_t)
 
 void
+pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6(uint32_t        width,
+                                                                 pixman_fixed_t  x,
+                                                                 pixman_fixed_t  ux,
+                                                                 uint16_t       *dest,
+                                                                 const uint16_t *source);
+
+static void
+armv6_composite_nearest_scaled_cover_src_0565_0565 (pixman_implementation_t *imp,
+                                                    pixman_composite_info_t *info)
+{   /* Composite wrapper: calls the 16bpp -> 16bpp nearest-scaled-cover asm scanline routine once per row. */
+    PIXMAN_COMPOSITE_ARGS (info); /* supplies the otherwise-undeclared locals used below (src_image, dest_image, width, height, ...) */
+    uint16_t      *dst_line, *dst;
+    uint16_t      *src_bits, *src;
+    int            dst_stride, src_stride;
+    pixman_fixed_t x, y, uxx, uxy, uyy; /* fixed-point start position and per-step increments */
+
+    PIXMAN_ARM_IMAGE_GET_SCALED (src_image, src_x, src_y, uint16_t, src_stride, src_bits, x, y, uxx, uxy, uyy); /* NOTE(review): macro defined elsewhere; assumed to set src_stride, src_bits, x, y, uxx, uxy, uyy - confirm */
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); /* NOTE(review): assumed to set dst_stride and dst_line for the first output row - confirm */
+
+    while (height--) /* one scanline call per iteration */
+    {
+        dst = dst_line;
+        dst_line += dst_stride; /* step to the next destination row */
+        src = src_bits + src_stride * pixman_fixed_to_int (y - pixman_fixed_e); /* source row chosen from y, biased down by pixman_fixed_e */
+        pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_armv6 (
+                width, x - pixman_fixed_e, uxx, (uint16_t *) dst, (uint16_t *) src); /* args: width, start x (biased by pixman_fixed_e), x step, dest, source row */
+        x += uxy; /* advance the start position for the next row */
+        y += uyy;
+    }
+}
+
+void
 pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
                                        int32_t   h,
                                        uint32_t *dst,
@@ -399,6 +431,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
 
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, r5g6b5, r5g6b5, src_0565_0565),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, b5g6r5, b5g6r5, src_0565_0565),
+
     PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, a8r8g8b8, r5g6b5, src_8888_0565),
     PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, x8r8g8b8, r5g6b5, src_8888_0565),
     PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH_VIA_ITER (SRC, a8b8g8r8, b5g6r5, src_8888_0565),
-- 
1.7.5.4