[Pixman] [PATCH 30/32] armv6: Add nearest-scaled-cover src_8888_8888 fast path

Ben Avison bavison at riscosopen.org
Thu Aug 7 09:50:26 PDT 2014


Without this patch, any such operations are matched against the fast path
implementation in pixman-fast-path.c before general_composite_rect(), so
we never get to use the armv6-optimised assembly fetcher routines.

This patch adds a C wrapper to the same assembly routine used for the
nearest-scaled-cover fetcher, adapted to perform a 2D plot rather than a
single scanline. The C is macroised so that later patches can use the same
approach to build more complex fast paths from combinations of armv6
fetcher/combiner/writeback routines in a similar manner to
general_composite_rect().

lowlevel-blt-bench -n src_8888_8888:

     Before          After
     Mean   StdDev   Mean   StdDev  Confidence  Change
L1   117.2  1.6      79.2   1.1     100.0%      -32.4%
L2   44.1   3.1      49.9   2.4     100.0%      +13.2%
M    40.0   0.1      72.5   0.1     100.0%      +81.4%
HT   20.1   0.1      29.5   0.3     100.0%      +46.5%
VT   19.4   0.1      27.7   0.2     100.0%      +42.7%
R    18.2   0.1      26.2   0.2     100.0%      +44.1%
RT   8.7    0.2      10.0   0.2     100.0%      +15.8%

affine-bench * 0 0 1 src a8r8g8b8 a8r8g8b8:

     Before          After
     Mean   StdDev   Mean   StdDev  Confidence  Change
0.5  46.6   0.1      110.5  0.1     100.0%      +137.2%
0.75 39.1   0.1      88.5   0.1     100.0%      +126.1%
1.0  36.3   0.2      71.7   0.1     100.0%      +97.7%
1.5  26.7   0.1      55.3   0.1     100.0%      +106.8%
2.0  19.9   0.0      43.5   0.0     100.0%      +119.2%
---
 pixman/pixman-arm-common.h |    9 ++++
 pixman/pixman-arm-simd.c   |   96 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index a4d4ea4..2ddcbbc 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -469,6 +469,15 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp,                   \
      FAST_PATH_X_UNIT_POSITIVE            |                             \
      FAST_PATH_Y_UNIT_ZERO)
 
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH(cputype,op,s,d,func) \
+    {   PIXMAN_OP_ ## op,                       /* operator */                 \
+        PIXMAN_ ## s,                           /* format named by s */        \
+        PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS,  /* presumably its flags */     \
+        PIXMAN_null, 0,                         /* presumably: no mask */      \
+        PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, /* format named by d */        \
+        cputype ## _composite_nearest_scaled_cover_ ## func /* handler */      \
+    }   /* one brace initializer; used as an element of arm_simd_fast_paths[] (pixman_fast_path_t) */
+
 #define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name)                         \
 void                                                                        \
 pixman_get_scanline_##name##_asm_##cputype (int32_t        w,               \
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 6554530..7bc1e39 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -27,10 +27,16 @@
 #include <config.h>
 #endif
 
+#include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-arm-common.h"
 #include "pixman-inlines.h"
 
+#define SCANLINE_BUFFER_LENGTH 8192 /* size in bytes of the on-stack uint8_t scanline scratch array */
+/* Round addr up to the next multiple of 16 (unchanged if already a multiple); yields a uint8_t *. */
+#define ALIGN(addr)                                                     \
+    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
 		                   uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
@@ -83,6 +89,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
+/* No-op stand-in: BIND_NEAREST_SCALED_COVER_FAST_PATH_SRC_DST pastes this name when op=src; it only discards its arguments. */
+#define pixman_composite_scanline_src_asm_armv6(w, dst, src) do { (void)(w); (void)(dst); (void)(src); } while (0)
 void
 pixman_composite_scanline_src_mask_asm_armv6 (int32_t         w,
                                               uint32_t       *dst,
@@ -111,6 +119,8 @@ PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
+#define pixman_get_scanline_a8r8g8b8_asm_armv6(w, dst, src) do { (void)(w); (void)(dst); (void)(src); } while (0) /* no-op stand-in: name is pasted by the nearest-scaled-cover BIND macro when dst_format=a8r8g8b8 */
+#define pixman_write_back_a8r8g8b8_asm_armv6(w, dst, src)   do { (void)(w); (void)(dst); (void)(src); } while (0) /* no-op stand-in, same reason; only discards its arguments */
 PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5)
 PIXMAN_ARM_BIND_WRITE_BACK   (armv6, r5g6b5)
 PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a1r5g5b5)
@@ -181,6 +191,85 @@ BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, x8r8g8b8, uint32_t)
 BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, r5g6b5,   uint16_t)
 BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8,       uint8_t)
 
+#define NEAREST_SCALED_COVER_USES_SRC_BUFFER(op, src_format, dst_format) \
+    (PIXMAN_OP_##op != PIXMAN_OP_SRC || /* true when the source row */   \
+     (PIXMAN_##dst_format != PIXMAN_a8r8g8b8 && /* must be staged in */  \
+      (PIXMAN_##src_format != PIXMAN_r5g6b5 || PIXMAN_##dst_format != PIXMAN_r5g6b5))) /* src_buffer, not fetched straight into dst */
+/* True when the destination row must also be staged in dst_buffer: non-SRC operator and dst not a8r8g8b8. */
+#define NEAREST_SCALED_COVER_USES_DST_BUFFER(op, dst_format) \
+    (PIXMAN_OP_##op != PIXMAN_OP_SRC && PIXMAN_##dst_format != PIXMAN_a8r8g8b8)
+
+#define BIND_NEAREST_SCALED_COVER_FAST_PATH_SRC_DST(cputype, name, OP, op, src_type, dst_type, src_format, dst_format) \
+static void                                                                                                 \
+cputype##_composite_nearest_scaled_cover_##name (pixman_implementation_t *imp,                              \
+                                                 pixman_composite_info_t *info)                             \
+{   /* 2D wrapper: runs the per-scanline nearest-scaled-cover fetcher once per row. */                      \
+    uint8_t        stack_scanline_buffer[SCANLINE_BUFFER_LENGTH];                                           \
+    uint8_t       *scanline_buffer = stack_scanline_buffer;                                                 \
+    PIXMAN_COMPOSITE_ARGS (info);                                                                           \
+    dst_type      *dst_line, *dst;                                                                          \
+    src_type      *src_bits, *src;                                                                          \
+    uint32_t      *end_of_buffer, *dst_buffer, *src_buffer;                                                 \
+    int            dst_stride, src_stride;                                                                  \
+    pixman_fixed_t x, y, uxx, uxy, uyy;                                                                     \
+    /* Carve the optional src and dst scratch regions out of the 16-byte-aligned buffer. */                 \
+    end_of_buffer = dst_buffer = src_buffer = (uint32_t *) ALIGN (scanline_buffer);                         \
+    if (NEAREST_SCALED_COVER_USES_SRC_BUFFER (OP, src_format, dst_format))                                  \
+        end_of_buffer = dst_buffer = (uint32_t *) ALIGN (src_buffer + width);                               \
+    if (NEAREST_SCALED_COVER_USES_DST_BUFFER (OP, dst_format))                                              \
+        end_of_buffer = dst_buffer + width;                                                                 \
+    if (NEAREST_SCALED_COVER_USES_SRC_BUFFER (OP, src_format, dst_format) &&                                \
+        (uint8_t *) end_of_buffer > scanline_buffer + sizeof stack_scanline_buffer)                         \
+    {   /* Too wide for the stack buffer: use the heap (presumably a*b+c bytes; c = ALIGN slack). */        \
+        scanline_buffer = pixman_malloc_ab_plus_c (end_of_buffer - src_buffer, sizeof (uint32_t), 15);      \
+        /* On allocation failure, return without drawing. */                                                \
+        if (!scanline_buffer)                                                                               \
+            return;                                                                                         \
+        /* Re-derive both region pointers inside the heap block. */                                         \
+        src_buffer = (uint32_t *) ALIGN (scanline_buffer);                                                  \
+        dst_buffer = (uint32_t *) ALIGN (src_buffer + width);                                               \
+    }                                                                                                       \
+    /* NOTE(review): both macros are defined elsewhere; presumably they set the bits/stride/step vars. */   \
+    PIXMAN_IMAGE_GET_SCALED (src_image, src_x, src_y, src_type, src_stride, src_bits, x, y, uxx, uxy, uyy); \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, dst_stride, dst_line, 1);                  \
+    /* One pass per destination row: fetch the scaled source, then combine/write back if needed. */         \
+    while (height--)                                                                                        \
+    {                                                                                                       \
+        dst = dst_line;                                                                                     \
+        dst_line += dst_stride;                                                                             \
+        src = src_bits + src_stride * pixman_fixed_to_int (y - pixman_fixed_e);                             \
+        if (PIXMAN_OP_##OP == PIXMAN_OP_SRC &&                                                              \
+            PIXMAN_##src_format == PIXMAN_r5g6b5 &&                                                         \
+            PIXMAN_##dst_format == PIXMAN_r5g6b5)                                                           \
+            pixman_get_scanline_r5g6b5_nearest_scaled_cover_r5g6b5_asm_##cputype (                          \
+                    width, x - pixman_fixed_e, uxx, (uint16_t *) dst, (uint16_t *) src);                    \
+        else if (NEAREST_SCALED_COVER_USES_SRC_BUFFER (OP, src_format, dst_format))                         \
+            pixman_get_scanline_nearest_scaled_cover_##src_format##_asm_##cputype (                         \
+                    width, x - pixman_fixed_e, uxx, src_buffer, src, NULL);                                 \
+        else /* no scratch needed: fetch straight into the destination row */                               \
+            pixman_get_scanline_nearest_scaled_cover_##src_format##_asm_##cputype (                         \
+                    width, x - pixman_fixed_e, uxx, (uint32_t *) dst, src, NULL);                           \
+        if (NEAREST_SCALED_COVER_USES_DST_BUFFER (OP, dst_format))                                          \
+        {   /* stage dst in scratch, combine into it, then write it back */                                 \
+            pixman_get_scanline_##dst_format##_asm_##cputype (width, dst_buffer, (uint32_t *) dst);         \
+            pixman_composite_scanline_##op##_asm_##cputype (width, dst_buffer, src_buffer);                 \
+            pixman_write_back_##dst_format##_asm_##cputype (width, (uint32_t *) dst, dst_buffer);           \
+        }                                                                                                   \
+        else if (PIXMAN_OP_##OP != PIXMAN_OP_SRC) /* dst is a8r8g8b8 here: combine in place */              \
+            pixman_composite_scanline_##op##_asm_##cputype (width, (uint32_t *) dst, src_buffer);           \
+        else if (NEAREST_SCALED_COVER_USES_SRC_BUFFER (OP, src_format, dst_format))                         \
+            pixman_write_back_##dst_format##_asm_##cputype (width, (uint32_t *) dst, src_buffer);           \
+        x += uxy; /* advance the source start position for the next row */                                  \
+        y += uyy;                                                                                           \
+    }                                                                                                       \
+    /* Free the scratch buffer only if it was heap-allocated above. */                                      \
+    if (NEAREST_SCALED_COVER_USES_SRC_BUFFER (OP, src_format, dst_format) &&                                \
+        scanline_buffer != stack_scanline_buffer)                                                           \
+        free (scanline_buffer);                                                                             \
+}
+
+/* SRC, a8r8g8b8 -> a8r8g8b8: both USES_*_BUFFER tests are false, so each row is fetched straight into dst. */
+BIND_NEAREST_SCALED_COVER_FAST_PATH_SRC_DST (armv6, src_8888_8888,  SRC,  src,  uint32_t, uint32_t, a8r8g8b8, a8r8g8b8)
 void
 pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
                                        int32_t   h,
@@ -390,6 +479,13 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
 
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8r8g8b8, a8r8g8b8, src_8888_8888),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8r8g8b8, x8r8g8b8, src_8888_8888),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, x8r8g8b8, x8r8g8b8, src_8888_8888),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8b8g8r8, a8b8g8r8, src_8888_8888),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, a8b8g8r8, x8b8g8r8, src_8888_8888),
+    PIXMAN_ARM_NEAREST_SCALED_COVER_SRC_DST_FAST_PATH (armv6, SRC, x8b8g8r8, x8b8g8r8, src_8888_8888),
+
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
 
-- 
1.7.5.4



More information about the Pixman mailing list