pixman: Branch 'master' - 2 commits

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Apr 18 04:01:29 UTC 2025


 pixman/pixman-arma64-neon-asm.S |   95 ++++++++++++++----------------------
 pixman/pixman-arma64-neon-asm.h |   21 +++-----
 test/meson.build                |    1 +
 test/neg-stride-test.c          |  105 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 70 deletions(-)

New commits:
commit 106323bc15c7eb89a033cf8ff38f52bca8a4b53c
Author: Joel May <git at jmay.us>
Date:   Wed Apr 9 17:54:25 2025 -0700

    Fix arm64 advanced prefetcher
    
    https://gitlab.freedesktop.org/pixman/pixman/-/issues/119
    
    When the advanced preloader reached the end of a scanline, it advanced
    the `PF_SRC`, `PF_DST`, and `PF_MASK` addresses by 1 instead of
    advancing to the next scanline.
    
    Most of the time, this means the prefetcher is simply prefetching
    from the wrong place in the image.  But when the stride is negative,
    this can result in a segfault when trying to read memory beyond the
    end of the image buffer.
    
    I have fixed this by combining the bitshift and add instructions to
    directly update the address register by the correct amount before
    prefetching.
    
    My fix results in the same behavior as the code in
    `pixman/pixman-arm-neon-asm.h`:
    ```
        PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    ```
    This instruction advances `PF_SRC` by the offset
    `SRC_STRIDE << #src_bpp_shift` and then reads the byte at the updated
    `PF_SRC` into `DUMMY`.  (The `ge` condition has been replaced with a
    separate branch instruction on aarch64.)
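    
    The aarch64 equivalent after this patch folds the shift into the add
    and then loads through the updated pointer.  For contrast, here are
    the buggy and fixed sequences for the source pointer, both quoted
    from the diff below:
    ```
        /* before: load from the strided offset, but advance PF_SRC by 1 */
        PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
        PF ldrsb, DUMMY, [PF_SRC, DUMMY]
        PF add, PF_SRC, PF_SRC, #1
    
        /* after: advance PF_SRC by the strided offset, then load */
        PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
        PF ldrsb, DUMMY, [PF_SRC]
    ```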
    
    I also cleaned up a couple of other cases where instructions were
    redundant.

diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
index 7329d4b..519ff5b 100644
--- a/pixman/pixman-arma64-neon-asm.S
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -305,16 +305,14 @@
         mov         v28.d[0], v14.d[0]
         mov         v29.d[0], v14.d[1]
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
     raddhn      v20.8b, v10.8h, v17.8h
     raddhn      v23.8b, v11.8h, v19.8h
                                     PF ble, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_SRC, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
     raddhn      v22.8b, v12.8h, v18.8h
         st1         {v14.8h}, [DST_W], #16
@@ -497,9 +495,8 @@ generate_composite_function \
     ushll       v14.8h, v2.8b, #7
     sli         v14.8h, v14.8h, #1
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
     ushll       v9.8h, v0.8b, #7
     sli         v9.8h, v9.8h, #1
@@ -585,12 +582,10 @@ generate_composite_function \
 10:
     uqadd       v28.8b, v0.8b, v4.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
     uqadd       v29.8b, v1.8b, v5.8b
     uqadd       v30.8b, v2.8b, v6.8b
@@ -631,12 +626,10 @@ generate_composite_function \
 10:
     uqadd       v28.8b, v0.8b, v4.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
     uqadd       v29.8b, v1.8b, v5.8b
     uqadd       v30.8b, v2.8b, v6.8b
@@ -719,15 +712,13 @@ generate_composite_function_single_scanline \
 10:
     umull      v9.8h, v22.8b, v5.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
     umull      v10.8h, v22.8b, v6.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
      umull     v11.8h, v22.8b, v7.8b
 .endm
@@ -793,15 +784,13 @@ generate_composite_function_single_scanline \
 10:
     umull      v9.8h, v22.8b, v5.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
     umull      v10.8h, v22.8b, v6.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
     umull      v11.8h, v22.8b, v7.8b
 .endm
@@ -886,9 +875,8 @@ generate_composite_function_single_scanline \
                                     PF subs, PF_CTL, PF_CTL, #0x10
     umull       v11.8h, v24.8b, v7.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
         st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
 .endm
@@ -950,9 +938,8 @@ generate_composite_function \
     umull       v9.8h, v22.8b, v5.8b
     umull       v10.8h, v22.8b, v6.8b
                                     PF blt, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
     umull       v11.8h, v22.8b, v7.8b
 .endm
@@ -1436,9 +1423,8 @@ generate_composite_function \
 10:
     umull          v11.8h, v24.8b, v3.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-                                    PF add, PF_MASK, PF_MASK, #1
+                                    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_MASK]
 10:
         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
     ursra       v8.8h, v8.8h, #8
@@ -1517,9 +1503,8 @@ generate_composite_function \
 10:
     umull       v3.8h,  v27.8b, v16.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-                                    PF add, PF_MASK, PF_MASK, #1
+                                    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_MASK]
 10:
         st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
     ursra       v0.8h, v0.8h,  #8
@@ -1628,15 +1613,13 @@ generate_composite_function \
 10:
     umull       v19.8h, v24.8b, v11.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-                                    PF add, PF_DST, PF_DST, #1
+                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_DST]
 10:
         uqadd       v28.8b, v0.8b, v28.8b
                                     PF ble, 10f
-                                    PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-                                    PF add, PF_MASK, PF_MASK, #1
+                                    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_MASK]
 10:
         uqadd        v29.8b, v1.8b, v29.8b
         uqadd        v30.8b, v2.8b, v30.8b
@@ -2699,9 +2682,8 @@ generate_composite_function \
                                     PF sub, PF_X, PF_X, ORIG_W
                                     PF subs, PF_CTL, PF_CTL, #0x10
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
 .endm
 
@@ -2768,9 +2750,8 @@ generate_composite_function \
                                     PF sub, PF_X, PF_X, ORIG_W
                                     PF subs, PF_CTL, PF_CTL, #0x10
                                     PF ble, 10f
-                                    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-                                    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-                                    PF add, PF_SRC, PF_SRC, #1
+                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+                                    PF ldrsb, DUMMY, [PF_SRC]
 10:
 .endm
 
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
index ec3d76f..516ebb8 100644
--- a/pixman/pixman-arma64-neon-asm.h
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -474,25 +474,21 @@
     PF lsl, DUMMY, PF_X, #mask_bpp_shift
     PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
 .endif
-    PF ble, 71f
+    PF ble, 72f
     PF sub, PF_X, PF_X, ORIG_W
     PF subs, PF_CTL, PF_CTL, #0x10
-71:
     PF ble, 72f
 .if src_bpp_shift >= 0
-    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-    PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-    PF add, PF_SRC, PF_SRC, #1
+    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
+    PF ldrsb, DUMMY, [PF_SRC]
 .endif
 .if dst_r_bpp != 0
-    PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-    PF ldrsb, DUMMY, [PF_DST, DUMMY]
-    PF add, PF_DST, PF_DST, #1
+    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
+    PF ldrsb, DUMMY, [PF_DST]
 .endif
 .if mask_bpp_shift >= 0
-    PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-    PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-    PF add, PF_MASK, PF_MASK, #1
+    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
+    PF ldrsb, DUMMY, [PF_MASK]
 .endif
 72:
 .endif
@@ -858,8 +854,7 @@
     PF mov,     PF_DST, DST_R
     PF mov,     PF_MASK, MASK
     /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */
-    PF lsl,     DUMMY, H, #4
-    PF mov,     PF_CTL, DUMMY
+    PF lsl,     PF_CTL, H, #4
     PF add,     PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
 
     \init
commit fdbb0d944c78b2952bc99235c5bafc60011925a8
Author: Joel May <git at jmay.us>
Date:   Thu Apr 10 10:12:11 2025 -0700

    Test case for compositing with a negative stride
    
    The aarch64 advanced prefetcher has a bug where it can read past the
    end of the image buffers.  Typically this causes no ill effects (beyond
    making the attempted prefetch useless), because it's simply prefetching
    junk data and discarding it.  It causes problems (i.e. a segfault) when
    it tries to read memory that is not readable, such as a memory map with
    no read permission.
    
    To expose this specific bug, we need a test case with a negative stride
    and enough height that `last_scanline_ptr + height` runs past the end
    of the image buffer (this expression is approximate and omits some
    small terms).  Using `fence_malloc`, we can catch the problem with a
    segfault when the prefetch attempts to read memory beyond the end of
    the image buffer.
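    
    With the values used in the test below (height 1000, stride -16), the
    arithmetic works out as follows.  This is a minimal sketch in plain C,
    not part of the test; the numbers are only illustrative of the
    approximation above:
    ```
    #include <stdio.h>
    #include <stdlib.h>
    
    int
    main (void)
    {
        int   height = 1000, stride = -16;
        int   size   = height * abs (stride); /* 16000 bytes            */
        char *buf    = malloc (size);         /* fence_malloc in test   */
    
        /* With a negative stride, pixman is handed a pointer to the
         * start of the *last* scanline in memory; each following row
         * lies `stride` bytes *before* the previous one. */
        char *data = buf + size + stride;     /* buf + 15984            */
    
        /* The buggy prefetcher advanced by +1 per scanline instead of
         * by `stride`, so it drifts up to roughly data + height =
         * buf + 16984, which is past the end of `buf`. */
        printf ("buffer ends at %p; buggy prefetch reaches ~%p\n",
                (void *) (buf + size), (void *) (data + height));
    
        free (buf);
        return 0;
    }
    ```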

diff --git a/test/meson.build b/test/meson.build
index 47dd33c..e652956 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -51,6 +51,7 @@ tests = [
   'scaling-test',
   'composite',
   'tolerance-test',
+  'neg-stride-test',
 ]
 
 # Remove/update this once thread-test.c supports threading methods
diff --git a/test/neg-stride-test.c b/test/neg-stride-test.c
new file mode 100644
index 0000000..ed10479
--- /dev/null
+++ b/test/neg-stride-test.c
@@ -0,0 +1,105 @@
+/*
+ * Test program, which tests negative strides in compositing with fence_malloc
+ * to check for out-of-bounds memory access.
+ */
+#include "utils.h"
+
+#if FENCE_MALLOC_ACTIVE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+typedef struct
+{
+    int                     width;
+    int                     height;
+    int                     stride;
+    pixman_format_code_t    format;
+} image_info_t;
+
+typedef struct
+{
+    pixman_op_t     op;
+
+    image_info_t    src;
+    image_info_t    dest;
+
+    int             src_x;
+    int             src_y;
+    int             dest_x;
+    int             dest_y;
+    int             width;
+    int             height;
+} composite_info_t;
+
+const composite_info_t info =
+{
+    PIXMAN_OP_SRC,
+    { 16, 1000, -16, PIXMAN_a8r8g8b8 },
+    { 16, 1000, -16, PIXMAN_a8r8g8b8 },
+    0, 0,
+    0, 0,
+    16, 1000
+};
+
+static pixman_image_t *
+make_image (const image_info_t *info, char **buf)
+{
+    int size = info->height * abs (info->stride);
+    char *data = fence_malloc (size);
+    int i;
+
+    for (i = 0; i < size; ++i)
+        data[i] = (i % 255) ^ (((i % 16) << 4) | (i & 0xf0));
+
+    *buf = data;
+    if (info->stride < 0)
+        /* Move to the start of the last scanline. */
+        data += size + info->stride;
+
+    return pixman_image_create_bits (info->format,
+                                     info->width,
+                                     info->height,
+                                     (uint32_t *)data,
+                                     info->stride);
+}
+
+static void
+test_composite (const composite_info_t *info)
+{
+    char *src_buf;
+    char *dest_buf;
+    pixman_image_t *src = make_image (&info->src, &src_buf);
+    pixman_image_t *dest = make_image (&info->dest, &dest_buf);
+
+    pixman_image_composite (PIXMAN_OP_SRC, src, NULL, dest,
+                            info->src_x, info->src_y,
+                            0, 0,
+                            info->dest_x, info->dest_y,
+                            info->width, info->height);
+
+    fence_free (src_buf);
+    fence_free (dest_buf);
+    pixman_image_unref (src);
+    pixman_image_unref (dest);
+}
+
+int
+main (int argc, char **argv)
+{
+    test_composite (&info);
+
+    return 0;
+}
+
+#else /* FENCE_MALLOC_ACTIVE */
+
+int
+main (int argc, char **argv)
+{
+    /* Automake return code for test SKIP. */
+    return 77;
+}
+
+#endif /* FENCE_MALLOC_ACTIVE */
