Mesa (master): vc4: Handle partial loads/stores of tiled textures.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Aug 9 00:06:39 UTC 2018


Module: Mesa
Branch: master
Commit: 25bee5ef9ea923bf0d99f5f8eb19082c449f3e53
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=25bee5ef9ea923bf0d99f5f8eb19082c449f3e53

Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan  4 14:08:10 2017 -0800

vc4: Handle partial loads/stores of tiled textures.

Previously, we would load out the tile-aligned area, update the raster
copy, and store it back.  This was a huge cost for XPutImage calls to the
screen under glamor.

Instead, implement a general load/store path that walks over the source
x/y writing into the corresponding pixel of the destination (using clever
math from
https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/).
If things are aligned, we go through the previous utile-at-a-time loop.

Improves x11perf -putimage10 performance by 139.777% +/- 2.83464% (n=5)
Improves x11perf -putimage100 performance by 383.908% +/- 22.6297% (n=11)
Improves x11perf -getimage10 performance by 2.75731% +/- 0.585054% (n=145)
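
A minimal sketch (not part of the patch; the x_mask value is made up) of the
addressing trick from the article linked above, which the new unaligned path
relies on: once the X bits of a swizzled offset are gathered into a mask,
stepping one pixel in X is a single subtract-and-mask, so the inner copy loop
never recomputes the full swizzle per pixel.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        /* Hypothetical interleaving: the X field occupies bits 0-1 and 4-5,
         * so x_mask holds four X bits and the trick is valid for x < 16.
         */
        uint32_t x_mask = 0x33;
        uint32_t offs = 0;

        for (uint32_t x = 1; x < 16; x++) {
                /* (offs - mask) & mask == ((offs | ~mask) + 1) & mask: the +1
                 * carries straight through the non-X bit positions, so only
                 * the X field advances.
                 */
                offs = (offs - x_mask) & x_mask;
                assert(offs == ((x & 0x3) | ((x & 0xc) << 2)));
        }
        return 0;
}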

---

 src/gallium/drivers/vc4/vc4_resource.c  |  46 +---------
 src/gallium/drivers/vc4/vc4_tiling.c    |  13 ---
 src/gallium/drivers/vc4/vc4_tiling_lt.c | 156 +++++++++++++++++++++++++++++++-
 3 files changed, 155 insertions(+), 60 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 381a618be2..f2adb29061 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -181,9 +181,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
 
         struct vc4_resource_slice *slice = &rsc->slices[level];
         if (rsc->tiled) {
-                uint32_t utile_w = vc4_utile_width(rsc->cpp);
-                uint32_t utile_h = vc4_utile_height(rsc->cpp);
-
                 /* No direct mappings of tiled, since we need to manually
                  * tile/untile.
                  */
@@ -204,49 +201,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                         ptrans->box.height = (ptrans->box.height + 3) >> 2;
                 }
 
-                /* We need to align the box to utile boundaries, since that's
-                 * what load/store operates on.  This may cause us to need to
-                 * read out the original contents in that border area.  Right
-                 * now we just read out the entire contents, including the
-                 * middle area that will just get overwritten.
-                 */
-                uint32_t box_start_x = ptrans->box.x & (utile_w - 1);
-                uint32_t box_start_y = ptrans->box.y & (utile_h - 1);
-                bool needs_load = (usage & PIPE_TRANSFER_READ) != 0;
-
-                if (box_start_x) {
-                        ptrans->box.width += box_start_x;
-                        ptrans->box.x -= box_start_x;
-                        needs_load = true;
-                }
-                if (box_start_y) {
-                        ptrans->box.height += box_start_y;
-                        ptrans->box.y -= box_start_y;
-                        needs_load = true;
-                }
-                if (ptrans->box.width & (utile_w - 1)) {
-                        /* We only need to force a load if our border region
-                         * we're extending into is actually part of the
-                         * texture.
-                         */
-                        uint32_t slice_width = u_minify(prsc->width0, level);
-                        if (ptrans->box.x + ptrans->box.width != slice_width)
-                                needs_load = true;
-                        ptrans->box.width = align(ptrans->box.width, utile_w);
-                }
-                if (ptrans->box.height & (utile_h - 1)) {
-                        uint32_t slice_height = u_minify(prsc->height0, level);
-                        if (ptrans->box.y + ptrans->box.height != slice_height)
-                                needs_load = true;
-                        ptrans->box.height = align(ptrans->box.height, utile_h);
-                }
-
                 ptrans->stride = ptrans->box.width * rsc->cpp;
                 ptrans->layer_stride = ptrans->stride * ptrans->box.height;
 
                 trans->map = malloc(ptrans->layer_stride * ptrans->box.depth);
 
-                if (needs_load) {
+                if (usage & PIPE_TRANSFER_READ) {
                         vc4_load_tiled_image(trans->map, ptrans->stride,
                                              buf + slice->offset +
                                              ptrans->box.z * rsc->cube_map_stride,
@@ -254,9 +214,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                                              slice->tiling, rsc->cpp,
                                              &ptrans->box);
                 }
-                return (trans->map +
-                        box_start_x * rsc->cpp +
-                        box_start_y * ptrans->stride);
+                return trans->map;
         } else {
                 ptrans->stride = slice->stride;
                 ptrans->layer_stride = ptrans->stride;
diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c
index 07e1c9c5f6..2da520eb4d 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.c
+++ b/src/gallium/drivers/vc4/vc4_tiling.c
@@ -63,15 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp)
                 height <= 4 * vc4_utile_height(cpp));
 }
 
-static void
-check_box_utile_alignment(const struct pipe_box *box, int cpp)
-{
-        assert(!(box->x & (vc4_utile_width(cpp) - 1)));
-        assert(!(box->y & (vc4_utile_height(cpp) - 1)));
-        assert(!(box->width & (vc4_utile_width(cpp) - 1)));
-        assert(!(box->height & (vc4_utile_height(cpp) - 1)));
-}
-
 /**
  * Takes a utile x and y (and the number of utiles of width of the image) and
  * returns the offset to the utile within a VC4_TILING_FORMAT_TF image.
@@ -216,8 +207,6 @@ vc4_load_tiled_image(void *dst, uint32_t dst_stride,
                      uint8_t tiling_format, int cpp,
                      const struct pipe_box *box)
 {
-        check_box_utile_alignment(box, cpp);
-
         if (tiling_format == VC4_TILING_FORMAT_LT) {
                 vc4_load_lt_image(dst, dst_stride,
                                   src, src_stride,
@@ -240,8 +229,6 @@ vc4_store_tiled_image(void *dst, uint32_t dst_stride,
                       uint8_t tiling_format, int cpp,
                       const struct pipe_box *box)
 {
-        check_box_utile_alignment(box, cpp);
-
         if (tiling_format == VC4_TILING_FORMAT_LT) {
                 vc4_store_lt_image(dst, dst_stride,
                                    src, src_stride,
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index 8c875e7bd3..ec42a3dc2f 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -41,6 +41,12 @@
 #define NEON_TAG(x) x ## _base
 #endif
 
+static inline uint32_t
+align_down(uint32_t val, uint32_t align)
+{
+        return val & ~(align - 1);
+}
+
 /** Returns the stride in bytes of a 64-byte microtile. */
 static uint32_t
 vc4_utile_stride(int cpp)
@@ -252,6 +258,66 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
 #endif
 
 }
+/**
+ * Returns the X value into the address bits for LT tiling.
+ *
+ * The LT tile load/stores rely on the X bits not intersecting with the Y
+ * bits.  Because of this, we have to choose to put the utile index within the
+ * LT tile into one of the two values, and we do so in swizzle_lt_x() to make
+ * NPOT handling easier.
+ */
+static uint32_t
+swizzle_lt_x(int x, int cpp)
+{
+        switch (cpp) {
+        case 1:
+                /* 8x8 inside of 4x4 */
+                return ((x & 0x7) << (0 - 0) |
+                        (x & ~0x7) << (6 - 3));
+        case 2:
+                /* 8x4 inside of 4x4 */
+                return ((x & 0x7) << (1 - 0) |
+                        (x & ~0x7) << (6 - 3));
+        case 4:
+                /* 4x4 inside of 4x4 */
+                return ((x & 0x3) << (2 - 0) |
+                        (x & ~0x3) << (6 - 2));
+        case 8:
+                /* 2x4 inside of 4x4 */
+                return ((x & 0x1) << (3 - 0) |
+                        (x & ~0x1) << (6 - 1));
+        default:
+                unreachable("bad cpp");
+        }
+}
+
+/**
+ * Returns the Y value into the address bits for LT tiling.
+ *
+ * The LT tile load/stores rely on the X bits not intersecting with the Y
+ * bits.
+ */
+static uint32_t
+swizzle_lt_y(int y, int cpp)
+{
+
+        switch (cpp) {
+        case 1:
+                /* 8x8 inside of 4x4 */
+                return ((y & 0x7) << 3);
+        case 2:
+                /* 8x4 inside of 4x4 */
+                return ((y & 0x3) << 4);
+        case 4:
+                /* 4x4 inside of 4x4 */
+                return ((y & 0x3) << 4);
+        case 8:
+                /* 2x4 inside of 4x4 */
+                return ((y & 0x3) << 4);
+        default:
+                unreachable("bad cpp");
+        }
+}
 
 /**
  * Helper for loading or storing to an LT image, where the box is aligned
@@ -261,9 +327,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
  * vc4_load_utile/vc4_store_utile helpers.
  */
 static inline void
-vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
-                    void *cpu, uint32_t cpu_stride,
-                    int cpp, const struct pipe_box *box, bool to_cpu)
+vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
+                     void *cpu, uint32_t cpu_stride,
+                     int cpp, const struct pipe_box *box, bool to_cpu)
 {
         uint32_t utile_w = vc4_utile_width(cpp);
         uint32_t utile_h = vc4_utile_height(cpp);
@@ -289,6 +355,90 @@ vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
         }
 }
 
+/**
+ * Helper for loading or storing to an LT image, where the box is not aligned
+ * to utiles.
+ *
+ * This walks through the raster-order data, copying to/from the corresponding
+ * tiled pixel.  This means we don't get write-combining on stores, but the
+ * loop is very few CPU instructions since the memcpy will be inlined.
+ */
+static inline void
+vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride,
+                       void *cpu, uint32_t cpu_stride,
+                       int cpp, const struct pipe_box *box, bool to_cpu)
+{
+
+        /* These are the address bits for the start of the box, split out into
+         * x/y so that they can be incremented separately in their loops.
+         */
+        uint32_t offs_x0 = swizzle_lt_x(box->x, cpp);
+        uint32_t offs_y = swizzle_lt_y(box->y, cpp);
+        /* The *_mask values are "what bits of the address are from x or y" */
+        uint32_t x_mask = swizzle_lt_x(~0, cpp);
+        uint32_t y_mask = swizzle_lt_y(~0, cpp);
+        uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp);
+
+        assert(!(x_mask & y_mask));
+
+        offs_x0 += incr_y * (box->y / vc4_utile_height(cpp));
+
+        for (uint32_t y = 0; y < box->height; y++) {
+                void *gpu_row = gpu + offs_y;
+
+                uint32_t offs_x = offs_x0;
+
+                for (uint32_t x = 0; x < box->width; x++) {
+                        /* Use a memcpy here to move a pixel's worth of data.
+                         * We're relying on this function to be inlined, so
+                         * this will get expanded into the appropriate 1, 2,
+                         * or 4-byte move.
+                         */
+                        if (to_cpu) {
+                                memcpy(cpu + x * cpp, gpu_row + offs_x, cpp);
+                        } else {
+                                memcpy(gpu_row + offs_x, cpu + x * cpp, cpp);
+                        }
+
+                        /* This math trick with x_mask increments offs_x by 1
+                         * in x.
+                         */
+                        offs_x = (offs_x - x_mask) & x_mask;
+                }
+
+                offs_y = (offs_y - y_mask) & y_mask;
+                /* When offs_y wraps (we hit the end of the utile), we
+                 * increment offs_x0 by effectively the utile stride.
+                 */
+                if (!offs_y)
+                        offs_x0 += incr_y;
+
+                cpu += cpu_stride;
+        }
+}
+
+/**
+ * General LT image load/store helper.
+ */
+static inline void
+vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
+                    void *cpu, uint32_t cpu_stride,
+                    int cpp, const struct pipe_box *box, bool to_cpu)
+{
+        if (box->x & (vc4_utile_width(cpp) - 1) ||
+            box->y & (vc4_utile_height(cpp) - 1) ||
+            box->width & (vc4_utile_width(cpp) - 1) ||
+            box->height & (vc4_utile_height(cpp) - 1)) {
+                vc4_lt_image_unaligned(gpu, gpu_stride,
+                                       cpu, cpu_stride,
+                                       cpp, box, to_cpu);
+        } else {
+                vc4_lt_image_aligned(gpu, gpu_stride,
+                                     cpu, cpu_stride,
+                                     cpp, box, to_cpu);
+        }
+}
+
 static inline void
 vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
                         void *cpu, uint32_t cpu_stride,
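
For reference, a standalone sanity check of the new swizzle helpers for the
cpp == 4 case (not from the commit: the two helpers below are reimplemented
here, and the check assumes an LT image stores its 64-byte utiles in raster
order with a utile-aligned width): swizzle_lt_y(y) + swizzle_lt_x(x), plus one
utile-row stride per four rows of Y, should land on the same byte offset as
plain utile arithmetic.

#include <assert.h>
#include <stdint.h>

/* cpp == 4: 4x4-pixel utiles, 16 bytes per row within a utile, 64 bytes per
 * utile.  swizzle_lt_x folds the utile column into the X bits (times 64
 * bytes); swizzle_lt_y only covers the Y position inside a utile.
 */
static uint32_t
swizzle_lt_x_cpp4(uint32_t x)
{
        return ((x & 0x3) << 2) | ((x & ~0x3u) << 4);
}

static uint32_t
swizzle_lt_y_cpp4(uint32_t y)
{
        return (y & 0x3) << 4;
}

int
main(void)
{
        uint32_t width_px = 64; /* hypothetical utile-aligned image width */
        /* Same idea as incr_y in the patch: swizzling the width in pixels
         * yields the byte stride of one row of utiles.
         */
        uint32_t incr_y = swizzle_lt_x_cpp4(width_px);

        for (uint32_t y = 0; y < 32; y++) {
                for (uint32_t x = 0; x < width_px; x++) {
                        uint32_t swizzled = incr_y * (y / 4) +
                                            swizzle_lt_y_cpp4(y) +
                                            swizzle_lt_x_cpp4(x);
                        /* Plain arithmetic: pick the utile, then the raster
                         * offset of the pixel inside it.
                         */
                        uint32_t direct = (y / 4) * (width_px / 4) * 64 +
                                          (x / 4) * 64 +
                                          (y & 3) * 16 +
                                          (x & 3) * 4;
                        assert(swizzled == direct);
                }
        }
        return 0;
}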



