Mesa (master): iris: Use copy_region and staging resources to avoid transfer stalls

Fri Mar 8 21:30:45 UTC 2019

Module: Mesa
Branch: master
Commit: 9d1334d2a0f983e175ffe371bee33f4ce048f910
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9d1334d2a0f983e175ffe371bee33f4ce048f910

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Fri Dec 21 03:04:18 2018 -0800

iris: Use copy_region and staging resources to avoid transfer stalls

This is similar to intel_miptree_map_blit and intel_buffer_object.c's
temporary blits in i965.

Improves performance of DiRT Rally by 20-25% by eliminating stalls.

Breaks piglit's spec/arb_shader_image_load_store/host-mem-barrier,
by using the GPU to do uploads, exposing a st/mesa issue where it
doesn't give us memory_barrier() calls.  This is a pre-existing issue
and will be fixed by a later patch (currently out for review).

---

 src/gallium/drivers/iris/iris_resource.c | 167 ++++++++++++++++++++++++++++---
 src/gallium/drivers/iris/iris_resource.h |   5 +
 src/gallium/drivers/iris/iris_screen.c   |   2 +
 src/gallium/drivers/iris/iris_screen.h   |   1 +
 4 files changed, 161 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
index 03ce35462fa..5040637ee77 100644
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -794,6 +794,102 @@ iris_resource_get_handle(struct pipe_screen *pscreen,
 }
 
 static void
+iris_unmap_copy_region(struct iris_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   struct pipe_box *dst_box = &xfer->box;
+   struct pipe_box src_box = (struct pipe_box) {
+      .x = xfer->resource->target == PIPE_BUFFER ?
+           xfer->box.x % IRIS_MAP_BUFFER_ALIGNMENT : 0,
+      .width = dst_box->width,
+      .height = dst_box->height,
+      .depth = dst_box->depth,
+   };
+
+   if (xfer->usage & PIPE_TRANSFER_WRITE) {
+      iris_copy_region(map->blorp, map->batch, xfer->resource, xfer->level,
+                       dst_box->x, dst_box->y, dst_box->z, map->staging, 0,
+                       &src_box);
+   }
+
+   iris_resource_destroy(map->staging->screen, map->staging);
+
+   map->ptr = NULL;
+}
+
+static void
+iris_map_copy_region(struct iris_transfer *map)
+{
+   struct pipe_screen *pscreen = &map->batch->screen->base;
+   struct pipe_transfer *xfer = &map->base;
+   struct pipe_box *box = &xfer->box;
+   struct iris_resource *res = (void *) xfer->resource;
+
+   unsigned extra = xfer->resource->target == PIPE_BUFFER ?
+                    box->x % IRIS_MAP_BUFFER_ALIGNMENT : 0;
+
+   struct pipe_resource templ = (struct pipe_resource) {
+      .usage = PIPE_USAGE_STAGING,
+      .width0 = box->width + extra,
+      .height0 = box->height,
+      .depth0 = 1,
+      .nr_samples = xfer->resource->nr_samples,
+      .nr_storage_samples = xfer->resource->nr_storage_samples,
+      .array_size = box->depth,
+   };
+
+   if (xfer->resource->target == PIPE_BUFFER)
+      templ.target = PIPE_BUFFER;
+   else if (templ.array_size > 1)
+      templ.target = PIPE_TEXTURE_2D_ARRAY;
+   else
+      templ.target = PIPE_TEXTURE_2D;
+
+   /* Depth, stencil, and ASTC can't be linear surfaces, so we can't use
+    * xfer->resource->format directly.  Pick a bpb compatible format so
+    * resource creation will succeed; blorp_copy will override it anyway.
+    */
+   switch (util_format_get_blocksizebits(res->internal_format)) {
+   case 8:   templ.format = PIPE_FORMAT_R8_UINT;           break;
+   case 16:  templ.format = PIPE_FORMAT_R8G8_UINT;         break;
+   case 24:  templ.format = PIPE_FORMAT_R8G8B8_UINT;       break;
+   case 32:  templ.format = PIPE_FORMAT_R8G8B8A8_UINT;     break;
+   case 48:  templ.format = PIPE_FORMAT_R16G16B16_UINT;    break;
+   case 64:  templ.format = PIPE_FORMAT_R16G16B16A16_UINT; break;
+   case 96:  templ.format = PIPE_FORMAT_R32G32B32_UINT;    break;
+   case 128: templ.format = PIPE_FORMAT_R32G32B32A32_UINT; break;
+   default: unreachable("Invalid bpb");
+   }
+
+   map->staging = iris_resource_create(pscreen, &templ);
+   assert(map->staging);
+
+   if (templ.target != PIPE_BUFFER) {
+      struct isl_surf *surf = &((struct iris_resource *) map->staging)->surf;
+      xfer->stride = isl_surf_get_row_pitch_B(surf);
+      xfer->layer_stride = isl_surf_get_array_pitch(surf);
+   }
+
+   if (!(xfer->usage & PIPE_TRANSFER_DISCARD_RANGE)) {
+      iris_copy_region(map->blorp, map->batch, map->staging, 0, extra, 0, 0,
+                       xfer->resource, xfer->level, box);
+      /* Ensure writes to the staging BO land before we map it below. */
+      iris_emit_pipe_control_flush(map->batch,
+                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+   }
+
+   struct iris_bo *staging_bo = iris_resource_bo(map->staging);
+
+   if (iris_batch_references(map->batch, staging_bo))
+      iris_batch_flush(map->batch);
+
+   map->ptr = iris_bo_map(map->dbg, staging_bo, xfer->usage) + extra;
+
+   map->unmap = iris_unmap_copy_region;
+}
+
+static void
 get_image_offset_el(struct isl_surf *surf, unsigned level, unsigned z,
                     unsigned *out_x0_el, unsigned *out_y0_el)
 {
@@ -1099,9 +1195,7 @@ iris_transfer_map(struct pipe_context *ctx,
     if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)
        usage |= PIPE_TRANSFER_DISCARD_RANGE;
 
-   if (surf->tiling != ISL_TILING_LINEAR &&
-       (usage & PIPE_TRANSFER_MAP_DIRECTLY))
-      return NULL;
+   bool map_would_stall = false;
 
    if (resource->target != PIPE_BUFFER) {
       iris_resource_access_raw(ice, &ice->batches[IRIS_BATCH_RENDER], res,
@@ -1110,13 +1204,18 @@ iris_transfer_map(struct pipe_context *ctx,
    }
 
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
-      for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
-         if (iris_batch_references(&ice->batches[i], res->bo))
-            iris_batch_flush(&ice->batches[i]);
-      }
+      map_would_stall = iris_bo_busy(res->bo);
+
+      for (int i = 0; i < IRIS_BATCH_COUNT; i++)
+         map_would_stall |= iris_batch_references(&ice->batches[i], res->bo);
+
+      if (map_would_stall && (usage & PIPE_TRANSFER_DONTBLOCK) &&
+                             (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+         return NULL;
    }
 
-   if ((usage & PIPE_TRANSFER_DONTBLOCK) && iris_bo_busy(res->bo))
+   if (surf->tiling != ISL_TILING_LINEAR &&
+       (usage & PIPE_TRANSFER_MAP_DIRECTLY))
       return NULL;
 
    struct iris_transfer *map = slab_alloc(&ice->transfer_pool);
@@ -1141,13 +1240,53 @@ iris_transfer_map(struct pipe_context *ctx,
                    PIPE_TRANSFER_COHERENT |
                    PIPE_TRANSFER_DISCARD_RANGE);
 
-   if (surf->tiling == ISL_TILING_W) {
-      // XXX: just teach iris_map_tiled_memcpy about W tiling...
-      iris_map_s8(map);
-   } else if (surf->tiling != ISL_TILING_LINEAR) {
-      iris_map_tiled_memcpy(map);
+   /* Avoid using GPU copies for persistent/coherent buffers, as the idea
+    * there is to access them simultaneously on the CPU & GPU.  This also
+    * avoids trying to use GPU copies for our u_upload_mgr buffers which
+    * contain state we're constructing for a GPU draw call, which would
+    * kill us with infinite stack recursion.
+    */
+   bool no_gpu = usage & (PIPE_TRANSFER_PERSISTENT |
+                          PIPE_TRANSFER_COHERENT |
+                          PIPE_TRANSFER_MAP_DIRECTLY);
+
+   /* GPU copies are not useful for buffer reads.  Instead of stalling to
+    * read from the original buffer, we'd simply copy it to a temporary...
+    * then stall (a bit longer) to read from that buffer.
+    *
+    * Images are less clear-cut.  Color resolves are destructive, removing
+    * the underlying compression, so we'd rather blit the data to a linear
+    * temporary and map that, to avoid the resolve.  (It might be better to
+    * a tiled temporary and use the tiled_memcpy paths...)
+    */
+   if (!(usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+       res->aux.usage != ISL_AUX_USAGE_CCS_E &&
+       res->aux.usage != ISL_AUX_USAGE_CCS_D) {
+      no_gpu = true;
+   }
+
+   if (map_would_stall && !no_gpu) {
+      /* If we need a synchronous mapping and the resource is busy,
+       * we copy to/from a linear temporary buffer using the GPU.
+       */
+      map->batch = &ice->batches[IRIS_BATCH_RENDER];
+      map->blorp = &ice->blorp;
+      iris_map_copy_region(map);
    } else {
-      iris_map_direct(map);
+      /* Otherwise we're free to map on the CPU.  Flush if needed. */
+      for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
+         if (iris_batch_references(&ice->batches[i], res->bo))
+            iris_batch_flush(&ice->batches[i]);
+      }
+
+      if (surf->tiling == ISL_TILING_W) {
+         /* TODO: Teach iris_map_tiled_memcpy about W-tiling... */
+         iris_map_s8(map);
+      } else if (surf->tiling != ISL_TILING_LINEAR) {
+         iris_map_tiled_memcpy(map);
+      } else {
+         iris_map_direct(map);
+      }
    }
 
    return map->ptr;
diff --git a/src/gallium/drivers/iris/iris_resource.h b/src/gallium/drivers/iris/iris_resource.h
index 8b5d176191e..d9cb50de8b6 100644
--- a/src/gallium/drivers/iris/iris_resource.h
+++ b/src/gallium/drivers/iris/iris_resource.h
@@ -180,6 +180,11 @@ struct iris_transfer {
    void *buffer;
    void *ptr;
 
+   /** A linear staging resource for GPU-based copy_region transfers. */
+   struct pipe_resource *staging;
+   struct blorp_context *blorp;
+   struct iris_batch *batch;
+
    void (*unmap)(struct iris_transfer *);
 };
 
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index 909fb876fbe..f18f986547e 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -201,6 +201,8 @@ iris_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
       /* 3DSTATE_CONSTANT_XS requires the start of UBOs to be 32B aligned */
       return 32;
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return IRIS_MAP_BUFFER_ALIGNMENT;
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       /* Choose a cacheline (64 bytes) so that we can safely have the CPU and
        * GPU writing the same SSBO on non-coherent systems (Atom CPUs).  With
diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h
index 77fbd906b73..386a5ba6343 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -38,6 +38,7 @@ struct iris_bo;
 
 #define IRIS_MAX_TEXTURE_SAMPLERS 32
 #define IRIS_MAX_SOL_BUFFERS 4
+#define IRIS_MAP_BUFFER_ALIGNMENT 64
 
 struct iris_screen {
    struct pipe_screen base;