Mesa (main): gallium/u_threaded: implement pipelined partial buffer uploads using CPU storage

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Oct 19 13:45:27 UTC 2021


Module: Mesa
Branch: main
Commit: 3df9d8ed807a6693d5fc8cbda4faec28af081ff3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3df9d8ed807a6693d5fc8cbda4faec28af081ff3

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Oct 17 17:16:09 2021 -0400

gallium/u_threaded: implement pipelined partial buffer uploads using CPU storage

This removes resource_copy_region for BufferSubData. Drivers have to opt in
to use this.

See the comment in the header file.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13399>

---

 src/gallium/auxiliary/driver_noop/noop_pipe.c      |   2 +-
 src/gallium/auxiliary/util/u_threaded_context.c    | 108 +++++++++++++++++++--
 src/gallium/auxiliary/util/u_threaded_context.h    |  31 +++++-
 src/gallium/drivers/crocus/crocus_resource.c       |   2 +-
 src/gallium/drivers/freedreno/freedreno_resource.c |   2 +-
 src/gallium/drivers/iris/iris_resource.c           |   2 +-
 src/gallium/drivers/r600/r600_buffer_common.c      |   2 +-
 src/gallium/drivers/radeonsi/si_buffer.c           |   2 +-
 src/gallium/drivers/zink/zink_resource.c           |   2 +-
 9 files changed, 138 insertions(+), 15 deletions(-)

diff --git a/src/gallium/auxiliary/driver_noop/noop_pipe.c b/src/gallium/auxiliary/driver_noop/noop_pipe.c
index 73d35d003cf..19421dec596 100644
--- a/src/gallium/auxiliary/driver_noop/noop_pipe.c
+++ b/src/gallium/auxiliary/driver_noop/noop_pipe.c
@@ -120,7 +120,7 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen,
       FREE(nresource);
       return NULL;
    }
-   threaded_resource_init(&nresource->b.b);
+   threaded_resource_init(&nresource->b.b, false, 0);
    return &nresource->b.b;
 }
 
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 8eb7f86f161..dd45bbf58e3 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -71,6 +71,12 @@ typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call, uint64_t *
 
 static const tc_execute execute_func[TC_NUM_CALLS];
 
+static void
+tc_buffer_subdata(struct pipe_context *_pipe,
+                  struct pipe_resource *resource,
+                  unsigned usage, unsigned offset,
+                  unsigned size, const void *data);
+
 static void
 tc_batch_check(UNUSED struct tc_batch *batch)
 {
@@ -669,18 +675,32 @@ tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf,
    return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage);
 }
 
+/**
+ * allow_cpu_storage should be false for user memory and imported buffers.
+ */
 void
-threaded_resource_init(struct pipe_resource *res)
+threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage,
+                       unsigned map_buffer_alignment)
 {
    struct threaded_resource *tres = threaded_resource(res);
 
    tres->latest = &tres->b;
+   tres->cpu_storage = NULL;
    util_range_init(&tres->valid_buffer_range);
    tres->is_shared = false;
    tres->is_user_ptr = false;
    tres->buffer_id_unique = 0;
    tres->pending_staging_uploads = 0;
    util_range_init(&tres->pending_staging_uploads_range);
+
+   if (allow_cpu_storage &&
+       !(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
+                       PIPE_RESOURCE_FLAG_SPARSE |
+                       PIPE_RESOURCE_FLAG_ENCRYPTED)) &&
+       /* We need buffer invalidation and buffer busyness tracking for the CPU
+        * storage, which aren't supported with pipe_vertex_state. */
+       !(res->bind & PIPE_BIND_VERTEX_STATE))
+      tres->cpu_storage = align_malloc(res->width0, map_buffer_alignment);
 }
 
 void
@@ -692,6 +712,7 @@ threaded_resource_deinit(struct pipe_resource *res)
            pipe_resource_reference(&tres->latest, NULL);
    util_range_destroy(&tres->valid_buffer_range);
    util_range_destroy(&tres->pending_staging_uploads_range);
+   align_free(tres->cpu_storage);
 }
 
 struct pipe_context *
@@ -907,10 +928,12 @@ tc_get_query_result_resource(struct pipe_context *_pipe,
                              struct pipe_resource *resource, unsigned offset)
 {
    struct threaded_context *tc = threaded_context(_pipe);
+
+   tc_buffer_disable_cpu_storage(resource);
+
    struct tc_query_result_resource *p =
       tc_add_call(tc, TC_CALL_get_query_result_resource,
                   tc_query_result_resource);
-
    p->query = query;
    p->wait = wait;
    p->result_type = result_type;
@@ -1499,6 +1522,7 @@ tc_set_shader_images(struct pipe_context *_pipe,
             if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
                struct threaded_resource *tres = threaded_resource(resource);
 
+               tc_buffer_disable_cpu_storage(resource);
                util_range_add(&tres->b, &tres->valid_buffer_range,
                               images[i].u.buf.offset,
                               images[i].u.buf.offset + images[i].u.buf.size);
@@ -1591,6 +1615,7 @@ tc_set_shader_buffers(struct pipe_context *_pipe,
             tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b);
 
             if (writable_bitmask & BITFIELD_BIT(i)) {
+               tc_buffer_disable_cpu_storage(src->buffer);
                util_range_add(&tres->b, &tres->valid_buffer_range,
                               src->buffer_offset,
                               src->buffer_offset + src->buffer_size);
@@ -1737,6 +1762,7 @@ tc_set_stream_output_targets(struct pipe_context *_pipe,
       p->targets[i] = NULL;
       pipe_so_target_reference(&p->targets[i], tgs[i]);
       if (tgs[i]) {
+         tc_buffer_disable_cpu_storage(tgs[i]->buffer);
          tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer);
       } else {
          tc_unbind_buffer(&tc->streamout_buffers[i]);
@@ -1906,6 +1932,9 @@ tc_create_image_handle(struct pipe_context *_pipe,
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
 
+   if (image->resource->target == PIPE_BUFFER)
+      tc_buffer_disable_cpu_storage(image->resource);
+
    tc_sync(tc);
    return pipe->create_image_handle(pipe, image);
 }
@@ -2142,8 +2171,35 @@ tc_buffer_map(struct pipe_context *_pipe,
    struct threaded_resource *tres = threaded_resource(resource);
    struct pipe_context *pipe = tc->pipe;
 
+   /* PIPE_MAP_THREAD_SAFE is for glthread, which shouldn't use the CPU storage and
+    * this shouldn't normally be necessary because glthread only uses large buffers.
+    */
+   if (usage & PIPE_MAP_THREAD_SAFE)
+      tc_buffer_disable_cpu_storage(resource);
+
    usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
 
+   /* If the CPU storage is enabled, return it directly. */
+   if (tres->cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
+      /* We can't let resource_copy_region disable the CPU storage. */
+      assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY));
+
+      struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers);
+      ttrans->b.resource = resource;
+      ttrans->b.level = 0;
+      ttrans->b.usage = usage;
+      ttrans->b.box = *box;
+      ttrans->b.stride = 0;
+      ttrans->b.layer_stride = 0;
+      ttrans->b.offset = 0;
+      ttrans->staging = NULL;
+      ttrans->valid_buffer_range = &tres->valid_buffer_range;
+      ttrans->cpu_storage_mapped = true;
+      *transfer = &ttrans->b;
+
+      return (uint8_t*)tres->cpu_storage + box->x;
+   }
+
    /* Do a staging transfer within the threaded context. The driver should
     * only get resource_copy_region.
     */
@@ -2169,6 +2225,7 @@ tc_buffer_map(struct pipe_context *_pipe,
       ttrans->b.stride = 0;
       ttrans->b.layer_stride = 0;
       ttrans->valid_buffer_range = &tres->valid_buffer_range;
+      ttrans->cpu_storage_mapped = false;
       *transfer = &ttrans->b;
 
       p_atomic_inc(&tres->pending_staging_uploads);
@@ -2203,6 +2260,7 @@ tc_buffer_map(struct pipe_context *_pipe,
    void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
                                 level, usage, box, transfer);
    threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range;
+   threaded_transfer(*transfer)->cpu_storage_mapped = false;
 
    if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
       tc_clear_driver_thread(tc);
@@ -2285,8 +2343,13 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
                               ttrans->staging, 0, &src_box);
    }
 
-   util_range_add(&tres->b, ttrans->valid_buffer_range,
-                  box->x, box->x + box->width);
+   /* Don't update the valid range when we're uploading the CPU storage
+    * because it includes the uninitialized range too.
+    */
+   if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
+      util_range_add(&tres->b, ttrans->valid_buffer_range,
+                     box->x, box->x + box->width);
+   }
 }
 
 static void
@@ -2373,12 +2436,37 @@ tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
       return;
    }
 
-   bool was_staging_transfer = false;
-
    if (transfer->usage & PIPE_MAP_WRITE &&
        !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT))
       tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
 
+   if (ttrans->cpu_storage_mapped) {
+      /* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't
+       * touch the mapped range. That's a problem because GPU stores free the CPU storage.
+       * If that happens, we just ignore the unmap call and don't upload anything to prevent
+       * a crash.
+       *
+       * Disallow the CPU storage in the driver to work around this.
+       */
+      assert(tres->cpu_storage);
+
+      if (tres->cpu_storage) {
+         tc_invalidate_buffer(tc, tres);
+         tc_buffer_subdata(&tc->base, &tres->b,
+                           PIPE_MAP_UNSYNCHRONIZED |
+                           TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE,
+                           0, tres->b.width0, tres->cpu_storage);
+         /* This shouldn't have been freed by buffer_subdata. */
+         assert(tres->cpu_storage);
+      }
+
+      tc_drop_resource_reference(ttrans->staging);
+      slab_free(&tc->pool_transfers, ttrans);
+      return;
+   }
+
+   bool was_staging_transfer = false;
+
    if (ttrans->staging) {
       was_staging_transfer = true;
 
@@ -2483,7 +2571,8 @@ tc_buffer_subdata(struct pipe_context *_pipe,
     */
    if (usage & (PIPE_MAP_UNSYNCHRONIZED |
                 PIPE_MAP_DISCARD_WHOLE_RESOURCE) ||
-       size > TC_MAX_SUBDATA_BYTES) {
+       size > TC_MAX_SUBDATA_BYTES ||
+       tres->cpu_storage) {
       struct pipe_transfer *transfer;
       struct pipe_box box;
       uint8_t *map = NULL;
@@ -3560,6 +3649,9 @@ tc_resource_copy_region(struct pipe_context *_pipe,
       tc_add_call(tc, TC_CALL_resource_copy_region,
                   tc_resource_copy_region);
 
+   if (dst->target == PIPE_BUFFER)
+      tc_buffer_disable_cpu_storage(dst);
+
    tc_set_resource_reference(&p->dst, dst);
    p->dst_level = dst_level;
    p->dstx = dstx;
@@ -3874,6 +3966,8 @@ tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
    struct tc_clear_buffer *p =
       tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
 
+   tc_buffer_disable_cpu_storage(res);
+
    tc_set_resource_reference(&p->res, res);
    tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res);
    p->offset = offset;
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index a961a11dbe5..ec7438c8e2f 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -208,6 +208,8 @@ struct tc_unflushed_batch_token;
 /* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
 #define TC_DEBUG 0
 
+/* This is an internal flag not sent to the driver. */
+#define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE   (1u << 28)
 /* These are map flags sent to drivers. */
 /* Never infer whether it's safe to use unsychronized mappings: */
 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
@@ -313,6 +315,13 @@ struct threaded_resource {
     */
    struct pipe_resource *latest;
 
+   /* Optional CPU storage of the buffer. When we get partial glBufferSubData(implemented by
+    * copy_buffer) + glDrawElements, we don't want to drain the gfx pipeline before executing
+    * the copy. For ideal pipelining, we upload to this CPU storage and then reallocate
+    * the GPU storage completely and reupload everything without copy_buffer.
+    */
+   void *cpu_storage;
+
    /* The buffer range which is initialized (with a write transfer, streamout,
     * or writable shader resources). The remainder of the buffer is considered
     * invalid and can be mapped unsynchronized.
@@ -360,6 +369,8 @@ struct threaded_transfer {
     * the base instance. Initially it's set to &b.resource->valid_buffer_range.
     */
    struct util_range *valid_buffer_range;
+
+   bool cpu_storage_mapped;
 };
 
 struct threaded_query {
@@ -503,7 +514,8 @@ struct threaded_context {
    struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
 };
 
-void threaded_resource_init(struct pipe_resource *res);
+void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage,
+                            unsigned map_buffer_alignment);
 void threaded_resource_deinit(struct pipe_resource *res);
 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
 void tc_driver_internal_flush_notify(struct threaded_context *tc);
@@ -579,4 +591,21 @@ tc_assert_driver_thread(struct threaded_context *tc)
 #endif
 }
 
+/**
+ * This is called before GPU stores to disable the CPU storage because
+ * the CPU storage doesn't mirror the GPU storage.
+ *
+ * Drivers should also call it before exporting a DMABUF of a buffer.
+ */
+static inline void
+tc_buffer_disable_cpu_storage(struct pipe_resource *buf)
+{
+   struct threaded_resource *tres = threaded_resource(buf);
+
+   if (tres->cpu_storage) {
+      /* cpu_storage is allocated with align_malloc() in
+       * threaded_resource_init(), so it must be released with align_free();
+       * plain free() is undefined behavior for aligned allocations on
+       * platforms where align_malloc() isn't implemented via malloc()
+       * (e.g. Windows _aligned_malloc).
+       */
+      align_free(tres->cpu_storage);
+      tres->cpu_storage = NULL;
+   }
+}
+
 #endif
diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c
index 51dcc921cfd..11309b8bef7 100644
--- a/src/gallium/drivers/crocus/crocus_resource.c
+++ b/src/gallium/drivers/crocus/crocus_resource.c
@@ -356,7 +356,7 @@ crocus_alloc_resource(struct pipe_screen *pscreen,
    res->base.b.screen = pscreen;
    res->orig_screen = crocus_pscreen_ref(pscreen);
    pipe_reference_init(&res->base.b.reference, 1);
-   threaded_resource_init(&res->base.b);
+   threaded_resource_init(&res->base.b, false, 0);
 
    if (templ->target == PIPE_BUFFER)
       util_range_init(&res->valid_buffer_range);
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index add830d9c12..f8c8eb9d0c9 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -1109,7 +1109,7 @@ alloc_resource_struct(struct pipe_screen *pscreen,
 
    pipe_reference_init(&rsc->track->reference, 1);
 
-   threaded_resource_init(prsc);
+   threaded_resource_init(prsc, false, 0);
 
    if (tmpl->target == PIPE_BUFFER)
       rsc->b.buffer_id_unique = util_idalloc_mt_alloc(&screen->buffer_ids);
diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
index 8f3662f6e6f..12857416ef8 100644
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -479,7 +479,7 @@ iris_alloc_resource(struct pipe_screen *pscreen,
    res->base.b.screen = pscreen;
    res->orig_screen = iris_pscreen_ref(pscreen);
    pipe_reference_init(&res->base.b.reference, 1);
-   threaded_resource_init(&res->base.b);
+   threaded_resource_init(&res->base.b, false, 0);
 
    res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE;
    res->aux.sampler_usages = 1 << ISL_AUX_USAGE_NONE;
diff --git a/src/gallium/drivers/r600/r600_buffer_common.c b/src/gallium/drivers/r600/r600_buffer_common.c
index ba645298f81..2035456e4f6 100644
--- a/src/gallium/drivers/r600/r600_buffer_common.c
+++ b/src/gallium/drivers/r600/r600_buffer_common.c
@@ -584,7 +584,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen,
 	pipe_reference_init(&rbuffer->b.b.reference, 1);
 	rbuffer->b.b.screen = screen;
 
-	threaded_resource_init(&rbuffer->b.b);
+	threaded_resource_init(&rbuffer->b.b, false, 0);
 
 	rbuffer->buf = NULL;
 	rbuffer->bind_history = 0;
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 1c45c534488..a16c50a94f6 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -567,7 +567,7 @@ static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen,
    pipe_reference_init(&buf->b.b.reference, 1);
    buf->b.b.screen = screen;
 
-   threaded_resource_init(&buf->b.b);
+   threaded_resource_init(&buf->b.b, false, 0);
 
    buf->buf = NULL;
    buf->bind_history = 0;
diff --git a/src/gallium/drivers/zink/zink_resource.c b/src/gallium/drivers/zink/zink_resource.c
index b403bb88864..64516a2900d 100644
--- a/src/gallium/drivers/zink/zink_resource.c
+++ b/src/gallium/drivers/zink/zink_resource.c
@@ -760,7 +760,7 @@ resource_create(struct pipe_screen *pscreen,
 
    res->base.b = *templ;
 
-   threaded_resource_init(&res->base.b);
+   threaded_resource_init(&res->base.b, false, 0);
    pipe_reference_init(&res->base.b.reference, 1);
    res->base.b.screen = pscreen;
 



More information about the mesa-commit mailing list