Mesa (main): gallium: simplify VRAM uploads by adding PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY

Mon Aug 9 12:35:33 UTC 2021

Module: Mesa
Branch: main
Commit: 59fe704c45df49a6c1937e1b25c921b9342e624a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=59fe704c45df49a6c1937e1b25c921b9342e624a

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Fri Aug  6 01:02:50 2021 -0400

gallium: simplify VRAM uploads by adding PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY

When this flag is set on a resource, u_threaded_context will try not to map
the resource directly, for better buffer placement. Drivers set it when
visible VRAM is too small.

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz at gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12257>

---

 src/gallium/auxiliary/util/u_threaded_context.c |  7 ++---
 src/gallium/auxiliary/util/u_threaded_context.h |  6 -----
 src/gallium/drivers/radeonsi/si_buffer.c        | 35 +++++++++----------------
 src/gallium/drivers/radeonsi/si_pipe.h          |  1 -
 src/gallium/include/pipe/p_defines.h            |  1 +
 5 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 8e9898310ab..5994a6a3097 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1996,11 +1996,8 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
    if (usage & (PIPE_MAP_DISCARD_RANGE |
                 PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
        !(usage & PIPE_MAP_PERSISTENT) &&
-       /* Try not to decrement the counter if it's not positive. Still racy,
-        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
-       tres->max_forced_staging_uploads > 0 &&
-       tc->use_forced_staging_uploads &&
-       p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
+       tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
+       tc->use_forced_staging_uploads) {
       usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
                  PIPE_MAP_UNSYNCHRONIZED);
 
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 04088166248..0e1189d7f70 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -333,12 +333,6 @@ struct threaded_resource {
     */
    uint32_t buffer_id_unique;
 
-   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
-    * method of CPU access when map flags allow it. Useful for buffers that
-    * are too large for the visible VRAM window.
-    */
-   int max_forced_staging_uploads;
-
    /* If positive, then a staging transfer is in progress.
     */
    int pending_staging_uploads;
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 8ade9694b49..c958ab8acd5 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -147,27 +147,21 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
    /* Set expected VRAM and GART usage for the buffer. */
    res->vram_usage_kb = 0;
    res->gart_usage_kb = 0;
-   res->max_forced_staging_uploads = 0;
-   res->b.max_forced_staging_uploads = 0;
 
    if (res->domains & RADEON_DOMAIN_VRAM) {
       res->vram_usage_kb = MAX2(1, size / 1024);
 
-      if (!sscreen->info.smart_access_memory) {
-         /* We don't want to evict buffers from VRAM by mapping them for CPU access,
-          * because they might never be moved back again. If a buffer is large enough,
-          * upload data by copying from a temporary GTT buffer. 8K might not seem much,
-          * but there can be 100000 buffers.
-          *
-          * This tweak improves performance for viewperf.
-          */
-         const unsigned min_size = 8196; /* tuned to minimize mapped VRAM */
-         /* Number of uploads before mapping directly. A very high number helps display lists (snx). */
-         const unsigned max_staging_uploads = 1000000;
-
-         res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
-            sscreen->info.has_dedicated_vram && size >= min_size ? max_staging_uploads : 0;
-      }
+      /* We don't want to evict buffers from VRAM by mapping them for CPU access,
+       * because they might never be moved back again. If a buffer is large enough,
+       * upload data by copying from a temporary GTT buffer. 8K might not seem much,
+       * but there can be 100000 buffers.
+       *
+       * This tweak improves performance for viewperf creo & snx.
+       */
+      if (!sscreen->info.smart_access_memory &&
+          sscreen->info.has_dedicated_vram &&
+          size >= 8196)
+         res->b.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY;
    } else if (res->domains & RADEON_DOMAIN_GTT) {
       res->gart_usage_kb = MAX2(1, size / 1024);
    }
@@ -296,8 +290,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d
    radeon_bo_reference(sctx->screen->ws, &sdst->buf, ssrc->buf);
    sdst->gpu_address = ssrc->gpu_address;
    sdst->b.b.bind = ssrc->b.b.bind;
-   sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
-   sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
    sdst->flags = ssrc->flags;
 
    assert(sdst->vram_usage_kb == ssrc->vram_usage_kb);
@@ -395,10 +387,7 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
    bool force_discard_range = false;
    if (usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE) &&
        !(usage & PIPE_MAP_PERSISTENT) &&
-       /* Try not to decrement the counter if it's not positive. Still racy,
-        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
-       buf->max_forced_staging_uploads > 0 &&
-       p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
+       buf->b.b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY) {
       usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_UNSYNCHRONIZED);
       usage |= PIPE_MAP_DISCARD_RANGE;
       force_discard_range = true;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e385c57b394..a8722b20fa3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -301,7 +301,6 @@ struct si_resource {
    enum radeon_bo_domain domains:8;
    enum radeon_bo_flag flags:16;
    unsigned bind_history;
-   int max_forced_staging_uploads;
 
    /* The buffer range which is initialized (with a write transfer,
     * streamout, DMA, or as a random access target). The rest of
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 09db0b8b334..488a3968082 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -526,6 +526,7 @@ enum pipe_flush_flags
 #define PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE     (1 << 4)
 #define PIPE_RESOURCE_FLAG_ENCRYPTED             (1 << 5)
 #define PIPE_RESOURCE_FLAG_DONT_OVER_ALLOCATE    (1 << 6)
+#define PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY     (1 << 7) /* for small visible VRAM */
 #define PIPE_RESOURCE_FLAG_DRV_PRIV    (1 << 8) /* driver/winsys private */
 #define PIPE_RESOURCE_FLAG_FRONTEND_PRIV         (1 << 24) /* gallium frontend private */