Mesa (master): radeonsi: prefetch VB descriptors right after uploading

Sat Jan 30 20:59:51 UTC 2021

Module: Mesa
Branch: master
Commit: 47587758f21019229be5faf9417e7dc5b9ea732d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=47587758f21019229be5faf9417e7dc5b9ea732d

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Mon Jan 11 15:14:40 2021 -0500

radeonsi: prefetch VB descriptors right after uploading

This skips the logic that sets and checks prefetch_L2_mask.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8794>

---

 src/gallium/drivers/radeonsi/si_gfx_cs.c       |  2 --
 src/gallium/drivers/radeonsi/si_pipe.h         |  1 -
 src/gallium/drivers/radeonsi/si_state_draw.cpp | 49 +++++++++-----------------
 3 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 01bce3ffe28..a63e7db6deb 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -445,8 +445,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
       ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
    if (ctx->queued.named.ps)
       ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-   if (ctx->vb_descriptors_buffer)
-      ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
 
    /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
    bool has_clear_state = ctx->screen->info.has_clear_state;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6c9d8130d73..fd161034cb1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -99,7 +99,6 @@ extern "C" {
 #define SI_CONTEXT_VGT_FLUSH          (1 << 15)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
 
-#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)
 #define SI_PREFETCH_LS              (1 << 1)
 #define SI_PREFETCH_HS              (1 << 2)
 #define SI_PREFETCH_ES              (1 << 3)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index fb4f721dbc4..2bb7035ffc7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -66,15 +66,6 @@ static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_stat
    si_cp_dma_prefetch(sctx, bo, 0, bo->width0);
 }
 
-static void si_prefetch_VBO_descriptors(struct si_context *sctx)
-{
-   if (!sctx->vertex_elements->vb_desc_list_alloc_size)
-      return;
-
-   si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
-                            sctx->vertex_elements->vb_desc_list_alloc_size);
-}
-
 enum si_L2_prefetch_mode {
    PREFETCH_BEFORE_DRAW = 1,
    PREFETCH_AFTER_DRAW,
@@ -82,11 +73,11 @@ enum si_L2_prefetch_mode {
 };
 
 /**
- * Prefetch shaders and VBO descriptors.
+ * Prefetch shaders.
  */
 template<chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
          si_L2_prefetch_mode mode>
-static void si_emit_prefetch_L2(struct si_context *sctx)
+static void si_prefetch_shaders(struct si_context *sctx)
 {
    unsigned mask = sctx->prefetch_L2_mask;
 
@@ -101,8 +92,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_HS)
                si_prefetch_shader_async(sctx, sctx->queued.named.hs);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -116,8 +105,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_GS)
                si_prefetch_shader_async(sctx, sctx->queued.named.gs);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -129,8 +116,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_VS)
                si_prefetch_shader_async(sctx, sctx->queued.named.vs);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -143,8 +128,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_LS)
                si_prefetch_shader_async(sctx, sctx->queued.named.ls);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -162,8 +145,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_ES)
                si_prefetch_shader_async(sctx, sctx->queued.named.es);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -177,8 +158,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
          if (mode != PREFETCH_AFTER_DRAW) {
             if (mask & SI_PREFETCH_VS)
                si_prefetch_shader_async(sctx, sctx->queued.named.vs);
-            if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-               si_prefetch_VBO_descriptors(sctx);
 
             if (mode == PREFETCH_BEFORE_DRAW)
                return;
@@ -1331,7 +1310,7 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 }
 
 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
-static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
+static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)
 {
    unsigned count = sctx->num_vertex_elements;
    bool pointer_dirty, user_sgprs_dirty;
@@ -1362,10 +1341,10 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
          sctx->vb_descriptors_gpu_list = ptr;
          radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
                                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
-         sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+         si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+                            alloc_size);
       } else {
          si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
       }
 
       unsigned first_vb_use_mask = velems->first_vb_use_mask;
@@ -2142,8 +2121,10 @@ static void si_draw_vbo(struct pipe_context *ctx,
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
       /* <-- CUs are idle here. */
 
-      /* This uploads VBO descriptors and sets user SGPRs. */
-      if (unlikely((!si_upload_vertex_buffer_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+       * It should be done after cache flushing.
+       */
+      if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
          DRAW_CLEANUP;
          return;
       }
@@ -2170,7 +2151,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
       /* Start prefetches after the draw has been started. Both will run
        * in parallel, but starting the draw first is more important.
        */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);
    } else {
       /* If we don't wait for idle, start prefetches first, then set
        * states, and draw at the end.
@@ -2179,10 +2160,12 @@ static void si_draw_vbo(struct pipe_context *ctx,
          sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
 
       /* Only prefetch the API VS and VBO descriptors. */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);
 
-      /* This uploads VBO descriptors and sets user SGPRs. */
-      if (unlikely((!si_upload_vertex_buffer_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+       * It should be done after cache flushing and after the VS prefetch.
+       */
+      if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
          DRAW_CLEANUP;
          return;
       }
@@ -2206,7 +2189,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
 
       /* Prefetch the remaining shaders after the draw has been
        * started. */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
    }
 
    /* Clear the context roll flag after the draw call.