Mesa (main): radeonsi: drop gfx7 support from the prim discard CS to simplify code

Mon Jun 21 19:17:04 UTC 2021

Module: Mesa
Branch: main
Commit: 4fa58c04e4f01d39c4a95658c031f6e4437cbf7b
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=4fa58c04e4f01d39c4a95658c031f6e4437cbf7b

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Mon May 31 18:32:52 2021 -0400

radeonsi: drop gfx7 support from the prim discard CS to simplify code

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11102>

---

 .../drivers/radeonsi/si_compute_prim_discard.c     | 44 +++++-----------------
 src/gallium/drivers/radeonsi/si_state_draw.cpp     |  2 +-
 2 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index 84ebad0661f..0d24fd2a6b4 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -180,8 +180,6 @@
                                  : UINT_MAX & ~(THREADGROUP_SIZE - 1))
 
 #define REWIND_SIGNAL_BIT 0x80000000
-/* For emulating the rewind packet on CI. */
-#define FORCE_REWIND_EMULATION 0
 
 void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
                                          unsigned *prim_discard_vertex_count_threshold,
@@ -189,7 +187,7 @@ void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_
 {
    *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
 
-   if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
+   if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */
        !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
       return;
 
@@ -1060,12 +1058,8 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
    unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
                               24 * (num_subdraws - 1) + /* subdraws */
                               30;                       /* leave some space at the end */
-   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
-
-   if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
-      need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
-   else
-      need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0) +
+                          num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
 
    if (ring_full ||
        (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
@@ -1097,11 +1091,8 @@ void si_compute_signal_gfx(struct si_context *sctx)
    struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
    unsigned writeback_L2_flags = 0;
 
-   /* The writeback L2 flags vary with each chip generation. */
-   /* CI needs to flush vertex indices to memory. */
-   if (sctx->chip_class <= GFX7)
-      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
-   else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
+   /* GFX8 needs to flush L2 for CP to see the updated vertex count. */
+   if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
       writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
 
    if (!sctx->compute_num_prims_in_batch)
@@ -1417,27 +1408,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
          assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
          sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
 
-         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
-            radeon_begin(gfx_cs);
-            radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
-            radeon_emit(gfx_cs, 0);
-            radeon_end();
-
-            si_cp_wait_mem(
-               sctx, gfx_cs,
-               sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
-               REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
-
-            /* Use INDIRECT_BUFFER to chain to a different buffer
-             * to discard the CP prefetch cache.
-             */
-            sctx->ws->cs_check_space(gfx_cs, 0, true);
-         } else {
-            radeon_begin(gfx_cs);
-            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
-            radeon_emit(gfx_cs, 0);
-            radeon_end();
-         }
+         radeon_begin(gfx_cs);
+         radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
+         radeon_emit(gfx_cs, 0);
+         radeon_end();
       }
 
       sctx->compute_num_prims_in_batch += num_subdraw_prims;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index f43fd5afb0a..3fcac6ad821 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2312,7 +2312,7 @@ template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
 static void si_init_draw_vbo(struct si_context *sctx)
 {
    /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
-   if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX7)
+   if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8)
       return;
 
    if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS))