Mesa (main): radeonsi: change how the prim discard CS is enabled and splitting limits

Mon Jun 28 13:39:00 UTC 2021

Module: Mesa
Branch: main
Commit: 9fa0d2cf35017b0cece7d333e9bd8fd01f4b3f61
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9fa0d2cf35017b0cece7d333e9bd8fd01f4b3f61

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Mon May 31 21:48:28 2021 -0400

radeonsi: change how the prim discard CS is enabled and splitting limits

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11510>

---

 .../drivers/radeonsi/si_compute_prim_discard.c     | 56 +++++++---------------
 src/gallium/drivers/radeonsi/si_pipe.c             |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h             |  1 +
 3 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index ff875c1b88f..d2040b24f6a 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -129,20 +129,14 @@
 
 /* Grouping compute dispatches for small draw calls: How many primitives from multiple
  * draw calls to process by compute before signaling the gfx IB. This reduces the number
- * of EOP events + REWIND packets, because they decrease performance. */
-#define PRIMS_PER_BATCH (512 * 1024)
-/* Draw call splitting at the packet level. This allows signaling the gfx IB
- * for big draw calls sooner, but doesn't allow context flushes between packets. */
-#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
-/* If there is not enough ring buffer space for the current IB, split draw calls into
- * this number of primitives, so that we can flush the context and get free ring space. */
-#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
+ * of EOP events + REWIND packets, because they decrease performance.
+ * This also determines the granularity of draw-level and packet-level splitting.
+ */
+#define PRIMS_PER_IB  (1024 * 1024)  /* size per gfx IB */
+#define PRIMS_PER_BATCH (128 * 1024) /* size between REWIND packets */
 
 /* Derived values. */
 #define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
-#define SPLIT_PRIMS_PACKET_LEVEL                                                                   \
-   (false /* TODO */ ? SPLIT_PRIMS_PACKET_LEVEL_VALUE                                  \
-                                 : UINT_MAX & ~(THREADGROUP_SIZE - 1))
 
 #define REWIND_SIGNAL_BIT 0x80000000
 
@@ -159,31 +153,18 @@ void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_
       return;
 
    /* TODO: enable this */
-   bool enable_on_pro_graphics_by_default = false;
+   bool enable_by_default = false;
 
    if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
-       (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
-        (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
-         sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
-         sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
-         sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
+       (enable_by_default && sscreen->allow_draw_out_of_order &&
+        sscreen->info.num_se >= 2)) {
       *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
 
       if (sscreen->debug_flags & DBG(ALWAYS_PD))
          *prim_discard_vertex_count_threshold = 0; /* always enable */
 
-      const uint32_t MB = 1024 * 1024;
-      const uint64_t GB = 1024 * 1024 * 1024;
-
-      /* The total size is double this per context.
-       * Greater numbers allow bigger gfx IBs.
-       */
-      if (sscreen->info.vram_size <= 2 * GB)
-         *index_ring_size_per_ib = 64 * MB;
-      else if (sscreen->info.vram_size <= 4 * GB)
-         *index_ring_size_per_ib = 128 * MB;
-      else
-         *index_ring_size_per_ib = 256 * MB;
+      /* The total size is double this per context. Greater numbers allow bigger gfx IBs. */
+      *index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. */
    }
 }
 
@@ -602,7 +583,6 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
    unsigned num_prims = num_prims_per_instance * instance_count;
    unsigned out_indexbuf_size = num_prims * 12;
    bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
-   const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
 
    /* Split draws at the draw call level if the ring is full. This makes
     * better use of the ring space.
@@ -614,9 +594,9 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
       unsigned vert_count_per_subdraw = 0;
 
       if (prim == PIPE_PRIM_TRIANGLES)
-         vert_count_per_subdraw = split_prims_draw_level * 3;
+         vert_count_per_subdraw = PRIMS_PER_BATCH * 3;
       else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
-         vert_count_per_subdraw = split_prims_draw_level;
+         vert_count_per_subdraw = PRIMS_PER_BATCH;
       else
          unreachable("shouldn't get here");
 
@@ -668,7 +648,7 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
       } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
          /* No primitive pair can be split, because strips reverse orientation
           * for odd primitives. */
-         STATIC_ASSERT(split_prims_draw_level % 2 == 0);
+         STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0);
 
          for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
             split_draw_range.start = base_start + start;
@@ -688,7 +668,7 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
       return SI_PRIM_DISCARD_DISABLED;
    }
 
-   unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL) * num_draws;
+   unsigned num_subdraws = DIV_ROUND_UP(num_prims, PRIMS_PER_BATCH) * num_draws;
    unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
                               24 * (num_subdraws - 1) + /* subdraws */
                               30;                       /* leave some space at the end */
@@ -975,14 +955,14 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
       sctx->compute_ib_last_shader = shader;
    }
 
-   STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
+   STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);
 
    /* Big draw calls are split into smaller dispatches and draw packets. */
-   for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
+   for (unsigned start_prim = 0; start_prim < num_prims; start_prim = num_prims /* implement splitting */) {
       unsigned num_subdraw_prims;
 
-      if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
-         num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
+      if (start_prim + PRIMS_PER_BATCH < num_prims)
+         num_subdraw_prims = PRIMS_PER_BATCH;
       else
          num_subdraw_prims = num_prims - start_prim;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e34abd6fd74..95b4a71873e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1235,6 +1235,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
    sscreen->commutative_blend_add =
       driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
       driQueryOptionb(config->options, "allow_draw_out_of_order");
+   sscreen->allow_draw_out_of_order = driQueryOptionb(config->options, "allow_draw_out_of_order");
 
    sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) &&
                       sscreen->info.chip_class >= GFX10 &&
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b34427cf5ba..a58f26bb018 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -545,6 +545,7 @@ struct si_screen {
    bool has_out_of_order_rast;
    bool assume_no_z_fights;
    bool commutative_blend_add;
+   bool allow_draw_out_of_order;
    bool dpbb_allowed;
    bool use_ngg;
    bool use_ngg_culling;