[Mesa-dev] [PATCH 4/4] radeonsi: emulate REWIND using INDIRECT_BUFFER for primitive culling on CI

Marek Olšák maraeo at gmail.com
Thu Feb 14 06:31:05 UTC 2019


From: Marek Olšák <marek.olsak at amd.com>

CI doesn't support the REWIND packet, so the rewind was previously emulated
by padding the IB with thousands of NOPs (4096 on Hawaii, a guessed minimum)
to defeat the CP prefetch cache. Chaining to a new IB with INDIRECT_BUFFER
discards the prefetch cache as well and is much cheaper.

This increases "Paraview - Many Spheres" performance from 37.91 to 43 fps
on Hawaii.
---
 src/gallium/drivers/r300/r300_blit.c          |  2 +-
 src/gallium/drivers/r300/r300_render.c        |  2 +-
 src/gallium/drivers/r600/r600_hw_context.c    |  2 +-
 src/gallium/drivers/r600/r600_pipe_common.c   |  2 +-
 src/gallium/drivers/radeon/radeon_winsys.h    |  6 +++++-
 .../radeonsi/si_compute_prim_discard.c        | 18 ++++++++---------
 src/gallium/drivers/radeonsi/si_dma_cs.c      |  2 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c      |  2 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 20 ++++++++++++-------
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  3 ++-
 10 files changed, 35 insertions(+), 24 deletions(-)
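
Reviewer note (below the cut, not part of the commit message): a minimal
sketch of the two ways the new cs_check_space parameter is meant to be
used. The names follow the radeon_winsys.h and si_compute_prim_discard.c
hunks in this patch; this is an illustration, not compilable on its own.

    /* Normal use: ask for space; flush if the winsys can neither satisfy
     * the request nor chain a new IB chunk. */
    if (!sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false))
        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

    /* REWIND emulation on CI: request zero additional dwords, but force an
     * INDIRECT_BUFFER chain right now. Jumping to a new IB discards the CP
     * prefetch cache, which is the effect the old 4096-NOP padding
     * approximated. */
    sctx->ws->cs_check_space(gfx_cs, 0, true);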

diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 01fccfbe7ed..4ac2589a9a7 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -375,21 +375,21 @@ static void r300_clear(struct pipe_context* pipe,
          * procedure. */
         /* Calculate zmask_clear and hiz_clear atom sizes. */
         unsigned dwords =
             r300->gpu_flush.size +
             (r300->zmask_clear.dirty ? r300->zmask_clear.size : 0) +
             (r300->hiz_clear.dirty ? r300->hiz_clear.size : 0) +
             (r300->cmask_clear.dirty ? r300->cmask_clear.size : 0) +
             r300_get_num_cs_end_dwords(r300);
 
         /* Reserve CS space. */
-        if (!r300->rws->cs_check_space(r300->cs, dwords)) {
+        if (!r300->rws->cs_check_space(r300->cs, dwords, false)) {
             r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL);
         }
 
         /* Emit clear packets. */
         r300_emit_gpu_flush(r300, r300->gpu_flush.size, r300->gpu_flush.state);
         r300->gpu_flush.dirty = FALSE;
 
         if (r300->zmask_clear.dirty) {
             r300_emit_zmask_clear(r300, r300->zmask_clear.size,
                                   r300->zmask_clear.state);
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 211d35d0607..ed129e1a306 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -208,21 +208,21 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
 
     if (emit_vertex_arrays)
         cs_dwords += 55; /* emit_vertex_arrays */
 
     if (emit_vertex_arrays_swtcl)
         cs_dwords += 7; /* emit_vertex_arrays_swtcl */
 
     cs_dwords += r300_get_num_cs_end_dwords(r300);
 
     /* Reserve requested CS space. */
-    if (!r300->rws->cs_check_space(r300->cs, cs_dwords)) {
+    if (!r300->rws->cs_check_space(r300->cs, cs_dwords, false)) {
         r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL);
         flushed = TRUE;
     }
 
     return flushed;
 }
 
 /**
  * Validate buffers and emit dirty state.
  * \param r300          The context.
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index a2f5f637b20..abf5d03e4f9 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -77,21 +77,21 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		num_dw += 3;
 	}
 
 	/* Count in framebuffer cache flushes at the end of CS. */
 	num_dw += R600_MAX_FLUSH_CS_DWORDS;
 
 	/* The fence at the end of CS. */
 	num_dw += 10;
 
 	/* Flush if there's not enough space. */
-	if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw)) {
+	if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw, false)) {
 		ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 	}
 }
 
 void r600_flush_emit(struct r600_context *rctx)
 {
 	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned cp_coher_cntl = 0;
 	unsigned wait_until = 0;
 
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c
index 3c00ad691ac..d4d4511df3e 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -283,21 +283,21 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 	 * IBs using too little memory are limited by the IB submission overhead.
 	 * IBs using too much memory are limited by the kernel/TTM overhead.
 	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 	 *
 	 * This heuristic makes sure that DMA requests are executed
 	 * very soon after the call is made and lowers memory usage.
 	 * It improves texture upload performance by keeping the DMA
 	 * engine busy while uploads are being submitted.
 	 */
 	num_dw++; /* for emit_wait_idle below */
-	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
+	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw, false) ||
 	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 	}
 
 	/* Wait for idle if either buffer has been used in the IB before to
 	 * prevent read-after-write hazards.
 	 */
 	if ((dst &&
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index c04c014bd2f..814fb9068dc 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -563,22 +563,26 @@ struct radeon_winsys {
      * \param cs        A command stream to validate.
      */
     bool (*cs_validate)(struct radeon_cmdbuf *cs);
 
     /**
      * Check whether the given number of dwords is available in the IB.
      * Optionally chain a new chunk of the IB if necessary and supported.
      *
      * \param cs        A command stream.
      * \param dw        Number of CS dwords requested by the caller.
+     * \param force_chaining  Chain the IB into a new buffer now to discard
+     *                        the CP prefetch cache.
+     * \return true if there is enough space
      */
-    bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw);
+    bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw,
+                           bool force_chaining);
 
     /**
      * Return the buffer list.
      *
      * This is the buffer list as passed to the kernel, i.e. it only contains
      * the parent buffers of sub-allocated buffers.
      *
      * \param cs    Command stream
      * \param list  Returned buffer list. Set to NULL to query the count only.
      * \return      The buffer count.
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index afb7424aa41..34282391bf1 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -178,21 +178,20 @@
 #define SPLIT_PRIMS_DRAW_LEVEL		PRIMS_PER_BATCH
 
 /* Derived values. */
 #define WAVES_PER_TG			DIV_ROUND_UP(THREADGROUP_SIZE, 64)
 #define SPLIT_PRIMS_PACKET_LEVEL	(VERTEX_COUNTER_GDS_MODE == 2 ? \
 					 SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
 					 UINT_MAX & ~(THREADGROUP_SIZE - 1))
 
 #define REWIND_SIGNAL_BIT		0x80000000
 /* For emulating the rewind packet on CI. */
-#define REWIND_EMULATION_NOPS		4096 /* minimum for Hawaii, guessed */
 #define FORCE_REWIND_EMULATION		0
 
 void si_initialize_prim_discard_tunables(struct si_context *sctx)
 {
 	sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
 
 	if (sctx->chip_class == SI || /* SI support is not implemented */
 	    !sctx->screen->info.has_gds_ordered_append ||
 	    sctx->screen->debug_flags & DBG(NO_PD) ||
 	    /* If aux_context == NULL, we are initializing aux_context right now. */
@@ -1045,46 +1044,45 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
 	if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
 		if (SI_PRIM_DISCARD_DEBUG)
 			puts("PD failed: draw call too big, can't be split");
 		return SI_PRIM_DISCARD_DISABLED;
 	}
 
 	unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
 	unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
 				   24 * (num_subdraws - 1) + /* subdraws */
 				   20; /* leave some space at the end */
-	unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx) +
-			       num_subdraws * 6; /* DRAW(6) */
+	unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
 
 	if (sctx->chip_class <= CIK || FORCE_REWIND_EMULATION)
-		need_gfx_dw += num_subdraws * (9 + REWIND_EMULATION_NOPS);
+		need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
 	else
-		need_gfx_dw += num_subdraws * 2; /* use REWIND */
+		need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
 
 	if (ring_full ||
 	    (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
-	    !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw)) {
+	    !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
 		/* If the current IB is empty but the size is too small, add a NOP
 		 * packet to force a flush and get a bigger IB.
 		 */
 		if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
 		    gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
 			radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
 			radeon_emit(gfx_cs, 0);
 		}
 
 		si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 	}
 
 	/* The compute IB is always chained, but we need to call cs_check_space to add more space. */
 	struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
-	bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw);
+	bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
 	assert(compute_has_space);
 	assert(si_check_ring_space(sctx, out_indexbuf_size));
 	return SI_PRIM_DISCARD_ENABLED;
 }
 
 void si_compute_signal_gfx(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
 	unsigned writeback_L2_flags = 0;
 
@@ -1415,22 +1413,24 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
 			if (sctx->chip_class <= CIK || FORCE_REWIND_EMULATION) {
 				radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
 				radeon_emit(gfx_cs, 0);
 
 				si_cp_wait_mem(sctx, gfx_cs,
 					       sctx->compute_rewind_va |
 					       (uint64_t)sctx->screen->info.address32_hi << 32,
 					       REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
 					       WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
 
-				radeon_emit(gfx_cs, PKT3(PKT3_NOP, REWIND_EMULATION_NOPS - 2, 0));
-				gfx_cs->current.cdw += REWIND_EMULATION_NOPS - 1;
+				/* Use INDIRECT_BUFFER to chain to a different buffer
+				 * to discard the CP prefetch cache.
+				 */
+				sctx->ws->cs_check_space(gfx_cs, 0, true);
 			} else {
 				radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
 				radeon_emit(gfx_cs, 0);
 			}
 		}
 
 		sctx->compute_num_prims_in_batch += num_subdraw_prims;
 
 		uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
 		uint64_t index_va = out_indexbuf_va + start_prim * 12;
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 2aafc1f09a0..f04bc2e28da 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -155,21 +155,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 	 * IBs using too little memory are limited by the IB submission overhead.
 	 * IBs using too much memory are limited by the kernel/TTM overhead.
 	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 	 *
 	 * This heuristic makes sure that DMA requests are executed
 	 * very soon after the call is made and lowers memory usage.
 	 * It improves texture upload performance by keeping the DMA
 	 * engine busy while uploads are being submitted.
 	 */
 	num_dw++; /* for emit_wait_idle below */
-	if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+	if (!ws->cs_check_space(ctx->dma_cs, num_dw, false) ||
 	    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
 	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
 		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 		assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
 	}
 
 	/* Wait for idle if either buffer has been used in the IB before to
 	 * prevent read-after-write hazards.
 	 */
 	if ((dst &&
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 9dbe932c66f..779d8106839 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -49,21 +49,21 @@ void si_need_gfx_cs_space(struct si_context *ctx)
 						   ctx->vram, ctx->gtt))) {
 		ctx->gtt = 0;
 		ctx->vram = 0;
 		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 		return;
 	}
 	ctx->gtt = 0;
 	ctx->vram = 0;
 
 	unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
-	if (!ctx->ws->cs_check_space(cs, need_dwords))
+	if (!ctx->ws->cs_check_space(cs, need_dwords, false))
 		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct radeon_winsys *ws = ctx->ws;
 	unsigned wait_flags = 0;
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 392f69e5fef..5424398c350 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1067,47 +1067,53 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib,
       }
    }
    return &cs->compute_ib.base;
 }
 
 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
 {
    return true;
 }
 
-static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
+static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
+                                  bool force_chaining)
 {
    struct amdgpu_ib *ib = amdgpu_ib(rcs);
    struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
    unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
    unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
    unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
    uint64_t va;
    uint32_t *new_ptr_ib_size;
 
    assert(rcs->current.cdw <= rcs->current.max_dw);
 
    /* 125% of the size for IB epilog. */
    unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
    ib->max_check_space_size = MAX2(ib->max_check_space_size,
                                    safe_byte_size);
 
-   if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
-      return false;
+   /* If force_chaining is true, we can't return. We have to chain. */
+   if (!force_chaining) {
+      if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
+         return false;
 
-   ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
+      ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
 
-   if (rcs->current.max_dw - rcs->current.cdw >= dw)
-      return true;
+      if (rcs->current.max_dw - rcs->current.cdw >= dw)
+         return true;
+   }
 
-   if (!amdgpu_cs_has_chaining(cs))
+   if (!amdgpu_cs_has_chaining(cs)) {
+      assert(!force_chaining);
       return false;
+   }
 
    /* Allocate a new chunk */
    if (rcs->num_prev >= rcs->max_prev) {
       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
       struct radeon_cmdbuf_chunk *new_prev;
 
       new_prev = REALLOC(rcs->prev,
                          sizeof(*new_prev) * rcs->max_prev,
                          sizeof(*new_prev) * new_max_prev);
       if (!new_prev)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 2288c320975..3da4d3ce238 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -417,21 +417,22 @@ static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
 
             assert(cs->base.current.cdw == 0);
             if (cs->base.current.cdw != 0) {
                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
             }
         }
     }
     return status;
 }
 
-static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
+static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
+                                      bool force_chaining)
 {
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
 }
 
 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                               struct radeon_bo_list_item *list)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     int i;
-- 
2.17.1
