[Mesa-dev] [PATCH 17/17] i965: Disentangle batch and state buffer flushing.

Kenneth Graunke kenneth at whitecape.org
Wed Sep 6 00:09:50 UTC 2017


We now flush the batch when either the batchbuffer or statebuffer
reaches the original intended batch size, instead of when the sum of
the two reaches a certain size (which makes no sense now that they're
separate buffers).

With this change, we also need to update our "are we near the end?"
estimate to require separate batch and state buffer space.  I obtained
these estimates by looking at the size of draw calls in the Unreal 4
Elemental Demo (using INTEL_DEBUG=flush and always_flush_batch=true).

This effectively increases the amount of work per batch by perhaps
2-4x (the buffer sizes are unchanged, but we flush much later), which
will almost certainly have a performance impact, and may impact
overall system responsiveness.

XXX: benchmark, may need a lot of tuning.
---
 src/mesa/drivers/dri/i965/brw_compute.c       | 18 ++++--------------
 src/mesa/drivers/dri/i965/brw_draw.c          | 18 ++++--------------
 src/mesa/drivers/dri/i965/brw_state.h         |  1 +
 src/mesa/drivers/dri/i965/genX_blorp_exec.c   |  4 ++--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 25 +++++++++++++++++--------
 5 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index 1bad7ac7a0c..7f0278ac92b 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -167,7 +167,6 @@ static void
 brw_dispatch_compute_common(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
-   int estimated_buffer_space_needed;
    bool fail_next = false;
 
    if (!_mesa_check_conditional_render(ctx))
@@ -180,20 +179,11 @@ brw_dispatch_compute_common(struct gl_context *ctx)
 
    brw_predraw_resolve_inputs(brw);
 
-   const int sampler_state_size = 16; /* 16 bytes */
-   estimated_buffer_space_needed = 512; /* batchbuffer commands */
-   estimated_buffer_space_needed += (BRW_MAX_TEX_UNIT *
-                                     (sampler_state_size +
-                                      sizeof(struct gen5_sampler_default_color)));
-   estimated_buffer_space_needed += 1024; /* push constants */
-   estimated_buffer_space_needed += 512; /* misc. pad */
-
-   /* Flush the batch if it's approaching full, so that we don't wrap while
-    * we've got validated state that needs to be in the same batch as the
-    * primitives.
+   /* Flush the batch if the batch/state buffers are nearly full.  We can
+    * grow them if needed, but this is not free, so we'd like to avoid it.
     */
-   intel_batchbuffer_require_space(brw, estimated_buffer_space_needed,
-                                   RENDER_RING);
+   intel_batchbuffer_require_space(brw, 600, RENDER_RING);
+   brw_require_statebuffer_space(brw, 2500);
    intel_batchbuffer_save_state(brw);
 
  retry:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index d1ec2e3f09d..06c6ed72c98 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -669,26 +669,16 @@ brw_try_draw_prims(struct gl_context *ctx,
    brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
 
    for (i = 0; i < nr_prims; i++) {
-      int estimated_max_prim_size;
-      const int sampler_state_size = 16;
-
-      estimated_max_prim_size = 512; /* batchbuffer commands */
-      estimated_max_prim_size += BRW_MAX_TEX_UNIT *
-         (sampler_state_size + sizeof(struct gen5_sampler_default_color));
-      estimated_max_prim_size += 1024; /* gen6 VS push constants */
-      estimated_max_prim_size += 1024; /* gen6 WM push constants */
-      estimated_max_prim_size += 512; /* misc. pad */
-
       /* Flag BRW_NEW_DRAW_CALL on every draw.  This allows us to have
        * atoms that happen on every draw call.
        */
       brw->ctx.NewDriverState |= BRW_NEW_DRAW_CALL;
 
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.
+      /* Flush the batch if the batch/state buffers are nearly full.  We can
+       * grow them if needed, but this is not free, so we'd like to avoid it.
        */
-      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
+      intel_batchbuffer_require_space(brw, 1500, RENDER_RING);
+      brw_require_statebuffer_space(brw, 2400);
       intel_batchbuffer_save_state(brw);
 
       if (brw->num_instances != prims[i].num_instances ||
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index c8b71e72de5..9718739dea9 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -185,6 +185,7 @@ void brw_destroy_caches( struct brw_context *brw );
 void brw_print_program_cache(struct brw_context *brw);
 
 /* intel_batchbuffer.c */
+void brw_require_statebuffer_space(struct brw_context *brw, int size);
 void *brw_state_batch(struct brw_context *brw,
                       int size, int alignment, uint32_t *out_offset);
 uint32_t brw_state_batch_size(struct brw_context *brw, uint32_t offset);
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 5bff7eaff59..3fe81c7c6a1 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -205,7 +205,6 @@ genX(blorp_exec)(struct blorp_batch *batch,
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
    struct gl_context *ctx = &brw->ctx;
-   const uint32_t estimated_max_batch_usage = GEN_GEN >= 8 ? 1920 : 1700;
    bool check_aperture_failed_once = false;
 
    /* Flush the sampler and render caches.  We definitely need to flush the
@@ -222,7 +221,8 @@ genX(blorp_exec)(struct blorp_batch *batch,
    brw_select_pipeline(brw, BRW_RENDER_PIPELINE);
 
 retry:
-   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
+   intel_batchbuffer_require_space(brw, 1400, RENDER_RING);
+   brw_require_statebuffer_space(brw, 600);
    intel_batchbuffer_save_state(brw);
    brw->no_batch_wrap = true;
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 0af9101e5f4..fba24198e05 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -315,9 +315,8 @@ intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
       intel_batchbuffer_flush(brw);
    }
 
-   /* For now, flush as if the batch and state buffers still shared a BO */
    const unsigned batch_used = USED_BATCH(*batch) * 4;
-   if (batch_used + sz >= BATCH_SZ - batch->state_used) {
+   if (batch_used + sz >= BATCH_SZ) {
       if (!brw->no_batch_wrap) {
          intel_batchbuffer_flush(brw);
       } else {
@@ -325,7 +324,7 @@ intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
             MIN2(batch->bo->size + batch->bo->size / 2, MAX_BATCH_SIZE);
          grow_buffer(brw, &batch->bo, &batch->map, batch_used, new_size);
          batch->map_next = (void *) batch->map + batch_used;
-         assert(batch_used + sz < batch->bo->size - batch->state_used);
+         assert(batch_used + sz < batch->bo->size);
       }
    }
 
@@ -946,6 +945,19 @@ brw_state_batch_size(struct brw_context *brw, uint32_t offset)
    return entry ? (uintptr_t) entry->data : 0;
 }
 
+/**
+ * Reserve some space in the statebuffer, or flush.
+ *
+ * This is used to estimate when we're near the end of the batch,
+ * so we can flush early.
+ */
+void
+brw_require_statebuffer_space(struct brw_context *brw, int size)
+{
+   if (brw->batch.state_used + size >= STATE_SZ)
+      intel_batchbuffer_flush(brw);
+}
+
 /**
  * Allocates a block of space in the batchbuffer for indirect state.
  */
@@ -961,10 +973,7 @@ brw_state_batch(struct brw_context *brw,
 
    uint32_t offset = ALIGN(batch->state_used, alignment);
 
-   /* For now, follow the old flushing behavior. */
-   int batch_space = USED_BATCH(*batch) * 4;
-
-   if (offset + size >= STATE_SZ - batch_space) {
+   if (offset + size >= STATE_SZ) {
       if (!brw->no_batch_wrap) {
          intel_batchbuffer_flush(brw);
          offset = ALIGN(batch->state_used, alignment);
@@ -973,7 +982,7 @@ brw_state_batch(struct brw_context *brw,
             batch->state_bo->size + batch->state_bo->size / 2;
          grow_buffer(brw, &batch->state_bo, &batch->state_map,
                      batch->state_used, new_size);
-         assert(offset + size < batch->state_bo->size - batch_space);
+         assert(offset + size < batch->state_bo->size);
       }
    }
 
-- 
2.14.1



More information about the mesa-dev mailing list