[Mesa-dev] [PATCH 26/30] i965/gs: Allocate URB space for use by geometry shader.

Tue Aug 20 11:30:41 PDT 2013

Previously, we gave all of the URB space (other than the small amount
that is used for push constants) to the vertex shader.  However, when
a geometry shader is active, we need to divide it up between the
vertex and geometry shaders.

The size of the URB entries for the vertex and geometry shaders can
vary dramatically from one shader to the next.  So it doesn't make
sense to simply split the available space in two.  In particular:

- On Ivy Bridge GT1, this would not leave enough space for the worst
  case geometry shader, which requires 64k of URB space.

- Due to hardware-imposed limits on the maximum number of URB entries,
  sometimes a given shader stage will only be capable of using a small
  amount of URB space.  When this happens, it may make sense to
  allocate substantially less than half of the available space to that
  stage.

Our algorithm for dividing space between the two stages is to first
compute (a) the minimum amount of URB space that each stage needs in
order to function properly, and (b) the amount of additional URB space
that each stage "wants" (i.e. that it would be capable of making use
of).  If the total amount of space available is not enough to satisfy
needs + wants, then each stage's "wants" amount is scaled back by the
same factor in order to fit.

When only a vertex shader is active, this algorithm produces
equivalent results to the old algorithm (if the vertex shader stage
can make use of all the available URB space, we assign all the space
to it; if it can't, we let it use as much as it can).

In the future, when we need to support tessellation control and
tessellation evaluation pipeline stages, it should be straightforward
to expand this algorithm to cover them.
---
 src/mesa/drivers/dri/i965/brw_context.h  |   5 +-
 src/mesa/drivers/dri/i965/gen7_blorp.cpp |  16 ++--
 src/mesa/drivers/dri/i965/gen7_urb.c     | 155 +++++++++++++++++++++++++------
 3 files changed, 141 insertions(+), 35 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2addf99..0b5c465 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1517,8 +1517,9 @@ void
 gen7_allocate_push_constants(struct brw_context *brw);
 
 void
-gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
-                    GLuint vs_size, GLuint vs_start);
+gen7_emit_urb_state(struct brw_context *brw,
+                    GLuint nr_vs_entries, GLuint vs_size, GLuint vs_start,
+                    GLuint nr_gs_entries, GLuint gs_size, GLuint gs_start);
 
 
 
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 44e7578..6e7a315 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -51,14 +51,16 @@ static void
 gen7_blorp_emit_urb_config(struct brw_context *brw,
                            const brw_blorp_params *params)
 {
-   /* The minimum valid value is 32. See 3DSTATE_URB_VS,
-    * Dword 1.15:0 "VS Number of URB Entries".
+   /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
+    * 1.15:0 "VS Number of URB Entries".
     */
-   int num_vs_entries = 32;
-   int vs_size = 2;
-   int vs_start = 2; /* skip over push constants */
-
-   gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start);
+   gen7_emit_urb_state(brw,
+                       32 /* num_vs_entries */,
+                       2 /* vs_size */,
+                       2 /* vs_start */,
+                       0 /* num_gs_entries */,
+                       1 /* gs_size */,
+                       2 /* gs_start */);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 350f644..f45ac1c 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -74,34 +74,136 @@ gen7_upload_urb(struct brw_context *brw)
 {
    const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16;
 
-   /* Total space for entries is URB size - 16kB for push constants */
-   int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */
-
    /* CACHE_NEW_VS_PROG */
    unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
-
-   int nr_vs_entries = handle_region_size / (vs_size * 64);
-   if (nr_vs_entries > brw->urb.max_vs_entries)
-      nr_vs_entries = brw->urb.max_vs_entries;
-
-   /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
-   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);
-
-   /* URB Starting Addresses are specified in multiples of 8kB. */
-   brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */
-
-   assert(brw->urb.nr_vs_entries % 8 == 0);
-   assert(brw->urb.nr_gs_entries % 8 == 0);
-   /* GS requirement */
-   assert(!brw->gs.prog_active);
+   unsigned vs_entry_size_bytes = vs_size * 64;
+   /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_VEC4_GS_PROG */
+   bool gs_present = brw->geometry_program;
+   unsigned gs_size = gs_present ?
+      brw->vec4_gs.prog_data->base.urb_entry_size : 1;
+   unsigned gs_entry_size_bytes = gs_size * 64;
+
+   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
+    *
+    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
+    *     Allocation Size is less than 9 512-bit URB entries.
+    *
+    * Similar text exists for GS.
+    */
+   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
+
+   /* URB allocations must be done in 8k chunks. */
+   unsigned chunk_size_bytes = 8192;
+
+   /* Determine the size of the URB in chunks.
+    */
+   unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
+
+   /* Reserve space for push constants */
+   unsigned push_constant_bytes = 1024 * push_size_kB;
+   unsigned push_constant_chunks =
+      push_constant_bytes / chunk_size_bytes;
+
+   /* Initially, assign each stage the minimum amount of URB space it needs,
+    * and make a note of how much additional space it "wants" (the amount of
+    * additional space it could actually make use of).
+    */
+
+   /* VS always requires at least 32 URB entries */
+   unsigned vs_chunks =
+      ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / chunk_size_bytes;
+   unsigned vs_wants =
+      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
+            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+
+   unsigned gs_chunks = 0;
+   unsigned gs_wants = 0;
+   if (gs_present) {
+      /* There are two constraints on the minimum amount of URB space we can
+       * allocate:
+       *
+       * (1) We need room for at least 2 URB entries, since we always operate
+       * the GS in DUAL_OBJECT mode.
+       *
+       * (2) We can't allocate less than nr_gs_entries_granularity.
+       */
+      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
+                        chunk_size_bytes) / chunk_size_bytes;
+      gs_wants =
+         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
+               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+   }
+
+   /* There should always be enough URB space to satisfy the minimum
+    * requirements of each stage.
+    */
+   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   assert(total_needs <= urb_chunks);
+
+   /* Mete out remaining space (if any) in proportion to "wants". */
+   unsigned total_wants = vs_wants + gs_wants;
+   unsigned remaining_space = urb_chunks - total_needs;
+   if (remaining_space > total_wants)
+      remaining_space = total_wants;
+   if (remaining_space > 0) {
+      unsigned vs_additional = (unsigned)
+         round(vs_wants * (((double) remaining_space) / total_wants));
+      vs_chunks += vs_additional;
+      remaining_space -= vs_additional;
+      gs_chunks += remaining_space;
+   }
+
+   /* Sanity check that we haven't over-allocated. */
+   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+
+   /* Finally, compute the number of entries that can fit in the space
+    * allocated to each stage.
+    */
+   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
+
+   /* Since we rounded up when computing *_wants, this may be slightly more
+    * than the maximum allowed amount, so correct for that.
+    */
+   nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
+   nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
+
+   /* Ensure that we program a multiple of the granularity. */
+   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
+
+   /* Finally, sanity check to make sure we have at least the minimum number
+    * of entries needed for each stage.
+    */
+   assert(nr_vs_entries >= 32);
+   if (gs_present)
+      assert(nr_gs_entries >= 2);
+
+   /* And store the values we computed in brw so that they can be used by
+    * other state atoms.
+    */
+   brw->urb.nr_vs_entries = nr_vs_entries;
+   brw->urb.nr_gs_entries = nr_gs_entries;
+
+   /* Lay out the URB in the following order:
+    * - push constants
+    * - VS
+    * - GS
+    */
+   brw->urb.vs_start = push_constant_chunks;
+   brw->urb.gs_start = push_constant_chunks + vs_chunks;
 
    gen7_emit_vs_workaround_flush(brw);
-   gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start);
+   gen7_emit_urb_state(brw,
+                       brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
+                       brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
 }
 
 void
-gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
-                    GLuint vs_size, GLuint vs_start)
+gen7_emit_urb_state(struct brw_context *brw,
+                    GLuint nr_vs_entries, GLuint vs_size, GLuint vs_start,
+                    GLuint nr_gs_entries, GLuint gs_size, GLuint gs_start)
 {
    BEGIN_BATCH(8);
    OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
@@ -109,11 +211,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
              ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
-   /* Allocate the GS, HS, and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_gs_entries |
+             ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
+   /* Allocate the HS and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
    OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
@@ -127,8 +230,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
 const struct brw_tracked_state gen7_urb = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
+      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
+      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_VEC4_GS_PROG),
    },
    .emit = gen7_upload_urb,
 };
-- 
1.8.3.4