Mesa (master): i965/gs: Allocate URB space for use by GS.

Paul Berry stereotype441 at kemper.freedesktop.org
Sun Sep 1 00:27:04 UTC 2013


Module: Mesa
Branch: master
Commit: fffba41c6828b8f46a162185147d3e9b9cc479e4
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fffba41c6828b8f46a162185147d3e9b9cc479e4

Author: Paul Berry <stereotype441 at gmail.com>
Date:   Wed Mar 27 09:49:17 2013 -0700

i965/gs: Allocate URB space for use by GS.

Previously, we gave all of the URB space (other than the small amount
that is used for push constants) to the vertex shader.  However, when
a geometry shader is active, we need to divide it up between the
vertex and geometry shaders.

The size of the URB entries for the vertex and geometry shaders can
vary dramatically from one shader to the next.  So it doesn't make
sense to simply split the available space in two.  In particular:

- On Ivy Bridge GT1, this would not leave enough space for the worst
  case geometry shader, which requires 64kB of URB space (see the
  arithmetic after this list).

- Due to hardware-imposed limits on the maximum number of URB entries,
  sometimes a given shader stage will only be capable of using a small
  amount of URB space.  When this happens, it may make sense to
  allocate substantially less than half of the available space to that
  stage.
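
For concreteness on the first point: Ivy Bridge GT1 has a 128kB URB
(per the hardware documentation), and this patch reserves 16kB of it
for push constants, leaving 112kB.  An even split would give each
stage only 56kB, short of the 64kB the worst case geometry shader
needs.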

Our algorithm for dividing space between the two stages is to first
compute (a) the minimum amount of URB space that each stage needs in
order to function properly, and (b) the amount of additional URB space
that each stage "wants" (i.e. that it would be capable of making use
of).  If the total amount of space available is not enough to satisfy
needs + wants, then each stage's "wants" amount is scaled back by the
same factor in order to fit.
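
As a standalone illustration, here is a minimal sketch of that
scale-back step (divide_urb_space and its parameters are hypothetical
names; the real implementation in gen7_upload_urb() below also
handles chunk alignment and the hardware's entry-count granularity
rules):

    #include <assert.h>
    #include <math.h>

    /* Sketch of the needs-and-wants division described above.  All
     * quantities are in units of 8kB URB chunks. */
    static void
    divide_urb_space(unsigned available,
                     unsigned vs_needs, unsigned vs_wants,
                     unsigned gs_needs, unsigned gs_wants,
                     unsigned *vs_alloc, unsigned *gs_alloc)
    {
       /* The minimum requirements must always fit. */
       assert(vs_needs + gs_needs <= available);

       /* Space left over after meeting each stage's minimum, capped
        * at the total the two stages could actually make use of. */
       unsigned remaining = available - vs_needs - gs_needs;
       unsigned total_wants = vs_wants + gs_wants;
       if (remaining > total_wants)
          remaining = total_wants;

       /* Scale each stage's "wants" back by the same factor so that
        * the total fits in the remaining space. */
       unsigned vs_extra = remaining == 0 ? 0 :
          (unsigned) round(vs_wants * ((double) remaining / total_wants));
       *vs_alloc = vs_needs + vs_extra;
       *gs_alloc = gs_needs + (remaining - vs_extra);
    }

For example, with 30 chunks remaining, vs_wants = 40 and gs_wants =
20, the factor is 30/60: the VS receives round(40 * 30/60) = 20 extra
chunks and the GS receives the remaining 10.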

When only a vertex shader is active, this algorithm produces
equivalent results to the old algorithm (if the vertex shader stage
can make use of all the available URB space, we assign all the space
to it; if it can't, we let it use as much as it can).
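
(In the new code below this falls out directly: with no geometry
shader present, gs_chunks and gs_wants are both zero, so total_wants
equals vs_wants and the scale-back step hands all of the remaining
space, capped at vs_wants, to the vertex shader.)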

In the future, when we need to support tessellation control and
tessellation evaluation pipeline stages, it should be straightforward
to expand this algorithm to cover them.

v2: Use "unsigned" rather than "GLuint".

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

---

 src/mesa/drivers/dri/i965/brw_context.h  |    6 +-
 src/mesa/drivers/dri/i965/gen7_blorp.cpp |   16 ++--
 src/mesa/drivers/dri/i965/gen7_urb.c     |  156 +++++++++++++++++++++++++-----
 3 files changed, 143 insertions(+), 35 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 3f17f1d..0bfe606 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1511,8 +1511,10 @@ void
 gen7_allocate_push_constants(struct brw_context *brw);
 
 void
-gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
-                    GLuint vs_size, GLuint vs_start);
+gen7_emit_urb_state(struct brw_context *brw,
+                    unsigned nr_vs_entries, unsigned vs_size,
+                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned gs_size, unsigned gs_start);
 
 
 
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index a387836..6c798b1 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -51,14 +51,16 @@ static void
 gen7_blorp_emit_urb_config(struct brw_context *brw,
                            const brw_blorp_params *params)
 {
-   /* The minimum valid value is 32. See 3DSTATE_URB_VS,
-    * Dword 1.15:0 "VS Number of URB Entries".
+   /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
+    * 1.15:0 "VS Number of URB Entries".
     */
-   int num_vs_entries = 32;
-   int vs_size = 2;
-   int vs_start = 2; /* skip over push constants */
-
-   gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start);
+   gen7_emit_urb_state(brw,
+                       32 /* num_vs_entries */,
+                       2 /* vs_size */,
+                       2 /* vs_start */,
+                       0 /* num_gs_entries */,
+                       1 /* gs_size */,
+                       2 /* gs_start */);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 927af37..ed5cda8 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -74,34 +74,137 @@ gen7_upload_urb(struct brw_context *brw)
 {
    const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16;
 
-   /* Total space for entries is URB size - 16kB for push constants */
-   int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */
-
    /* CACHE_NEW_VS_PROG */
    unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
-
-   int nr_vs_entries = handle_region_size / (vs_size * 64);
-   if (nr_vs_entries > brw->urb.max_vs_entries)
-      nr_vs_entries = brw->urb.max_vs_entries;
-
-   /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
-   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);
-
-   /* URB Starting Addresses are specified in multiples of 8kB. */
-   brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */
-
-   assert(brw->urb.nr_vs_entries % 8 == 0);
-   assert(brw->urb.nr_gs_entries % 8 == 0);
-   /* GS requirement */
-   assert(!brw->ff_gs.prog_active);
+   unsigned vs_entry_size_bytes = vs_size * 64;
+   /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */
+   bool gs_present = brw->geometry_program;
+   unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
+   unsigned gs_entry_size_bytes = gs_size * 64;
+
+   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
+    *
+    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
+    *     Allocation Size is less than 9 512-bit URB entries.
+    *
+    * Similar text exists for GS.
+    */
+   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
+
+   /* URB allocations must be done in 8k chunks. */
+   unsigned chunk_size_bytes = 8192;
+
+   /* Determine the size of the URB in chunks.
+    */
+   unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
+
+   /* Reserve space for push constants */
+   unsigned push_constant_bytes = 1024 * push_size_kB;
+   unsigned push_constant_chunks =
+      push_constant_bytes / chunk_size_bytes;
+
+   /* Initially, assign each stage the minimum amount of URB space it needs,
+    * and make a note of how much additional space it "wants" (the amount of
+    * additional space it could actually make use of).
+    */
+
+   /* VS always requires at least 32 URB entries */
+   unsigned vs_chunks =
+      ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / chunk_size_bytes;
+   unsigned vs_wants =
+      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
+            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+
+   unsigned gs_chunks = 0;
+   unsigned gs_wants = 0;
+   if (gs_present) {
+      /* There are two constraints on the minimum amount of URB space we can
+       * allocate:
+       *
+       * (1) We need room for at least 2 URB entries, since we always operate
+       * the GS in DUAL_OBJECT mode.
+       *
+       * (2) We can't allocate fewer entries than gs_granularity.
+       */
+      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
+                        chunk_size_bytes) / chunk_size_bytes;
+      gs_wants =
+         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
+               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+   }
+
+   /* There should always be enough URB space to satisfy the minimum
+    * requirements of each stage.
+    */
+   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   assert(total_needs <= urb_chunks);
+
+   /* Mete out remaining space (if any) in proportion to "wants". */
+   unsigned total_wants = vs_wants + gs_wants;
+   unsigned remaining_space = urb_chunks - total_needs;
+   if (remaining_space > total_wants)
+      remaining_space = total_wants;
+   if (remaining_space > 0) {
+      unsigned vs_additional = (unsigned)
+         round(vs_wants * (((double) remaining_space) / total_wants));
+      vs_chunks += vs_additional;
+      remaining_space -= vs_additional;
+      gs_chunks += remaining_space;
+   }
+
+   /* Sanity check that we haven't over-allocated. */
+   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+
+   /* Next, compute the number of entries that can fit in the space
+    * allocated to each stage.
+    */
+   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
+
+   /* Since we rounded up when computing *_wants, this may be slightly more
+    * than the maximum allowed amount, so correct for that.
+    */
+   nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
+   nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
+
+   /* Ensure that we program a multiple of the granularity. */
+   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
+
+   /* Finally, sanity check to make sure we have at least the minimum number
+    * of entries needed for each stage.
+    */
+   assert(nr_vs_entries >= 32);
+   if (gs_present)
+      assert(nr_gs_entries >= 2);
+
+   /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
+    * better to put reasonable data in there rather than leave them
+    * uninitialized.
+    */
+   brw->urb.nr_vs_entries = nr_vs_entries;
+   brw->urb.nr_gs_entries = nr_gs_entries;
+
+   /* Lay out the URB in the following order:
+    * - push constants
+    * - VS
+    * - GS
+    */
+   brw->urb.vs_start = push_constant_chunks;
+   brw->urb.gs_start = push_constant_chunks + vs_chunks;
 
    gen7_emit_vs_workaround_flush(brw);
-   gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start);
+   gen7_emit_urb_state(brw,
+                       brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
+                       brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
 }
 
 void
-gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
-                    GLuint vs_size, GLuint vs_start)
+gen7_emit_urb_state(struct brw_context *brw,
+                    unsigned nr_vs_entries, unsigned vs_size,
+                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned gs_size, unsigned gs_start)
 {
    BEGIN_BATCH(8);
    OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
@@ -109,11 +212,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
              ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
-   /* Allocate the GS, HS, and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_gs_entries |
+             ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
+   /* Allocate the HS and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
    OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
@@ -127,8 +231,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
 const struct brw_tracked_state gen7_urb = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG),
+      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
+      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
    },
    .emit = gen7_upload_urb,
 };



