[Mesa-dev] [PATCH 08/22] i965/gs: Allocate URB space for use by GS.

Paul Berry <stereotype441 at gmail.com>
Thu Aug 29 08:21:19 PDT 2013


On 28 August 2013 21:00, Kenneth Graunke <kenneth at whitecape.org> wrote:

> On 08/26/2013 03:12 PM, Paul Berry wrote:
>
>> Previously, we gave all of the URB space (other than the small amount
>> that is used for push constants) to the vertex shader.  However, when
>> a geometry shader is active, we need to divide it up between the
>> vertex and geometry shaders.
>>
>> The size of the URB entries for the vertex and geometry shaders can
>> vary dramatically from one shader to the next.  So it doesn't make
>> sense to simply split the available space in two.  In particular:
>>
>> - On Ivy Bridge GT1, this would not leave enough space for the worst
>>    case geometry shader, which requires 64k of URB space.
>>
>> - Due to hardware-imposed limits on the maximum number of URB entries,
>>    sometimes a given shader stage will only be capable of using a small
>>    amount of URB space.  When this happens, it may make sense to
>>    allocate substantially less than half of the available space to that
>>    stage.
>>
>> Our algorithm for dividing space between the two stages is to first
>> compute (a) the minimum amount of URB space that each stage needs in
>> order to function properly, and (b) the amount of additional URB space
>> that each stage "wants" (i.e. that it would be capable of making use
>> of).  If the total amount of space available is not enough to satisfy
>> needs + wants, then each stage's "wants" amount is scaled back by the
>> same factor in order to fit.
>>
>> When only a vertex shader is active, this algorithm produces
>> equivalent results to the old algorithm (if the vertex shader stage
>> can make use of all the available URB space, we assign all the space
>> to it; if it can't, we let it use as much as it can).
>>
>> In the future, when we need to support tessellation control and
>> tessellation evaluation pipeline stages, it should be straightforward
>> to expand this algorithm to cover them.
>>
>> v2: Use "unsigned" rather than "GLuint".
>> ---
>>   src/mesa/drivers/dri/i965/brw_context.h  |   6 +-
>>   src/mesa/drivers/dri/i965/gen7_blorp.cpp |  16 ++--
>>   src/mesa/drivers/dri/i965/gen7_urb.c     | 155 +++++++++++++++++++++++++------
>>   3 files changed, 142 insertions(+), 35 deletions(-)
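
For anyone skimming the algorithm description above, here is a small
stand-alone sketch of the needs/wants split before the diff itself. It is
illustrative only (the divide_urb_space helper, its parameters, and the
numbers in main() are all made up); the real logic is in gen7_upload_urb()
below.

   /* Minimal sketch of the proportional needs/wants division, measured
    * in 8 kB chunks. Hypothetical helper, not driver code.
    */
   #include <math.h>
   #include <stdio.h>

   static void
   divide_urb_space(unsigned avail_chunks,
                    unsigned vs_needs, unsigned vs_wants,
                    unsigned gs_needs, unsigned gs_wants)
   {
      /* The minimum requirements ("needs") must always fit outright. */
      unsigned total_wants = vs_wants + gs_wants;
      unsigned remaining = avail_chunks - (vs_needs + gs_needs);

      /* Mete out the leftover space in proportion to each stage's
       * "wants", scaling both back by the same factor when they don't
       * all fit.
       */
      if (remaining > total_wants)
         remaining = total_wants;
      if (remaining > 0) {
         unsigned vs_extra = (unsigned)
            round(vs_wants * ((double) remaining / total_wants));
         vs_needs += vs_extra;
         gs_needs += remaining - vs_extra; /* GS takes what is left */
      }
      printf("VS: %u chunks, GS: %u chunks\n", vs_needs, gs_needs);
   }

   int main(void)
   {
      /* Made-up figures: 30 chunks free after push constants; VS needs
       * 1 chunk and could use 19 more; GS needs 2 and could use 18 more.
       * Only 27 spare chunks exist for 37 chunks of "wants", so each
       * stage's extra is scaled back: VS gets 14, GS the other 13.
       */
      divide_urb_space(30, 1, 19, 2, 18);
      return 0;
   }

Letting the GS take whatever remains after rounding the VS share is what
keeps the sum from ever exceeding the available chunks, which is exactly
what the over-allocation assert in the patch checks.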
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>> index be5175f..77f2a6b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -1511,8 +1511,10 @@ void
>>   gen7_allocate_push_constants(struct brw_context *brw);
>>
>>   void
>> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>> -                    GLuint vs_size, GLuint vs_start);
>> +gen7_emit_urb_state(struct brw_context *brw,
>> +                    unsigned nr_vs_entries, unsigned vs_size,
>> +                    unsigned vs_start, unsigned nr_gs_entries,
>> +                    unsigned gs_size, unsigned gs_start);
>>
>>
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> index a387836..6c798b1 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> @@ -51,14 +51,16 @@ static void
>>   gen7_blorp_emit_urb_config(struct brw_context *brw,
>>                              const brw_blorp_params *params)
>>   {
>> -   /* The minimum valid value is 32. See 3DSTATE_URB_VS,
>> -    * Dword 1.15:0 "VS Number of URB Entries".
>> +   /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
>> +    * 1.15:0 "VS Number of URB Entries".
>>       */
>> -   int num_vs_entries = 32;
>> -   int vs_size = 2;
>> -   int vs_start = 2; /* skip over push constants */
>> -
>> -   gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start);
>> +   gen7_emit_urb_state(brw,
>> +                       32 /* num_vs_entries */,
>> +                       2 /* vs_size */,
>> +                       2 /* vs_start */,
>> +                       0 /* num_gs_entries */,
>> +                       1 /* gs_size */,
>> +                       2 /* gs_start */);
>>   }
>>
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
>> index 927af37..2d10cc12 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
>> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
>> @@ -74,34 +74,136 @@ gen7_upload_urb(struct brw_context *brw)
>>   {
>>      const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16;
>>
>> -   /* Total space for entries is URB size - 16kB for push constants */
>> -   int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */
>> -
>>      /* CACHE_NEW_VS_PROG */
>>      unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
>> -
>> -   int nr_vs_entries = handle_region_size / (vs_size * 64);
>> -   if (nr_vs_entries > brw->urb.max_vs_entries)
>> -      nr_vs_entries = brw->urb.max_vs_entries;
>> -
>> -   /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
>> -   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);
>> -
>> -   /* URB Starting Addresses are specified in multiples of 8kB. */
>> -   brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */
>> -
>> -   assert(brw->urb.nr_vs_entries % 8 == 0);
>> -   assert(brw->urb.nr_gs_entries % 8 == 0);
>> -   /* GS requirement */
>> -   assert(!brw->ff_gs.prog_active);
>> +   unsigned vs_entry_size_bytes = vs_size * 64;
>> +   /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */
>> +   bool gs_present = brw->geometry_program;
>> +   unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
>> +   unsigned gs_entry_size_bytes = gs_size * 64;
>> +
>> +   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
>> +    *
>> +    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
>> +    *     Allocation Size is less than 9 512-bit URB entries.
>> +    *
>> +    * Similar text exists for GS.
>> +    */
>> +   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
>> +   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
>> +
>> +   /* URB allocations must be done in 8k chunks. */
>> +   unsigned chunk_size_bytes = 8192;
>> +
>> +   /* Determine the size of the URB in chunks. */
>> +   unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
>> +
>> +   /* Reserve space for push constants */
>> +   unsigned push_constant_bytes = 1024 * push_size_kB;
>> +   unsigned push_constant_chunks =
>> +      push_constant_bytes / chunk_size_bytes;
>> +
>> +   /* Initially, assign each stage the minimum amount of URB space it needs,
>> +    * and make a note of how much additional space it "wants" (the amount of
>> +    * additional space it could actually make use of).
>> +    */
>> +
>> +   /* VS always requires at least 32 URB entries */
>> +   unsigned vs_chunks =
>> +      ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / chunk_size_bytes;
>> +   unsigned vs_wants =
>> +      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
>> +            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
>> +
>> +   unsigned gs_chunks = 0;
>> +   unsigned gs_wants = 0;
>> +   if (gs_present) {
>> +      /* There are two constraints on the minimum amount of URB space we can
>> +       * allocate:
>> +       *
>> +       * (1) We need room for at least 2 URB entries, since we always operate
>> +       * the GS in DUAL_OBJECT mode.
>> +       *
>> +       * (2) We can't allocate fewer than gs_granularity entries.
>> +       */
>> +      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
>> +                        chunk_size_bytes) / chunk_size_bytes;
>> +      gs_wants =
>> +         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
>> +               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
>> +   }
>> +
>> +   /* There should always be enough URB space to satisfy the minimum
>> +    * requirements of each stage.
>> +    */
>> +   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
>> +   assert(total_needs <= urb_chunks);
>> +
>> +   /* Mete out remaining space (if any) in proportion to "wants". */
>> +   unsigned total_wants = vs_wants + gs_wants;
>> +   unsigned remaining_space = urb_chunks - total_needs;
>> +   if (remaining_space > total_wants)
>> +      remaining_space = total_wants;
>> +   if (remaining_space > 0) {
>> +      unsigned vs_additional = (unsigned)
>> +         round(vs_wants * (((double) remaining_space) / total_wants));
>> +      vs_chunks += vs_additional;
>> +      remaining_space -= vs_additional;
>> +      gs_chunks += remaining_space;
>> +   }
>> +
>> +   /* Sanity check that we haven't over-allocated. */
>> +   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
>> +
>> +   /* Finally, compute the number of entries that can fit in the space
>> +    * allocated to each stage.
>> +    */
>> +   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
>> +   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
>> +
>> +   /* Since we rounded up when computing *_wants, this may be slightly more
>> +    * than the maximum allowed amount, so correct for that.
>> +    */
>> +   nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
>> +   nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
>> +
>> +   /* Ensure that we program a multiple of the granularity. */
>> +   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
>> +   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
>> +
>> +   /* Finally, sanity check to make sure we have at least the minimum number
>> +    * of entries needed for each stage.
>> +    */
>> +   assert(nr_vs_entries >= 32);
>> +   if (gs_present)
>> +      assert(nr_gs_entries >= 2);
>> +
>> +   /* And store the values we computed in brw so that they can be used by
>> +    * other state atoms.
>> +    */
>>
>
> Actually, no other atoms ever use these.  The only reason these fields
> exist is for Gen4, where the VS_STATE, CLIP_STATE, etc. packets actually
> specified the URB configuration for that stage.
>
> I just set them because they were there, and putting data in them seemed
> better than leaving them uninitialized.
>

Ok, I've changed the comment to say:

   /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
    * better to put reasonable data in there rather than leave them
    * uninitialized.
    */



>
> However, I can't see any reason any other atom would /ever/ care, so I'm
> somewhat tempted to keep these as locals.  Would be a tiny bit more
> efficient.
>

I could go either way on this.  We can always do it in a follow-up patch if
this state atom winds up looming large in profiling runs.


>
> This patch seems pretty reasonable.  It's rather complicated, but...I
> don't honestly know that I could come up with anything better.
>
>
>> +   brw->urb.nr_vs_entries = nr_vs_entries;
>> +   brw->urb.nr_gs_entries = nr_gs_entries;
>> +
>> +   /* Lay out the URB in the following order:
>> +    * - push constants
>> +    * - VS
>> +    * - GS
>> +    */
>> +   brw->urb.vs_start = push_constant_chunks;
>> +   brw->urb.gs_start = push_constant_chunks + vs_chunks;
>>
>>      gen7_emit_vs_workaround_flush(brw);
>> -   gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start);
>> +   gen7_emit_urb_state(brw,
>> +                       brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
>> +                       brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
>>   }
>>
>>   void
>> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>> -                    GLuint vs_size, GLuint vs_start)
>> +gen7_emit_urb_state(struct brw_context *brw,
>> +                    unsigned nr_vs_entries, unsigned vs_size,
>> +                    unsigned vs_start, unsigned nr_gs_entries,
>> +                    unsigned gs_size, unsigned gs_start)
>>   {
>>      BEGIN_BATCH(8);
>>      OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
>> @@ -109,11 +211,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>>                ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
>>               (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>>
>> -   /* Allocate the GS, HS, and DS zero space - we don't use them. */
>>      OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
>> -   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> -             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>> +   OUT_BATCH(nr_gs_entries |
>> +             ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> +             (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>>
>> +   /* Allocate the HS and DS zero space - we don't use them. */
>>      OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
>>      OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
>>                (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>> @@ -127,8 +230,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>>   const struct brw_tracked_state gen7_urb = {
>>      .dirty = {
>>         .mesa = 0,
>> -      .brw = BRW_NEW_CONTEXT,
>> -      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG),
>> +      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
>> +      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
>>      },
>>      .emit = gen7_upload_urb,
>>   };
>>
>>
>
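
And for anyone tracing the entry-count arithmetic near the end of
gen7_upload_urb(): turning a stage's chunk allocation back into a
programmable entry count is a division, a clamp, and a round-down to the
granularity. The helper below is a hypothetical restatement of those three
steps, not driver code.

   /* Sketch: chunks -> URB entry count, mirroring the MIN2() and
    * ROUND_DOWN_TO() steps in the patch. All names are illustrative;
    * 8192 is the chunk size in bytes.
    */
   static unsigned
   chunks_to_entries(unsigned chunks, unsigned entry_size_bytes,
                     unsigned max_entries, unsigned granularity)
   {
      unsigned n = chunks * 8192 / entry_size_bytes;
      if (n > max_entries)        /* MIN2(n, max_entries) */
         n = max_entries;
      return n - n % granularity; /* ROUND_DOWN_TO(n, granularity) */
   }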