[Mesa-dev] [PATCH 08/22] i965/gs: Allocate URB space for use by GS.
Paul Berry
stereotype441 at gmail.com
Thu Aug 29 08:21:19 PDT 2013
On 28 August 2013 21:00, Kenneth Graunke <kenneth at whitecape.org> wrote:
> On 08/26/2013 03:12 PM, Paul Berry wrote:
>
>> Previously, we gave all of the URB space (other than the small amount
>> that is used for push constants) to the vertex shader. However, when
>> a geometry shader is active, we need to divide it up between the
>> vertex and geometry shaders.
>>
>> The size of the URB entries for the vertex and geometry shaders can
>> vary dramatically from one shader to the next. So it doesn't make
>> sense to simply split the available space in two. In particular:
>>
>> - On Ivy Bridge GT1, this would not leave enough space for the worst
>> case geometry shader, which requires 64k of URB space.
>>
>> - Due to hardware-imposed limits on the maximum number of URB entries,
>> sometimes a given shader stage will only be capable of using a small
>> amount of URB space. When this happens, it may make sense to
>> allocate substantially less than half of the available space to that
>> stage.
>>
>> Our algorithm for dividing space between the two stages is to first
>> compute (a) the minimum amount of URB space that each stage needs in
>> order to function properly, and (b) the amount of additional URB space
>> that each stage "wants" (i.e. that it would be capable of making use
>> of). If the total amount of space available is not enough to satisfy
>> needs + wants, then each stage's "wants" amount is scaled back by the
>> same factor in order to fit.
>>
>> When only a vertex shader is active, this algorithm produces
>> equivalent results to the old algorithm (if the vertex shader stage
>> can make use of all the available URB space, we assign all the space
>> to it; if it can't, we let it use as much as it can).
>>
>> In the future, when we need to support tessellation control and
>> tessellation evaluation pipeline stages, it should be straightforward
>> to expand this algorithm to cover them.
>>
>> v2: Use "unsigned" rather than "GLuint".
>> ---
>> src/mesa/drivers/dri/i965/brw_context.h  |   6 +-
>> src/mesa/drivers/dri/i965/gen7_blorp.cpp |  16 ++--
>> src/mesa/drivers/dri/i965/gen7_urb.c     | 155 +++++++++++++++++++++++++------
>> 3 files changed, 142 insertions(+), 35 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h
>> b/src/mesa/drivers/dri/i965/brw_context.h
>> index be5175f..77f2a6b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -1511,8 +1511,10 @@ void
>> gen7_allocate_push_constants(struct brw_context *brw);
>>
>> void
>> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>> - GLuint vs_size, GLuint vs_start);
>> +gen7_emit_urb_state(struct brw_context *brw,
>> + unsigned nr_vs_entries, unsigned vs_size,
>> + unsigned vs_start, unsigned nr_gs_entries,
>> + unsigned gs_size, unsigned gs_start);
>>
>>
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> index a387836..6c798b1 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
>> @@ -51,14 +51,16 @@ static void
>> gen7_blorp_emit_urb_config(struct brw_context *brw,
>> const brw_blorp_params *params)
>> {
>> - /* The minimum valid value is 32. See 3DSTATE_URB_VS,
>> - * Dword 1.15:0 "VS Number of URB Entries".
>> + /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS,
>> Dword
>> + * 1.15:0 "VS Number of URB Entries".
>> */
>> - int num_vs_entries = 32;
>> - int vs_size = 2;
>> - int vs_start = 2; /* skip over push constants */
>> -
>> - gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start);
>> + gen7_emit_urb_state(brw,
>> + 32 /* num_vs_entries */,
>> + 2 /* vs_size */,
>> + 2 /* vs_start */,
>> + 0 /* num_gs_entries */,
>> + 1 /* gs_size */,
>> + 2 /* gs_start */);
>> }
>>
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c
>> b/src/mesa/drivers/dri/i965/gen7_urb.c
>> index 927af37..2d10cc12 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
>> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
>> @@ -74,34 +74,136 @@ gen7_upload_urb(struct brw_context *brw)
>> {
>> const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16;
>>
>> - /* Total space for entries is URB size - 16kB for push constants */
>> - int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /*
>> bytes */
>> -
>> /* CACHE_NEW_VS_PROG */
>> unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size,
>> 1);
>> -
>> - int nr_vs_entries = handle_region_size / (vs_size * 64);
>> - if (nr_vs_entries > brw->urb.max_vs_entries)
>> - nr_vs_entries = brw->urb.max_vs_entries;
>> -
>> - /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
>> - brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);
>> -
>> - /* URB Starting Addresses are specified in multiples of 8kB. */
>> - brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */
>> -
>> - assert(brw->urb.nr_vs_entries % 8 == 0);
>> - assert(brw->urb.nr_gs_entries % 8 == 0);
>> - /* GS requirement */
>> - assert(!brw->ff_gs.prog_active);
>> + unsigned vs_entry_size_bytes = vs_size * 64;
>> + /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */
>> + bool gs_present = brw->geometry_program;
>> + unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size
>> : 1;
>> + unsigned gs_entry_size_bytes = gs_size * 64;
>> +
>> + /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
>> + *
>> + * VS Number of URB Entries must be divisible by 8 if the VS URB
>> Entry
>> + * Allocation Size is less than 9 512-bit URB entries.
>> + *
>> + * Similar text exists for GS.
>> + */
>> + unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
>> + unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
>> +
>> + /* URB allocations must be done in 8k chunks. */
>> + unsigned chunk_size_bytes = 8192;
>> +
>> + /* Determine the size of the URB in chunks.
>> + */
>> + unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
>> +
>> + /* Reserve space for push constants */
>> + unsigned push_constant_bytes = 1024 * push_size_kB;
>> + unsigned push_constant_chunks =
>> + push_constant_bytes / chunk_size_bytes;
>> +
>> + /* Initially, assign each stage the minimum amount of URB space it
>> needs,
>> + * and make a note of how much additional space it "wants" (the
>> amount of
>> + * additional space it could actually make use of).
>> + */
>> +
>> + /* VS always requires at least 32 URB entries */
>> + unsigned vs_chunks =
>> + ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) /
>> chunk_size_bytes;
>> + unsigned vs_wants =
>> + ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
>> + chunk_size_bytes) / chunk_size_bytes - vs_chunks;
>> +
>> + unsigned gs_chunks = 0;
>> + unsigned gs_wants = 0;
>> + if (gs_present) {
>> + /* There are two constraints on the minimum amount of URB space we
>> can
>> + * allocate:
>> + *
>> + * (1) We need room for at least 2 URB entries, since we always
>> operate
>> + * the GS in DUAL_OBJECT mode.
>> + *
>> + * (2) We can't allocate less than nr_gs_entries_granularity.
>> + */
>> + gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
>> + chunk_size_bytes) / chunk_size_bytes;
>> + gs_wants =
>> + ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
>> + chunk_size_bytes) / chunk_size_bytes - gs_chunks;
>> + }
>> +
>> + /* There should always be enough URB space to satisfy the minimum
>> + * requirements of each stage.
>> + */
>> + unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
>> + assert(total_needs <= urb_chunks);
>> +
>> + /* Mete out remaining space (if any) in proportion to "wants". */
>> + unsigned total_wants = vs_wants + gs_wants;
>> + unsigned remaining_space = urb_chunks - total_needs;
>> + if (remaining_space > total_wants)
>> + remaining_space = total_wants;
>> + if (remaining_space > 0) {
>> + unsigned vs_additional = (unsigned)
>> + round(vs_wants * (((double) remaining_space) / total_wants));
>> + vs_chunks += vs_additional;
>> + remaining_space -= vs_additional;
>> + gs_chunks += remaining_space;
>> + }
>> +
>> + /* Sanity check that we haven't over-allocated. */
>> + assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
>> +
>> + /* Finally, compute the number of entries that can fit in the space
>> + * allocated to each stage.
>> + */
>> + unsigned nr_vs_entries = vs_chunks * chunk_size_bytes /
>> vs_entry_size_bytes;
>> + unsigned nr_gs_entries = gs_chunks * chunk_size_bytes /
>> gs_entry_size_bytes;
>> +
>> + /* Since we rounded up when computing *_wants, this may be slightly
>> more
>> + * than the maximum allowed amount, so correct for that.
>> + */
>> + nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
>> + nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
>> +
>> + /* Ensure that we program a multiple of the granularity. */
>> + nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
>> + nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
>> +
>> + /* Finally, sanity check to make sure we have at least the minimum
>> number
>> + * of entries needed for each stage.
>> + */
>> + assert(nr_vs_entries >= 32);
>> + if (gs_present)
>> + assert(nr_gs_entries >= 2);
>> +
>> + /* And store the values we computed in brw so that they can be used by
>> + * other state atoms.
>> + */
>>
>
> Actually, no other atoms ever use these. The only reason these fields
> exist is for Gen4, where the VS_STATE, CLIP_STATE, etc. packets actually
> specified the URB configuration for that stage.
>
> I just set them because they were there, and putting data in them seemed
> better than leaving them uninitialized.
>
Ok, I've changed the comment to say:
/* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
* better to put reasonable data in there rather than leave them
* uninitialized.
*/
>
> However, I can't see any reason any other atom would /ever/ care, so I'm
> somewhat tempted to keep these as locals. Would be a tiny bit more
> efficient.
>
I could go either way on this. We can always do it in a follow-up patch if
this state atom winds up looming large in profiling runs.
>
> This patch seems pretty reasonable. It's rather complicated, but...I
> don't honestly know that I could come up with anything better.
>
>
> + brw->urb.nr_vs_entries = nr_vs_entries;
>> + brw->urb.nr_gs_entries = nr_gs_entries;
>> +
>> + /* Lay out the URB in the following order:
>> + * - push constants
>> + * - VS
>> + * - GS
>> + */
>> + brw->urb.vs_start = push_constant_chunks;
>> + brw->urb.gs_start = push_constant_chunks + vs_chunks;
>>
>> gen7_emit_vs_workaround_flush(brw);
>> - gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size,
>> brw->urb.vs_start);
>> + gen7_emit_urb_state(brw,
>> + brw->urb.nr_vs_entries, vs_size,
>> brw->urb.vs_start,
>> + brw->urb.nr_gs_entries, gs_size,
>> brw->urb.gs_start);
>> }
>>
>> void
>> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
>> - GLuint vs_size, GLuint vs_start)
>> +gen7_emit_urb_state(struct brw_context *brw,
>> + unsigned nr_vs_entries, unsigned vs_size,
>> + unsigned vs_start, unsigned nr_gs_entries,
>> + unsigned gs_size, unsigned gs_start)
>> {
>> BEGIN_BATCH(8);
>> OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
>> @@ -109,11 +211,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint
>> nr_vs_entries,
>> ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>>
>> - /* Allocate the GS, HS, and DS zero space - we don't use them. */
>> OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
>> - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>> + OUT_BATCH(nr_gs_entries |
>> + ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> + (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>>
>> + /* Allocate the HS and DS zero space - we don't use them. */
>> OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
>> OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
>> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>> @@ -127,8 +230,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint
>> nr_vs_entries,
>> const struct brw_tracked_state gen7_urb = {
>> .dirty = {
>> .mesa = 0,
>> - .brw = BRW_NEW_CONTEXT,
>> - .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG),
>> + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
>> + .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
>> },
>> .emit = gen7_upload_urb,
>> };
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130829/be4fa395/attachment-0001.html>
More information about the mesa-dev
mailing list