[Mesa-dev] [PATCH 08/22] i965/gs: Allocate URB space for use by GS.
Kenneth Graunke
kenneth at whitecape.org
Wed Aug 28 21:00:47 PDT 2013
On 08/26/2013 03:12 PM, Paul Berry wrote:
> Previously, we gave all of the URB space (other than the small amount
> that is used for push constants) to the vertex shader. However, when
> a geometry shader is active, we need to divide it up between the
> vertex and geometry shaders.
>
> The size of the URB entries for the vertex and geometry shaders can
> vary dramatically from one shader to the next. So it doesn't make
> sense to simply split the available space in two. In particular:
>
> - On Ivy Bridge GT1, this would not leave enough space for the worst
> case geometry shader, which requires 64k of URB space.
>
> - Due to hardware-imposed limits on the maximum number of URB entries,
> sometimes a given shader stage will only be capable of using a small
> amount of URB space. When this happens, it may make sense to
> allocate substantially less than half of the available space to that
> stage.
>
> Our algorithm for dividing space between the two stages is to first
> compute (a) the minimum amount of URB space that each stage needs in
> order to function properly, and (b) the amount of additional URB space
> that each stage "wants" (i.e. that it would be capable of making use
> of). If the total amount of space available is not enough to satisfy
> needs + wants, then each stage's "wants" amount is scaled back by the
> same factor in order to fit.
>
> When only a vertex shader is active, this algorithm produces
> equivalent results to the old algorithm (if the vertex shader stage
> can make use of all the available URB space, we assign all the space
> to it; if it can't, we let it use as much as it can).
>
> In the future, when we need to support tessellation control and
> tessellation evaluation pipeline stages, it should be straightforward
> to expand this algorithm to cover them.
>
> v2: Use "unsigned" rather than "GLuint".
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 6 +-
> src/mesa/drivers/dri/i965/gen7_blorp.cpp | 16 ++--
> src/mesa/drivers/dri/i965/gen7_urb.c | 155 +++++++++++++++++++++++++------
> 3 files changed, 142 insertions(+), 35 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index be5175f..77f2a6b 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1511,8 +1511,10 @@ void
> gen7_allocate_push_constants(struct brw_context *brw);
>
> void
> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
> - GLuint vs_size, GLuint vs_start);
> +gen7_emit_urb_state(struct brw_context *brw,
> + unsigned nr_vs_entries, unsigned vs_size,
> + unsigned vs_start, unsigned nr_gs_entries,
> + unsigned gs_size, unsigned gs_start);
>
>
>
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index a387836..6c798b1 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -51,14 +51,16 @@ static void
> gen7_blorp_emit_urb_config(struct brw_context *brw,
> const brw_blorp_params *params)
> {
> - /* The minimum valid value is 32. See 3DSTATE_URB_VS,
> - * Dword 1.15:0 "VS Number of URB Entries".
> + /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
> + * 1.15:0 "VS Number of URB Entries".
> */
> - int num_vs_entries = 32;
> - int vs_size = 2;
> - int vs_start = 2; /* skip over push constants */
> -
> - gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start);
> + gen7_emit_urb_state(brw,
> + 32 /* num_vs_entries */,
> + 2 /* vs_size */,
> + 2 /* vs_start */,
> + 0 /* num_gs_entries */,
> + 1 /* gs_size */,
> + 2 /* gs_start */);
> }
>
>
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 927af37..2d10cc12 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -74,34 +74,136 @@ gen7_upload_urb(struct brw_context *brw)
> {
> const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16;
>
> - /* Total space for entries is URB size - 16kB for push constants */
> - int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */
> -
> /* CACHE_NEW_VS_PROG */
> unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
> -
> - int nr_vs_entries = handle_region_size / (vs_size * 64);
> - if (nr_vs_entries > brw->urb.max_vs_entries)
> - nr_vs_entries = brw->urb.max_vs_entries;
> -
> - /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
> - brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);
> -
> - /* URB Starting Addresses are specified in multiples of 8kB. */
> - brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */
> -
> - assert(brw->urb.nr_vs_entries % 8 == 0);
> - assert(brw->urb.nr_gs_entries % 8 == 0);
> - /* GS requirement */
> - assert(!brw->ff_gs.prog_active);
> + unsigned vs_entry_size_bytes = vs_size * 64;
> + /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */
> + bool gs_present = brw->geometry_program;
> + unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
> + unsigned gs_entry_size_bytes = gs_size * 64;
> +
> + /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
> + *
> + * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
> + * Allocation Size is less than 9 512-bit URB entries.
> + *
> + * Similar text exists for GS.
> + */
> + unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
> + unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
> +
> + /* URB allocations must be done in 8k chunks. */
> + unsigned chunk_size_bytes = 8192;
> +
> + /* Determine the size of the URB in chunks.
> + */
> + unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
> +
> + /* Reserve space for push constants */
> + unsigned push_constant_bytes = 1024 * push_size_kB;
> + unsigned push_constant_chunks =
> + push_constant_bytes / chunk_size_bytes;
> +
> + /* Initially, assign each stage the minimum amount of URB space it needs,
> + * and make a note of how much additional space it "wants" (the amount of
> + * additional space it could actually make use of).
> + */
> +
> + /* VS always requires at least 32 URB entries */
> + unsigned vs_chunks =
> + ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / chunk_size_bytes;
> + unsigned vs_wants =
> + ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
> + chunk_size_bytes) / chunk_size_bytes - vs_chunks;
> +
> + unsigned gs_chunks = 0;
> + unsigned gs_wants = 0;
> + if (gs_present) {
> + /* There are two constraints on the minimum amount of URB space we can
> + * allocate:
> + *
> + * (1) We need room for at least 2 URB entries, since we always operate
> + * the GS in DUAL_OBJECT mode.
> + *
> + * (2) We can't allocate less than nr_gs_entries_granularity.
> + */
> + gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
> + chunk_size_bytes) / chunk_size_bytes;
> + gs_wants =
> + ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
> + chunk_size_bytes) / chunk_size_bytes - gs_chunks;
> + }
> +
> + /* There should always be enough URB space to satisfy the minimum
> + * requirements of each stage.
> + */
> + unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
> + assert(total_needs <= urb_chunks);
> +
> + /* Mete out remaining space (if any) in proportion to "wants". */
> + unsigned total_wants = vs_wants + gs_wants;
> + unsigned remaining_space = urb_chunks - total_needs;
> + if (remaining_space > total_wants)
> + remaining_space = total_wants;
> + if (remaining_space > 0) {
> + unsigned vs_additional = (unsigned)
> + round(vs_wants * (((double) remaining_space) / total_wants));
> + vs_chunks += vs_additional;
> + remaining_space -= vs_additional;
> + gs_chunks += remaining_space;
> + }
> +
> + /* Sanity check that we haven't over-allocated. */
> + assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
> +
> + /* Finally, compute the number of entries that can fit in the space
> + * allocated to each stage.
> + */
> + unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
> + unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
> +
> + /* Since we rounded up when computing *_wants, this may be slightly more
> + * than the maximum allowed amount, so correct for that.
> + */
> + nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
> + nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
> +
> + /* Ensure that we program a multiple of the granularity. */
> + nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
> + nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
> +
> + /* Finally, sanity check to make sure we have at least the minimum number
> + * of entries needed for each stage.
> + */
> + assert(nr_vs_entries >= 32);
> + if (gs_present)
> + assert(nr_gs_entries >= 2);
> +
> + /* And store the values we computed in brw so that they can be used by
> + * other state atoms.
> + */
Actually, no other atoms ever use these. The only reason these fields
exist is for Gen4, where the VS_STATE, CLIP_STATE, etc. packets actually
specified the URB configuration for that stage.
I just set them because they were there, and putting data in them seemed
better than leaving them uninitialized.
However, I can't see any reason any other atom would /ever/ care, so I'm
somewhat tempted to keep these as locals. Would be a tiny bit more
efficient.
This patch seems pretty reasonable. It's rather complicated, but...I
don't honestly know that I could come up with anything better.
> + brw->urb.nr_vs_entries = nr_vs_entries;
> + brw->urb.nr_gs_entries = nr_gs_entries;
> +
> + /* Lay out the URB in the following order:
> + * - push constants
> + * - VS
> + * - GS
> + */
> + brw->urb.vs_start = push_constant_chunks;
> + brw->urb.gs_start = push_constant_chunks + vs_chunks;
>
> gen7_emit_vs_workaround_flush(brw);
> - gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start);
> + gen7_emit_urb_state(brw,
> + brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
> + brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
> }
>
> void
> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
> - GLuint vs_size, GLuint vs_start)
> +gen7_emit_urb_state(struct brw_context *brw,
> + unsigned nr_vs_entries, unsigned vs_size,
> + unsigned vs_start, unsigned nr_gs_entries,
> + unsigned gs_size, unsigned gs_start)
> {
> BEGIN_BATCH(8);
> OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
> @@ -109,11 +211,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
> ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>
> - /* Allocate the GS, HS, and DS zero space - we don't use them. */
> OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
> - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
> - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
> + OUT_BATCH(nr_gs_entries |
> + ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
> + (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
>
> + /* Allocate the HS and DS zero space - we don't use them. */
> OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
> OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
> @@ -127,8 +230,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries,
> const struct brw_tracked_state gen7_urb = {
> .dirty = {
> .mesa = 0,
> - .brw = BRW_NEW_CONTEXT,
> - .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG),
> + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
> + .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
> },
> .emit = gen7_upload_urb,
> };
>
More information about the mesa-dev
mailing list