[Mesa-dev] [PATCH 09/26] i965: URB allocations for tessellation
Matt Turner
mattst88 at gmail.com
Thu Dec 3 12:01:11 PST 2015
On Wed, Dec 2, 2015 at 4:15 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> From: Chris Forbes <chrisf at ijw.co.nz>
>
> Signed-off-by: Chris Forbes <chrisf at ijw.co.nz>
The commit title should be an imperative statement. Maybe just add
"Add" to the beginning, e.g. "i965: Add URB allocations for tessellation".
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 17 +++-
> src/mesa/drivers/dri/i965/gen7_blorp.cpp | 8 ++
> src/mesa/drivers/dri/i965/gen7_urb.c | 162 +++++++++++++++++++++++++------
> 3 files changed, 157 insertions(+), 30 deletions(-)
>
> The URB code could use some janitorial work - using arrays based on
> MESA_SHADER_* instead of replicating a bunch of code would be much nicer.
>
> I just don't feel like doing it today.
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index e22f21d..88f6713 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -995,6 +995,8 @@ struct brw_context
> struct {
> GLuint vsize; /* vertex size plus header in urb registers */
> GLuint gsize; /* GS output size in urb registers */
> + GLuint hsize; /* Tessellation control output size in urb registers */
> + GLuint dsize; /* Tessellation evaluation output size in urb registers */
> GLuint csize; /* constant buffer size in urb registers */
> GLuint sfsize; /* setup data size in urb registers */
>
> @@ -1007,12 +1009,16 @@ struct brw_context
> GLuint max_gs_entries; /* Maximum number of GS entries */
>
> GLuint nr_vs_entries;
> + GLuint nr_hs_entries;
> + GLuint nr_ds_entries;
> GLuint nr_gs_entries;
> GLuint nr_clip_entries;
> GLuint nr_sf_entries;
> GLuint nr_cs_entries;
>
> GLuint vs_start;
> + GLuint hs_start;
> + GLuint ds_start;
> GLuint gs_start;
> GLuint clip_start;
> GLuint sf_start;
> @@ -1023,6 +1029,7 @@ struct brw_context
> * URB space for the GS.
> */
> bool gs_present;
> + bool ts_present;
> } urb;
>
>
> @@ -1628,12 +1635,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw);
> /* gen7_urb.c */
> void
> gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> + unsigned hs_size, unsigned ds_size,
> unsigned gs_size, unsigned fs_size);
>
> void
> gen7_emit_urb_state(struct brw_context *brw,
> - unsigned nr_vs_entries, unsigned vs_size,
> - unsigned vs_start, unsigned nr_gs_entries,
> + unsigned nr_vs_entries,
> + unsigned vs_size, unsigned vs_start,
> + unsigned nr_hs_entries,
> + unsigned hs_size, unsigned hs_start,
> + unsigned nr_ds_entries,
> + unsigned ds_size, unsigned ds_start,
> + unsigned nr_gs_entries,
> unsigned gs_size, unsigned gs_start);
>
>
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index e87b9d1..89b73ca 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
> unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
> gen7_emit_push_constant_state(brw,
> urb_size / 2 /* vs_size */,
> + 0 /* hs_size */,
> + 0 /* ds_size */,
> 0 /* gs_size */,
> urb_size / 2 /* fs_size */);
>
> @@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
> 32 /* num_vs_entries */,
> 2 /* vs_size */,
> 2 /* vs_start */,
> + 0 /* num_hs_entries */,
> + 1 /* hs_size */,
> + 2 /* hs_start */,
> + 0 /* num_ds_entries */,
> + 1 /* ds_size */,
> + 2 /* ds_start */,
> 0 /* num_gs_entries */,
> 1 /* gs_size */,
> 2 /* gs_start */);
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 161de77..9a09a19 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -34,7 +34,7 @@
> * __________-__________ _________________-_________________
> * / \ / \
> * +-------------------------------------------------------------+
> - * | VS/FS/GS Push | VS/GS URB |
> + * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
> * | Constants | Entries |
> * +-------------------------------------------------------------+
> *
> @@ -65,22 +65,29 @@ gen7_allocate_push_constants(struct brw_context *brw)
> (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1;
>
> /* BRW_NEW_GEOMETRY_PROGRAM */
> + int stages = 2;
> + /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TESS_EVAL_PROGRAM */
> bool gs_present = brw->geometry_program;
> + if (gs_present)
> + stages += 1;
> + bool ts_present = brw->tess_eval_program;
> + if (ts_present) {
> + assert(brw->tess_ctrl_program);
> + stages += 2;
> + }
>
> - unsigned vs_size, gs_size;
> - if (gs_present) {
> - vs_size = avail_size / 3;
> - avail_size -= vs_size;
> - gs_size = avail_size / 2;
> - avail_size -= gs_size;
> - } else {
> - vs_size = avail_size / 2;
> - avail_size -= vs_size;
> - gs_size = 0;
> + unsigned vs_size, hs_size = 0, ds_size = 0, gs_size = 0;
> + avail_size -= (vs_size = avail_size / stages--);
> + if (ts_present) {
> + avail_size -= (hs_size = avail_size / stages--);
> + avail_size -= (ds_size = avail_size / stages--);
Whew.
> }
> + if (gs_present)
> + avail_size -= (gs_size = avail_size / stages--);
> unsigned fs_size = avail_size;
>
> gen7_emit_push_constant_state(brw, multiplier * vs_size,
> + multiplier * hs_size, multiplier * ds_size,
> multiplier * gs_size, multiplier * fs_size);
>
> /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS):
> @@ -99,15 +106,24 @@ gen7_allocate_push_constants(struct brw_context *brw)
>
> void
> gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> + unsigned hs_size, unsigned ds_size,
> unsigned gs_size, unsigned fs_size)
> {
> unsigned offset = 0;
>
> - BEGIN_BATCH(6);
> + BEGIN_BATCH(10);
> OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
> OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> offset += vs_size;
>
> + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2));
> + OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> + offset += hs_size;
> +
> + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2));
> + OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> + offset += ds_size;
> +
> OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
> OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> offset += gs_size;
> @@ -130,7 +146,10 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> const struct brw_tracked_state gen7_push_constant_space = {
> .dirty = {
> .mesa = 0,
> - .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
> + .brw = BRW_NEW_CONTEXT |
> + BRW_NEW_GEOMETRY_PROGRAM |
> + BRW_NEW_TESS_CTRL_PROGRAM |
> + BRW_NEW_TESS_EVAL_PROGRAM,
> },
> .emit = gen7_allocate_push_constants,
> };
> @@ -138,6 +157,7 @@ const struct brw_tracked_state gen7_push_constant_space = {
> static void
> gen7_upload_urb(struct brw_context *brw)
> {
> + const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
> const int push_size_kB =
> (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;
>
> @@ -149,27 +169,46 @@ gen7_upload_urb(struct brw_context *brw)
> unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
> unsigned gs_entry_size_bytes = gs_size * 64;
>
> + /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TCS_PROG_DATA */
> + /* BRW_NEW_TESS_EVAL_PROGRAM, BRW_NEW_TES_PROG_DATA */
> + const bool ts_present = brw->tess_eval_program;
> + if (ts_present)
> + assert(brw->tess_ctrl_program);
> + unsigned hs_size = ts_present ? brw->tcs.prog_data->base.urb_entry_size : 1;
> + unsigned hs_entry_size_bytes = hs_size * 64;
> + unsigned ds_size = ts_present ? brw->tes.prog_data->base.urb_entry_size : 1;
> + unsigned ds_entry_size_bytes = ds_size * 64;
> +
> /* If we're just switching between programs with the same URB requirements,
> * skip the rest of the logic.
> */
> if (!(brw->ctx.NewDriverState & BRW_NEW_CONTEXT) &&
> brw->urb.vsize == vs_size &&
> brw->urb.gs_present == gs_present &&
> - brw->urb.gsize == gs_size) {
> + brw->urb.gsize == gs_size &&
> + brw->urb.ts_present == ts_present &&
> + brw->urb.hsize == hs_size &&
> + brw->urb.dsize == ds_size) {
> return;
> }
> brw->urb.vsize = vs_size;
> brw->urb.gs_present = gs_present;
> brw->urb.gsize = gs_size;
> + brw->urb.ts_present = ts_present;
> + brw->urb.hsize = hs_size;
> + brw->urb.dsize = ds_size;
> +
I suspect you didn't mean to add this newline.
>
> /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
> *
> * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
> * Allocation Size is less than 9 512-bit URB entries.
> *
> - * Similar text exists for GS.
> + * Similar text exists for HS, DS and GS.
> */
> unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
> + unsigned hs_granularity = (hs_size < 9) ? 8 : 1;
> + unsigned ds_granularity = (ds_size < 9) ? 8 : 1;
> unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
>
> /* URB allocations must be done in 8k chunks. */
> @@ -190,8 +229,10 @@ gen7_upload_urb(struct brw_context *brw)
> */
>
> /* VS has a lower limit on the number of URB entries */
> + unsigned vs_min_entries = ts_present ? 192 : brw->urb.min_vs_entries;
The 3DSTATE_URB_VS documentation says this 192-entry minimum for the VS when tessellation is enabled is BDW-only.
> +
> unsigned vs_chunks =
> - ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) /
> + ALIGN(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes) /
> chunk_size_bytes;
> unsigned vs_wants =
> ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
> @@ -215,14 +256,36 @@ gen7_upload_urb(struct brw_context *brw)
> chunk_size_bytes) / chunk_size_bytes - gs_chunks;
> }
>
> + unsigned hs_chunks = 0;
> + unsigned hs_wants = 0;
> + unsigned ds_chunks = 0;
> + unsigned ds_wants = 0;
> +
> + if (ts_present) {
> + hs_chunks =
> + ALIGN(hs_granularity * hs_entry_size_bytes, chunk_size_bytes) /
> + chunk_size_bytes;
> + hs_wants =
> + ALIGN(brw->urb.max_hs_entries * hs_entry_size_bytes,
> + chunk_size_bytes) / chunk_size_bytes - hs_chunks;
> +
> + ds_chunks =
> + ALIGN(devinfo->urb.min_ds_entries * ds_entry_size_bytes, chunk_size_bytes) /
> + chunk_size_bytes;
> + ds_wants =
> + ALIGN(brw->urb.max_ds_entries * ds_entry_size_bytes,
> + chunk_size_bytes) / chunk_size_bytes - ds_chunks;
Please align the overflowing expressions in these ALIGN()s.
> + }
> +
> /* There should always be enough URB space to satisfy the minimum
> * requirements of each stage.
> */
> - unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
> + unsigned total_needs = push_constant_chunks +
> + vs_chunks + hs_chunks + ds_chunks + gs_chunks;
> assert(total_needs <= urb_chunks);
>
> /* Mete out remaining space (if any) in proportion to "wants". */
> - unsigned total_wants = vs_wants + gs_wants;
> + unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants;
> unsigned remaining_space = urb_chunks - total_needs;
> if (remaining_space > total_wants)
> remaining_space = total_wants;
> @@ -231,61 +294,99 @@ gen7_upload_urb(struct brw_context *brw)
> roundf(vs_wants * (((float) remaining_space) / total_wants));
> vs_chunks += vs_additional;
> remaining_space -= vs_additional;
> + total_wants -= vs_wants;
> +
> + unsigned hs_additional = (unsigned)
> + round(hs_wants * (((double) remaining_space) / total_wants));
s/(unsigned) round/lround/
> + hs_chunks += hs_additional;
> + remaining_space -= hs_additional;
> + total_wants -= hs_wants;
> +
> + unsigned ds_additional = (unsigned)
> + round(ds_wants * (((double) remaining_space) / total_wants));
s/(unsigned) round/lround/
More information about the mesa-dev mailing list