[Mesa-dev] [PATCH 09/22] i965/gs: Allocate push constant space for use by GS.
Kenneth Graunke
kenneth at whitecape.org
Wed Aug 28 17:58:40 PDT 2013
On 08/26/2013 03:12 PM, Paul Berry wrote:
> Previously, we would always use the same push constant allocation
> regardless of what shader programs were being run: the available push
> constant space was split into 2 equal size partitions, one for the
> vertex shader, and one for the fragment shader.
>
> Now that we are adding geometry shader support, we need to do
> something smarter. This patch adjusts things so that when a geometry
> shader is in use, we split the available push constant space into 3
> nearly-equal size partitions instead of 2.
>
> Since the push constant allocation is now affected by GL state, it can
> no longer be set up by brw_upload_initial_gpu_state(); instead it must
> be set up by a state atom.
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 3 +-
> src/mesa/drivers/dri/i965/brw_defines.h | 1 +
> src/mesa/drivers/dri/i965/brw_state.h | 4 +-
> src/mesa/drivers/dri/i965/brw_state_upload.c | 5 +-
> src/mesa/drivers/dri/i965/gen7_blorp.cpp | 6 ++
> src/mesa/drivers/dri/i965/gen7_urb.c | 101 +++++++++++++++++++++++----
> 6 files changed, 98 insertions(+), 22 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 77f2a6b..95f9bb2 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1508,7 +1508,8 @@ gen6_get_sample_position(struct gl_context *ctx,
>
> /* gen7_urb.c */
> void
> -gen7_allocate_push_constants(struct brw_context *brw);
> +gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> + unsigned gs_size, unsigned fs_size);
>
> void
> gen7_emit_urb_state(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 832ff55..8d9a824 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1284,6 +1284,7 @@ enum brw_message_target {
> # define GEN7_URB_STARTING_ADDRESS_SHIFT 25
>
> #define _3DSTATE_PUSH_CONSTANT_ALLOC_VS 0x7912 /* GEN7+ */
> +#define _3DSTATE_PUSH_CONSTANT_ALLOC_GS 0x7915 /* GEN7+ */
> #define _3DSTATE_PUSH_CONSTANT_ALLOC_PS 0x7916 /* GEN7+ */
> # define GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT 16
>
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 85f82fe..4814639 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -112,6 +112,7 @@ extern const struct brw_tracked_state gen7_cc_viewport_state_pointer;
> extern const struct brw_tracked_state gen7_clip_state;
> extern const struct brw_tracked_state gen7_disable_stages;
> extern const struct brw_tracked_state gen7_ps_state;
> +extern const struct brw_tracked_state gen7_push_constant_space;
> extern const struct brw_tracked_state gen7_sbe_state;
> extern const struct brw_tracked_state gen7_sf_clip_viewport;
> extern const struct brw_tracked_state gen7_sf_state;
> @@ -220,9 +221,6 @@ uint32_t
> get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
> int fs_attr, bool two_side_color, uint32_t *max_source_attr);
>
> -/* gen7_urb.c */
> -void gen7_allocate_push_constants(struct brw_context *brw);
> -
> #ifdef __cplusplus
> }
> #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index b883002..9638c69 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -188,6 +188,7 @@ static const struct brw_tracked_state *gen7_atoms[] =
> &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
> &gen7_sf_clip_viewport,
>
> + &gen7_push_constant_space,
> &gen7_urb,
> &gen6_blend_state, /* must do before cc unit */
> &gen6_color_calc_state, /* must do before cc unit */
> @@ -251,10 +252,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
> return;
>
> brw_upload_invariant_state(brw);
> -
> - if (brw->gen >= 7) {
> - gen7_allocate_push_constants(brw);
> - }
> }
>
> void brw_init_state( struct brw_context *brw )
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index 6c798b1..9df3d92 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -51,6 +51,12 @@ static void
> gen7_blorp_emit_urb_config(struct brw_context *brw,
> const brw_blorp_params *params)
> {
> + unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
> + gen7_emit_push_constant_state(brw,
> + urb_size / 2 /* vs_size */,
> + 0 /* gs_size */,
> + urb_size / 2 /* fs_size */);
> +
> /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
> * 1.15:0 "VS Number of URB Entries".
> */
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 2d10cc12..4dc8f6e 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -30,12 +30,12 @@
> /**
> * The following diagram shows how we partition the URB:
> *
> - * 8kB 8kB Rest of the URB space
> - * ____-____ ____-____ _________________-_________________
> - * / \ / \ / \
> + * 16kB or 32kB Rest of the URB space
> + * __________-__________ _________________-_________________
> + * / \ / \
> * +-------------------------------------------------------------+
> - * | VS Push | FS Push | VS |
> - * | Constants | Constants | Handles |
> + * | VS/FS/GS Push | VS/GS URB |
> + * | Constants | Entries |
> * +-------------------------------------------------------------+
> *
> * Notably, push constants must be stored at the beginning of the URB
> @@ -43,8 +43,12 @@
> * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
> * doubles this (32kB).
> *
> - * Currently we split the constant buffer space evenly between VS and FS.
> - * This is probably not ideal, but simple.
> + * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
> + * sized) in increments of 1kB. Haswell GT3 requires them to be located and
> + * sized in increments of 2kB.
> + *
> + * Currently we split the constant buffer space evenly among whatever stages
> + * are active. This is probably not ideal, but simple.
> *
> * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
> * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
> @@ -53,22 +57,91 @@
> * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
> * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
> */
> -void
> +static void
> gen7_allocate_push_constants(struct brw_context *brw)
> {
> - unsigned size = 8;
> - if (brw->is_haswell && brw->gt == 3)
> - size = 16;
> + unsigned avail_size = 16;
> + unsigned multiplier = (brw->is_haswell && brw->gt == 3) ? 2 : 1;
> +
> + /* BRW_NEW_GEOMETRY_PROGRAM */
> + bool gs_present = brw->geometry_program;
> +
> + unsigned vs_size, gs_size;
> + if (gs_present) {
> + vs_size = avail_size / 3;
> + avail_size -= vs_size;
> + gs_size = avail_size / 2;
> + avail_size -= gs_size;
> + } else {
> + vs_size = avail_size / 2;
> + avail_size -= vs_size;
> + gs_size = 0;
> + }
> + unsigned fs_size = avail_size;
> +
> + gen7_emit_push_constant_state(brw, multiplier * vs_size,
> + multiplier * gs_size, multiplier * fs_size);
> +}
> +
> +void
> +gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> + unsigned gs_size, unsigned fs_size)
> +{
> + unsigned offset = 0;
>
> - BEGIN_BATCH(4);
> + BEGIN_BATCH(6);
> OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
> - OUT_BATCH(size);
> + OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> + offset += vs_size;
> +
> + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
> + OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> + offset += gs_size;
>
> OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_PS << 16 | (2 - 2));
> - OUT_BATCH(size | size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> + OUT_BATCH(offset | fs_size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> ADVANCE_BATCH();
> +
> + /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
> + *
> + * A PIPE_CONTOL command with the CS Stall bit set must be programmed
> + * in the ring after this instruction.
> + *
> + * No such restriction exists for Haswell.
> + */
> + if (!brw->is_haswell) {
> + BEGIN_BATCH(4);
> + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
> + /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
> + * CS Stall):
> + *
> + * One of the following must also be set:
> + * - Render Target Cache Flush Enable ([12] of DW1)
> + * - Depth Cache Flush Enable ([0] of DW1)
> + * - Stall at Pixel Scoreboard ([1] of DW1)
> + * - Depth Stall ([13] of DW1)
> + * - Post-Sync Operation ([13] of DW1)
> + *
> + * We choose to do a Post-Sync Operation (Write Immediate Data), since
> + * it seems like it will incur the least additional performance penalty.
> + */
> + OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
> + OUT_RELOC(brw->batch.workaround_bo,
> + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
> + OUT_BATCH(0);
> + ADVANCE_BATCH();
> + }
This is really a bug fix, since it's implementing a hardware workaround
on a packet we've always been emitting. I'd really like to see it in a
separate patch, for easier bisectability.
Interestingly, this text doesn't appear in the latest documentation, and
I don't see it in the workaround database. It is definitely there in
the PRM, though. I'm not sure what to make of it.
> }
>
> +const struct brw_tracked_state gen7_push_constant_space = {
> + .dirty = {
> + .mesa = 0,
> + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
It's worth noting that this will cause a full pipeline stall whenever
the current geometry shader changes (since 3DSTATE_PUSH_CONSTANT_ALLOC_*
are non-pipelined). We could avoid this when switching between two
different geometry shaders. But I suspect it's premature to care about
that. Not sure it'll matter much anyway.
> + .cache = 0,
> + },
> + .emit = gen7_allocate_push_constants,
> +};
> +
> static void
> gen7_upload_urb(struct brw_context *brw)
> {
>
Other than pulling the PIPE_CONTROL into a separate patch, this looks
fine to me.
More information about the mesa-dev
mailing list