[Mesa-dev] [PATCH 09/22] i965/gs: Allocate push constant space for use by GS.

Kenneth Graunke kenneth at whitecape.org
Wed Aug 28 17:58:40 PDT 2013


On 08/26/2013 03:12 PM, Paul Berry wrote:
> Previously, we would always use the same push constant allocation
> regardless of what shader programs were being run: the available push
> constant space was split into 2 equal size partitions, one for the
> vertex shader, and one for the fragment shader.
>
> Now that we are adding geometry shader support, we need to do
> something smarter.  This patch adjusts things so that when a geometry
> shader is in use, we split the available push constant space into 3
> nearly-equal size partitions instead of 2.
>
> Since the push constant allocation is now affected by GL state, it can
> no longer be set up by brw_upload_initial_gpu_state(); instead it must
> be set up by a state atom.
> ---
>   src/mesa/drivers/dri/i965/brw_context.h      |   3 +-
>   src/mesa/drivers/dri/i965/brw_defines.h      |   1 +
>   src/mesa/drivers/dri/i965/brw_state.h        |   4 +-
>   src/mesa/drivers/dri/i965/brw_state_upload.c |   5 +-
>   src/mesa/drivers/dri/i965/gen7_blorp.cpp     |   6 ++
>   src/mesa/drivers/dri/i965/gen7_urb.c         | 101 +++++++++++++++++++++++----
>   6 files changed, 98 insertions(+), 22 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 77f2a6b..95f9bb2 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1508,7 +1508,8 @@ gen6_get_sample_position(struct gl_context *ctx,
>
>   /* gen7_urb.c */
>   void
> -gen7_allocate_push_constants(struct brw_context *brw);
> +gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> +                              unsigned gs_size, unsigned fs_size);
>
>   void
>   gen7_emit_urb_state(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 832ff55..8d9a824 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1284,6 +1284,7 @@ enum brw_message_target {
>   # define GEN7_URB_STARTING_ADDRESS_SHIFT                25
>
>   #define _3DSTATE_PUSH_CONSTANT_ALLOC_VS         0x7912 /* GEN7+ */
> +#define _3DSTATE_PUSH_CONSTANT_ALLOC_GS         0x7915 /* GEN7+ */
>   #define _3DSTATE_PUSH_CONSTANT_ALLOC_PS         0x7916 /* GEN7+ */
>   # define GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT         16
>
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 85f82fe..4814639 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -112,6 +112,7 @@ extern const struct brw_tracked_state gen7_cc_viewport_state_pointer;
>   extern const struct brw_tracked_state gen7_clip_state;
>   extern const struct brw_tracked_state gen7_disable_stages;
>   extern const struct brw_tracked_state gen7_ps_state;
> +extern const struct brw_tracked_state gen7_push_constant_space;
>   extern const struct brw_tracked_state gen7_sbe_state;
>   extern const struct brw_tracked_state gen7_sf_clip_viewport;
>   extern const struct brw_tracked_state gen7_sf_state;
> @@ -220,9 +221,6 @@ uint32_t
>   get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
>                     int fs_attr, bool two_side_color, uint32_t *max_source_attr);
>
> -/* gen7_urb.c */
> -void gen7_allocate_push_constants(struct brw_context *brw);
> -
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index b883002..9638c69 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -188,6 +188,7 @@ static const struct brw_tracked_state *gen7_atoms[] =
>      &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
>      &gen7_sf_clip_viewport,
>
> +   &gen7_push_constant_space,
>      &gen7_urb,
>      &gen6_blend_state,		/* must do before cc unit */
>      &gen6_color_calc_state,	/* must do before cc unit */
> @@ -251,10 +252,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
>         return;
>
>      brw_upload_invariant_state(brw);
> -
> -   if (brw->gen >= 7) {
> -      gen7_allocate_push_constants(brw);
> -   }
>   }
>
>   void brw_init_state( struct brw_context *brw )
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index 6c798b1..9df3d92 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -51,6 +51,12 @@ static void
>   gen7_blorp_emit_urb_config(struct brw_context *brw,
>                              const brw_blorp_params *params)
>   {
> +   unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
> +   gen7_emit_push_constant_state(brw,
> +                                 urb_size / 2 /* vs_size */,
> +                                 0 /* gs_size */,
> +                                 urb_size / 2 /* fs_size */);
> +
>      /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
>       * 1.15:0 "VS Number of URB Entries".
>       */
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 2d10cc12..4dc8f6e 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -30,12 +30,12 @@
>   /**
>    * The following diagram shows how we partition the URB:
>    *
> - *      8kB         8kB              Rest of the URB space
> - *   ____-____   ____-____   _________________-_________________
> - *  /         \ /         \ /                                   \
> + *        16kB or 32kB               Rest of the URB space
> + *   __________-__________   _________________-_________________
> + *  /                     \ /                                   \
>    * +-------------------------------------------------------------+
> - * | VS Push   | FS Push   | VS                                  |
> - * | Constants | Constants | Handles                             |
> + * |     VS/FS/GS Push     |              VS/GS URB              |
> + * |       Constants       |               Entries               |
>    * +-------------------------------------------------------------+
>    *
>    * Notably, push constants must be stored at the beginning of the URB
> @@ -43,8 +43,12 @@
>    * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
>    * doubles this (32kB).
>    *
> - * Currently we split the constant buffer space evenly between VS and FS.
> - * This is probably not ideal, but simple.
> + * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
> + * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
> + * sized in increments of 2kB.
> + *
> + * Currently we split the constant buffer space evenly among whatever stages
> + * are active.  This is probably not ideal, but simple.
>    *
>    * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
>    * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
> @@ -53,22 +57,91 @@
>    * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
>    * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
>    */
> -void
> +static void
>   gen7_allocate_push_constants(struct brw_context *brw)
>   {
> -   unsigned size = 8;
> -   if (brw->is_haswell && brw->gt == 3)
> -      size = 16;
> +   unsigned avail_size = 16;
> +   unsigned multiplier = (brw->is_haswell && brw->gt == 3) ? 2 : 1;
> +
> +   /* BRW_NEW_GEOMETRY_PROGRAM */
> +   bool gs_present = brw->geometry_program;
> +
> +   unsigned vs_size, gs_size;
> +   if (gs_present) {
> +      vs_size = avail_size / 3;
> +      avail_size -= vs_size;
> +      gs_size = avail_size / 2;
> +      avail_size -= gs_size;
> +   } else {
> +      vs_size = avail_size / 2;
> +      avail_size -= vs_size;
> +      gs_size = 0;
> +   }
> +   unsigned fs_size = avail_size;
> +
> +   gen7_emit_push_constant_state(brw, multiplier * vs_size,
> +                                 multiplier * gs_size, multiplier * fs_size);
> +}
> +
> +void
> +gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
> +                              unsigned gs_size, unsigned fs_size)
> +{
> +   unsigned offset = 0;
>
> -   BEGIN_BATCH(4);
> +   BEGIN_BATCH(6);
>      OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
> -   OUT_BATCH(size);
> +   OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> +   offset += vs_size;
> +
> +   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
> +   OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> +   offset += gs_size;
>
>      OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_PS << 16 | (2 - 2));
> -   OUT_BATCH(size | size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
> +   OUT_BATCH(offset | fs_size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
>      ADVANCE_BATCH();
> +
> +   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
> +    *
> +    *     A PIPE_CONTOL command with the CS Stall bit set must be programmed
> +    *     in the ring after this instruction.
> +    *
> +    * No such restriction exists for Haswell.
> +    */
> +   if (!brw->is_haswell) {
> +      BEGIN_BATCH(4);
> +      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
> +      /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
> +       * CS Stall):
> +       *
> +       *     One of the following must also be set:
> +       *     - Render Target Cache Flush Enable ([12] of DW1)
> +       *     - Depth Cache Flush Enable ([0] of DW1)
> +       *     - Stall at Pixel Scoreboard ([1] of DW1)
> +       *     - Depth Stall ([13] of DW1)
> +       *     - Post-Sync Operation ([13] of DW1)
> +       *
> +       * We choose to do a Post-Sync Operation (Write Immediate Data), since
> +       * it seems like it will incur the least additional performance penalty.
> +       */
> +      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
> +      OUT_RELOC(brw->batch.workaround_bo,
> +                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
> +      OUT_BATCH(0);
> +      ADVANCE_BATCH();
> +   }

This is really a bug fix, since it's implementing a hardware workaround 
on a packet we've always been emitting.  I'd really like to see it in a 
separate patch, for easier bisectability.

Interestingly, this text doesn't appear in the latest documentation, and 
I don't see it in the workaround database.  It is definitely there in 
the PRM, though.  I'm not sure what to make of it.

>   }
>
> +const struct brw_tracked_state gen7_push_constant_space = {
> +   .dirty = {
> +      .mesa = 0,
> +      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,

It's worth noting that this will cause a full pipeline stall whenever 
the current geometry shader changes (since 3DSTATE_PUSH_CONSTANT_ALLOC_* 
are non-pipelined).  We could avoid this when switching between two 
different geometry shaders.  But I suspect it's premature to care about 
that.  Not sure it'll matter much anyway.

> +      .cache = 0,
> +   },
> +   .emit = gen7_allocate_push_constants,
> +};
> +
>   static void
>   gen7_upload_urb(struct brw_context *brw)
>   {
>

Other than pulling the PIPE_CONTROL into a separate patch, this looks 
fine to me.


More information about the mesa-dev mailing list