[Mesa-dev] [PATCH 21/22] i965/gen7: Generalize gen7_vs_state in preparation for GS.

Wed Aug 28 20:40:19 PDT 2013

On 08/26/2013 03:12 PM, Paul Berry wrote:
> ---
>   src/mesa/drivers/dri/i965/brw_state.h     |  41 ++++++++++
>   src/mesa/drivers/dri/i965/gen7_vs_state.c | 123 ++++++++++++++++++++----------
>   2 files changed, 122 insertions(+), 42 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index b54338a..efef994 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -128,6 +128,38 @@ extern const struct brw_tracked_state gen7_wm_state;
>   extern const struct brw_tracked_state haswell_cut_index;
>
>
> +/**
> + * Parameters that differ between Gen7 VS and GS state upload commands.
> + */
> +struct gen7_vec4_upload_params
> +{
> +   /**
> +    * Command used to set the binding table pointers for this stage.
> +    */
> +   unsigned binding_table_pointers_cmd;
> +
> +   /**
> +    * Command used to set the sampler state pointers for this stage.
> +    */
> +   unsigned sampler_state_pointers_cmd;
> +
> +   /**
> +    * Command used to send constants for this stage.
> +    */
> +   unsigned constant_cmd;
> +
> +   /**
> +    * Command used to send state for this stage.
> +    */
> +   unsigned state_cmd;
> +
> +   /**
> +    * Size of the state command for this stage.
> +    */
> +   unsigned state_cmd_size;
> +};
> +
> +
>   /* brw_misc_state.c */
>   void brw_upload_invariant_state(struct brw_context *brw);
>   uint32_t
> @@ -240,6 +272,15 @@ brw_vec4_upload_binding_table(struct brw_context *brw,
>                                 struct brw_vec4_context_base *vec4_ctx,
>                                 const struct brw_vec4_prog_data *prog_data);
>
> +/* gen7_vs_state.c */
> +void
> +gen7_upload_vec4_state(struct brw_context *brw,
> +                       const struct gen7_vec4_upload_params *upload_params,
> +                       const struct brw_vec4_context_base *vec4_ctx,
> +                       bool active, bool alt_floating_point_mode,
> +                       const struct brw_vec4_prog_data *prog_data,
> +                       const unsigned *stage_specific_cmd_data);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> index 30fe802..fd81112 100644
> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> @@ -29,33 +29,31 @@
>   #include "program/prog_statevars.h"
>   #include "intel_batchbuffer.h"
>
> -static void
> -upload_vs_state(struct brw_context *brw)
> -{
> -   struct gl_context *ctx = &brw->ctx;
> -   const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;
> -   uint32_t floating_point_mode = 0;
> -   const int max_threads_shift = brw->is_haswell ?
> -      HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
>
> -   gen7_emit_vs_workaround_flush(brw);
> -
> -   /* BRW_NEW_VS_BINDING_TABLE */
> +void
> +gen7_upload_vec4_state(struct brw_context *brw,
> +                       const struct gen7_vec4_upload_params *upload_params,
> +                       const struct brw_vec4_context_base *vec4_ctx,
> +                       bool active, bool alt_floating_point_mode,
> +                       const struct brw_vec4_prog_data *prog_data,
> +                       const unsigned *stage_specific_cmd_data)
> +{
> +   /* BRW_NEW_*_BINDING_TABLE */
>      BEGIN_BATCH(2);
> -   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_VS << 16 | (2 - 2));
> +   OUT_BATCH(upload_params->binding_table_pointers_cmd << 16 | (2 - 2));
>      OUT_BATCH(vec4_ctx->bind_bo_offset);
>      ADVANCE_BATCH();
>
>      /* CACHE_NEW_SAMPLER */
>      BEGIN_BATCH(2);
> -   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_VS << 16 | (2 - 2));
> +   OUT_BATCH(upload_params->sampler_state_pointers_cmd << 16 | (2 - 2));
>      OUT_BATCH(vec4_ctx->sampler_offset);
>      ADVANCE_BATCH();
>
> -   if (vec4_ctx->push_const_size == 0) {
> +   if (!active || vec4_ctx->push_const_size == 0) {
>         /* Disable the push constant buffers. */
>         BEGIN_BATCH(7);
> -      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));
> +      OUT_BATCH(upload_params->constant_cmd << 16 | (7 - 2));
>         OUT_BATCH(0);
>         OUT_BATCH(0);
>         OUT_BATCH(0);
> @@ -65,10 +63,10 @@ upload_vs_state(struct brw_context *brw)
>         ADVANCE_BATCH();
>      } else {
>         BEGIN_BATCH(7);
> -      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));
> +      OUT_BATCH(upload_params->constant_cmd << 16 | (7 - 2));
>         OUT_BATCH(vec4_ctx->push_const_size);
>         OUT_BATCH(0);
> -      /* Pointer to the VS constant buffer.  Covered by the set of
> +      /* Pointer to the stage's constant buffer.  Covered by the set of
>          * state flags from gen6_prepare_wm_contants
>          */
>         OUT_BATCH(vec4_ctx->push_const_offset | GEN7_MOCS_L3);
> @@ -78,36 +76,77 @@ upload_vs_state(struct brw_context *brw)
>         ADVANCE_BATCH();
>      }
>
> +   BEGIN_BATCH(upload_params->state_cmd_size);
> +   OUT_BATCH(upload_params->state_cmd << 16 |
> +             (upload_params->state_cmd_size - 2));
> +   if (active) {
> +      OUT_BATCH(vec4_ctx->prog_offset);
> +      OUT_BATCH((alt_floating_point_mode ? GEN6_FLOATING_POINT_MODE_ALT
> +                 : GEN6_FLOATING_POINT_MODE_IEEE_754) |
> +                ((ALIGN(vec4_ctx->sampler_count, 4)/4) <<
> +                 GEN6_SAMPLER_COUNT_SHIFT));
> +
> +      if (prog_data->total_scratch) {
> +         OUT_RELOC(vec4_ctx->scratch_bo,
> +                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> +                   ffs(prog_data->total_scratch) - 11);
> +      } else {
> +         OUT_BATCH(0);
> +      }
> +   } else {
> +      OUT_BATCH(0); /* prog_bo */
> +      OUT_BATCH((0 << GEN6_SAMPLER_COUNT_SHIFT) |
> +                (0 << GEN6_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> +      OUT_BATCH(0); /* scratch space base offset */
> +   }
> +   for (int i = 0; i < upload_params->state_cmd_size - 4; ++i)
> +      OUT_BATCH(stage_specific_cmd_data[i]);
> +   ADVANCE_BATCH();
> +}
> +
> +
> +static const struct gen7_vec4_upload_params vs_upload_params = {
> +   .binding_table_pointers_cmd = _3DSTATE_BINDING_TABLE_POINTERS_VS,
> +   .sampler_state_pointers_cmd = _3DSTATE_SAMPLER_STATE_POINTERS_VS,
> +   .constant_cmd = _3DSTATE_CONSTANT_VS,
> +   .state_cmd = _3DSTATE_VS,
> +   .state_cmd_size = 6,
> +};
> +
> +
> +static void
> +upload_vs_state(struct brw_context *brw)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;
> +   const int max_threads_shift = brw->is_haswell ?
> +      HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
> +   /* CACHE_NEW_VS_PROG */
> +   const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;
> +
> +   gen7_emit_vs_workaround_flush(brw);
> +
>      /* Use ALT floating point mode for ARB vertex programs, because they
>       * require 0^0 == 1.
>       */
> -   if (ctx->Shader.CurrentVertexProgram == NULL)
> -      floating_point_mode = GEN6_FLOATING_POINT_MODE_ALT;
> -
> -   BEGIN_BATCH(6);
> -   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
> -   OUT_BATCH(vec4_ctx->prog_offset);
> -   OUT_BATCH(floating_point_mode |
> -	     ((ALIGN(vec4_ctx->sampler_count, 4)/4) <<
> -              GEN6_SAMPLER_COUNT_SHIFT));
> -
> -   if (brw->vs.prog_data->base.total_scratch) {
> -      OUT_RELOC(vec4_ctx->scratch_bo,
> -		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> -		ffs(brw->vs.prog_data->base.total_scratch) - 11);
> -   } else {
> -      OUT_BATCH(0);
> -   }
> +   bool alt_floating_point_mode = (ctx->Shader.CurrentVertexProgram == NULL);
>
> -   OUT_BATCH((brw->vs.prog_data->base.dispatch_grf_start_reg <<
> -              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
> -	     (brw->vs.prog_data->base.urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
> -	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
> +   unsigned stage_specific_cmd_data[2];
> +   stage_specific_cmd_data[0] =
> +      (prog_data->dispatch_grf_start_reg <<
> +       GEN6_VS_DISPATCH_START_GRF_SHIFT) |
> +      (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
> +      (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT);
> +   stage_specific_cmd_data[1] =
> +      ((brw->max_vs_threads - 1) << max_threads_shift) |
> +      GEN6_VS_STATISTICS_ENABLE |
> +      GEN6_VS_ENABLE;
>
> -   OUT_BATCH(((brw->max_vs_threads - 1) << max_threads_shift) |
> -	     GEN6_VS_STATISTICS_ENABLE |
> -	     GEN6_VS_ENABLE);
> -   ADVANCE_BATCH();
> +   /* BRW_NEW_VS_BINDING_TABLE */
> +   /* CACHE_NEW_SAMPLER */
> +   gen7_upload_vec4_state(brw, &vs_upload_params, vec4_ctx, true /* active */,
> +                          alt_floating_point_mode, prog_data,
> +                          stage_specific_cmd_data);
>   }
>
>   const struct brw_tracked_state gen7_vs_state = {
>

I can't quite put my finger on why, but I really dislike this code 
reuse.  It just seems really awkward to me.

I feel like the code reuse is minimal; we're sharing really simple code 
here - nothing complicated.  Instead we get one packet split across 
multiple functions in separate files, making it harder to follow.

Plus, you end up with a function that awkwardly emits four separate 
packets...but it doesn't even know the names of the packets.  You have 
to pass them in via a struct.  It doesn't even know the length of the 
final packet.

Personally, I'd like to split the binding table and sampler table code 
out of the unit state.  (I had originally done them as a separate atom, 
but Eric suggested doing it this way.)

Instead, I'd envisioned uploading the new tables when we create them: 
the function that creates a new VS binding table would also emit 
3DSTATE_BINDING_TABLE_POINTERS_VS to upload it.  That way it's all tied 
neatly together in one place.  (We started doing something similar with 
BLEND_STATE recently.)  That doesn't necessarily need to happen today, 
but I feel like this pushes us away from it.

I've even thought about splitting out the 3DSTATE_CONSTANT commands into 
a separate atom, so we don't need to re-upload the program if only 
uniforms change.  Not sure whether that's a good idea yet.

For now, I'd be much happier to see patches 21 and 22 replaced by a 
patch that emits 3DSTATE_GS in the current style with no reuse.  I'm 
ambivalent about patch 20.