[Mesa-dev] [PATCH 21/22] i965/gen7: Generalize gen7_vs_state in preparation for GS.

Thu Aug 29 00:57:45 PDT 2013

On 28 August 2013 20:40, Kenneth Graunke <kenneth at whitecape.org> wrote:

> On 08/26/2013 03:12 PM, Paul Berry wrote:
>
>> ---
>>   src/mesa/drivers/dri/i965/brw_**state.h     |  41 ++++++++++
>>   src/mesa/drivers/dri/i965/**gen7_vs_state.c | 123
>> ++++++++++++++++++++----------
>>   2 files changed, 122 insertions(+), 42 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/**brw_state.h
>> b/src/mesa/drivers/dri/i965/**brw_state.h
>> index b54338a..efef994 100644
>> --- a/src/mesa/drivers/dri/i965/**brw_state.h
>> +++ b/src/mesa/drivers/dri/i965/**brw_state.h
>> @@ -128,6 +128,38 @@ extern const struct brw_tracked_state gen7_wm_state;
>>   extern const struct brw_tracked_state haswell_cut_index;
>>
>>
>> +/**
>> + * Parameters that differ between Gen7 VS and GS state upload commands.
>> + */
>> +struct gen7_vec4_upload_params
>> +{
>> +   /**
>> +    * Command used to set the binding table pointers for this stage.
>> +    */
>> +   unsigned binding_table_pointers_cmd;
>> +
>> +   /**
>> +    * Command used to set the sampler state pointers for this stage.
>> +    */
>> +   unsigned sampler_state_pointers_cmd;
>> +
>> +   /**
>> +    * Command used to send constants for this stage.
>> +    */
>> +   unsigned constant_cmd;
>> +
>> +   /**
>> +    * Command used to send state for this stage.
>> +    */
>> +   unsigned state_cmd;
>> +
>> +   /**
>> +    * Size of the state command for this stage.
>> +    */
>> +   unsigned state_cmd_size;
>> +};
>> +
>> +
>>   /* brw_misc_state.c */
>>   void brw_upload_invariant_state(**struct brw_context *brw);
>>   uint32_t
>> @@ -240,6 +272,15 @@ brw_vec4_upload_binding_table(**struct brw_context
>> *brw,
>>                                 struct brw_vec4_context_base *vec4_ctx,
>>                                 const struct brw_vec4_prog_data
>> *prog_data);
>>
>> +/* gen7_vs_state.c */
>> +void
>> +gen7_upload_vec4_state(struct brw_context *brw,
>> +                       const struct gen7_vec4_upload_params
>> *upload_params,
>> +                       const struct brw_vec4_context_base *vec4_ctx,
>> +                       bool active, bool alt_floating_point_mode,
>> +                       const struct brw_vec4_prog_data *prog_data,
>> +                       const unsigned *stage_specific_cmd_data);
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/src/mesa/drivers/dri/i965/**gen7_vs_state.c
>> b/src/mesa/drivers/dri/i965/**gen7_vs_state.c
>> index 30fe802..fd81112 100644
>> --- a/src/mesa/drivers/dri/i965/**gen7_vs_state.c
>> +++ b/src/mesa/drivers/dri/i965/**gen7_vs_state.c
>> @@ -29,33 +29,31 @@
>>   #include "program/prog_statevars.h"
>>   #include "intel_batchbuffer.h"
>>
>> -static void
>> -upload_vs_state(struct brw_context *brw)
>> -{
>> -   struct gl_context *ctx = &brw->ctx;
>> -   const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;
>> -   uint32_t floating_point_mode = 0;
>> -   const int max_threads_shift = brw->is_haswell ?
>> -      HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
>>
>> -   gen7_emit_vs_workaround_flush(**brw);
>> -
>> -   /* BRW_NEW_VS_BINDING_TABLE */
>> +void
>> +gen7_upload_vec4_state(struct brw_context *brw,
>> +                       const struct gen7_vec4_upload_params
>> *upload_params,
>> +                       const struct brw_vec4_context_base *vec4_ctx,
>> +                       bool active, bool alt_floating_point_mode,
>> +                       const struct brw_vec4_prog_data *prog_data,
>> +                       const unsigned *stage_specific_cmd_data)
>> +{
>> +   /* BRW_NEW_*_BINDING_TABLE */
>>      BEGIN_BATCH(2);
>> -   OUT_BATCH(_3DSTATE_BINDING_**TABLE_POINTERS_VS << 16 | (2 - 2));
>> +   OUT_BATCH(upload_params->**binding_table_pointers_cmd << 16 | (2 -
>> 2));
>>      OUT_BATCH(vec4_ctx->bind_bo_**offset);
>>      ADVANCE_BATCH();
>>
>>      /* CACHE_NEW_SAMPLER */
>>      BEGIN_BATCH(2);
>> -   OUT_BATCH(_3DSTATE_SAMPLER_**STATE_POINTERS_VS << 16 | (2 - 2));
>> +   OUT_BATCH(upload_params->**sampler_state_pointers_cmd << 16 | (2 -
>> 2));
>>      OUT_BATCH(vec4_ctx->sampler_**offset);
>>      ADVANCE_BATCH();
>>
>> -   if (vec4_ctx->push_const_size == 0) {
>> +   if (!active || vec4_ctx->push_const_size == 0) {
>>         /* Disable the push constant buffers. */
>>         BEGIN_BATCH(7);
>> -      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));
>> +      OUT_BATCH(upload_params->**constant_cmd << 16 | (7 - 2));
>>         OUT_BATCH(0);
>>         OUT_BATCH(0);
>>         OUT_BATCH(0);
>> @@ -65,10 +63,10 @@ upload_vs_state(struct brw_context *brw)
>>         ADVANCE_BATCH();
>>      } else {
>>         BEGIN_BATCH(7);
>> -      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));
>> +      OUT_BATCH(upload_params->**constant_cmd << 16 | (7 - 2));
>>         OUT_BATCH(vec4_ctx->push_**const_size);
>>         OUT_BATCH(0);
>> -      /* Pointer to the VS constant buffer.  Covered by the set of
>> +      /* Pointer to the stage's constant buffer.  Covered by the set of
>>          * state flags from gen6_prepare_wm_contants
>>          */
>>         OUT_BATCH(vec4_ctx->push_**const_offset | GEN7_MOCS_L3);
>> @@ -78,36 +76,77 @@ upload_vs_state(struct brw_context *brw)
>>         ADVANCE_BATCH();
>>      }
>>
>> +   BEGIN_BATCH(upload_params->**state_cmd_size);
>> +   OUT_BATCH(upload_params->**state_cmd << 16 |
>> +             (upload_params->state_cmd_size - 2));
>> +   if (active) {
>> +      OUT_BATCH(vec4_ctx->prog_**offset);
>> +      OUT_BATCH((alt_floating_point_**mode ?
>> GEN6_FLOATING_POINT_MODE_ALT
>> +                 : GEN6_FLOATING_POINT_MODE_IEEE_**754) |
>> +                ((ALIGN(vec4_ctx->sampler_**count, 4)/4) <<
>> +                 GEN6_SAMPLER_COUNT_SHIFT));
>> +
>> +      if (prog_data->total_scratch) {
>> +         OUT_RELOC(vec4_ctx->scratch_**bo,
>> +                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
>> +                   ffs(prog_data->total_scratch) - 11);
>> +      } else {
>> +         OUT_BATCH(0);
>> +      }
>> +   } else {
>> +      OUT_BATCH(0); /* prog_bo */
>> +      OUT_BATCH((0 << GEN6_SAMPLER_COUNT_SHIFT) |
>> +                (0 << GEN6_BINDING_TABLE_ENTRY_**COUNT_SHIFT));
>> +      OUT_BATCH(0); /* scratch space base offset */
>> +   }
>> +   for (int i = 0; i < upload_params->state_cmd_size - 4; ++i)
>> +      OUT_BATCH(stage_specific_cmd_**data[i]);
>> +   ADVANCE_BATCH();
>> +}
>> +
>> +
>> +static const struct gen7_vec4_upload_params vs_upload_params = {
>> +   .binding_table_pointers_cmd = _3DSTATE_BINDING_TABLE_**POINTERS_VS,
>> +   .sampler_state_pointers_cmd = _3DSTATE_SAMPLER_STATE_**POINTERS_VS,
>> +   .constant_cmd = _3DSTATE_CONSTANT_VS,
>> +   .state_cmd = _3DSTATE_VS,
>> +   .state_cmd_size = 6,
>> +};
>> +
>> +
>> +static void
>> +upload_vs_state(struct brw_context *brw)
>> +{
>> +   struct gl_context *ctx = &brw->ctx;
>> +   const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;
>> +   const int max_threads_shift = brw->is_haswell ?
>> +      HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
>> +   /* CACHE_NEW_VS_PROG */
>> +   const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;
>> +
>> +   gen7_emit_vs_workaround_flush(**brw);
>> +
>>      /* Use ALT floating point mode for ARB vertex programs, because they
>>       * require 0^0 == 1.
>>       */
>> -   if (ctx->Shader.**CurrentVertexProgram == NULL)
>> -      floating_point_mode = GEN6_FLOATING_POINT_MODE_ALT;
>> -
>> -   BEGIN_BATCH(6);
>> -   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
>> -   OUT_BATCH(vec4_ctx->prog_**offset);
>> -   OUT_BATCH(floating_point_mode |
>> -            ((ALIGN(vec4_ctx->sampler_**count, 4)/4) <<
>> -              GEN6_SAMPLER_COUNT_SHIFT));
>> -
>> -   if (brw->vs.prog_data->base.**total_scratch) {
>> -      OUT_RELOC(vec4_ctx->scratch_**bo,
>> -               I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
>> -               ffs(brw->vs.prog_data->base.**total_scratch) - 11);
>> -   } else {
>> -      OUT_BATCH(0);
>> -   }
>> +   bool alt_floating_point_mode = (ctx->Shader.**CurrentVertexProgram
>> == NULL);
>>
>> -   OUT_BATCH((brw->vs.prog_data->**base.dispatch_grf_start_reg <<
>> -              GEN6_VS_DISPATCH_START_GRF_**SHIFT) |
>> -            (brw->vs.prog_data->base.urb_**read_length <<
>> GEN6_VS_URB_READ_LENGTH_SHIFT) |
>> -            (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_**SHIFT));
>> +   unsigned stage_specific_cmd_data[2];
>> +   stage_specific_cmd_data[0] =
>> +      (prog_data->dispatch_grf_**start_reg <<
>> +       GEN6_VS_DISPATCH_START_GRF_**SHIFT) |
>> +      (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
>> +      (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_**SHIFT);
>> +   stage_specific_cmd_data[1] =
>> +      ((brw->max_vs_threads - 1) << max_threads_shift) |
>> +      GEN6_VS_STATISTICS_ENABLE |
>> +      GEN6_VS_ENABLE;
>>
>> -   OUT_BATCH(((brw->max_vs_**threads - 1) << max_threads_shift) |
>> -            GEN6_VS_STATISTICS_ENABLE |
>> -            GEN6_VS_ENABLE);
>> -   ADVANCE_BATCH();
>> +   /* BRW_NEW_VS_BINDING_TABLE */
>> +   /* CACHE_NEW_SAMPLER */
>> +   gen7_upload_vec4_state(brw, &vs_upload_params, vec4_ctx, true /*
>> active */,
>> +                          alt_floating_point_mode, prog_data,
>> +                          stage_specific_cmd_data);
>>   }
>>
>>   const struct brw_tracked_state gen7_vs_state = {
>>
>>
> I can't quite put my finger on why, but I really dislike this code reuse.
>  It just seems really awkward to me.
>
> I feel like the code reuse is minimal; we're sharing really simple code
> here - nothing complicated.  Instead we get one packet split across
> multiple functions in separate files, making it harder to follow.
>
> Plus, you end up with a function that awkwardly emits four separate
> packets...but it doesn't even know the names of the packets.  You have to
> pass them in via a struct.  It doesn't even know the length of the final
> packet.
>
> Personally, I'd like to split the binding table and sampler table code out
> of the unit state.  (I had originally done them as a separate atom, but
> Eric suggested doing it this way.)
>
> Instead, I'd envisioned uploading the new tables when we create them: the
> function that creates a new VS binding table would also emit
> 3DSTATE_BINDING_TABLE_**POINTERS_VS to upload it.  That way it's all tied
> neatly together in one place.  (We started doing something similar with
> BLEND_STATE recently.)  That doesn't necessarily need to happen today, but
> I feel like this pushes us away from it.
>
> I've even thought about splitting out the 3DSTATE_CONSTANT commands into a
> separate atom, so we don't need to re-upload the program if only uniforms
> change.  Not sure whether that's a good idea yet.
>
> For now, I'd be much happier to see patches 21 and 22 replaced by a patch
> that emits 3DSTATE_GS in the current style with no reuse.  I'm ambivalent
> about patch 20.
>

Would you have some time tomorrow to do some in-person brainstorming about
other ways we might be able to structure this patch without completely
giving up on reuse?

I'm asking because I'm really concerned about the amount of code
duplication we have in the back-end state emission logic right now.  In my
experience, plans of the form "duplicate code now, we can always
consolidate it later", while usually well-meaning, often don't work out
because people get busy and "later" becomes "never".  Our current code base
seems to exhibit a lot of signs of that effect--that's why a lot of state
atoms have separate (but nearly identical) vs and fs versions, or separate
(but usually somewhat less identical) gen4-5, gen6, and gen7 versions.
Some state atoms are even split both ways.

The situation is going to get a lot worse when we add gen8, geometry
shaders, tessellation control shaders, and tessellation evaluation shaders
to the mix.  I'd prefer to shift from a "duplicate code now, consolidate
later" strategy to a "generalize the code first, then add the new cases",
as I've done here.  If we can't do that, I would at least like to have an
idea of what consolidation we are planning to do in the near future so that
we don't just perpetuate the code duplication status quo.

Incidentally, I can think of a few ways I could have made the code reuse a
lot cleaner using C++, however I am still gun-shy after how negatively my
use of C++ in blorp was received.  Here are some examples.

In C++, we could make a base class to represent the first 4 (shared) DWORDs
of the 3DSTATE_{VS,GS} packets, and derived classes to represent the
remaining DWORDS, like this (to simplify the discussion, let's assume that
the stage is always active):

class gen7_3dstate_vec4
{
public:
  init(struct brw_context *brw, unsigned opcode, unsigned size,
       const struct brw_vec4_context_base *vec4_ctx,
       const struct brw_vec4_prog_data *prog_data)
  {
    dw0 = opcode << 16 | (size - 2);
    dw1 = vec4_ctx->prog_offset;
    dw2 = (ALIGN(vec4_ctx->sampler_count, 4)/4) <<
      GEN6_SAMPLER_COUNT_SHIFT;
    dw3 = ffs(prog_data->total_scratch) - 11;
    drm_intel_bo_emit_reloc(...dw3...);
  }

protected:
  unsigned dw0;
  unsigned dw1;
  unsigned dw2;
  unsigned dw3;
};

class gen7_3dstate_vs : public gen7_3dstate_vec4
{
public:
  init(struct brw_context *brw)
  {
    const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;
    const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;
    gen7_3dstate_vec4::init(brw, _3DSTATE_VS, 6, vec4_ctx, prog_data);
    if (brw->ctx.Shader.CurrentVertexProgram == NULL)
      dw2 |= GEN6_FLOATING_POINT_MODE_ALT;
    dw4 = (prog_data->dispatch_grf_start_reg <<
           GEN6_VS_DISPATCH_START_GRF_SHIFT) |
      (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
      (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT);
    dw5 = ((brw->max_vs_threads - 1) << max_threads_shift) |
      GEN6_VS_STATISTICS_ENABLE |
      GEN6_VS_ENABLE;
  }

protected:
   unsigned dw4;
   unsigned dw5;
};

Then upload_vs_state() would shrink down to:

static void
upload_vs_state(struct brw_context *brw)
{
   /* For simplicity assume we only have to output the 3DSTATE_VS command */
   void *p = allocate sizeof(gen7_3dstate_vs) bytes of space in the batch;
   gen7_3dstate_vs *cmd = new(p) gen7_3dstate_vs();
   cmd->init(brw);
   ADVANCE_BATCH();
}

A further improvement could be made if state atoms used a C++ dispatch
mechanism like this:

class brw_tracked_state {
public:
  virtual void emit(struct brw_context *brw) const = 0;
  /* leaving out dirty bits for simplicity */
};

In that case, I wouldn't have had to make a separate
gen7_vec4_upload_params struct; the extra parameters could have simply
lived in the derived class for the state atom.

Using a C++ dispatch mechanism for brw_tracked_state would also open up
more possible ways to share code between state atoms.  Here's an example
that leverages the gen7_3dstate_vs class above:

template<class command>
class gen7_vec4_state
{
public:
  virtual void emit(struct brw_context *brw) const
  {
    void *p = allocate sizeof(command) bytes of space in the batch;
    command *cmd = new(p) command();
    cmd->init(brw);
    ADVANCE_BATCH();
  }
};

With the above template, we wouldn't need separate upload_vs_state and
upload_gs_state functions.  We would simply instantiate:

gen7_vec4_state<gen7_3dstate_vs> gen7_vs_state;
gen7_vec4_state<gen7_3dstate_gs> gen7_gs_state;

and the compiler would generate the appropriate emit() functions for us.
With suitable use of inline functions, there's no reason this technique
would be any less performant than what we have now, and IMHO it would be
much easier to extend and maintain.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130829/366e4749/attachment-0001.html>