[Mesa-dev] [PATCH] i965: Compute VS attribute WA bits earlier and check if they changed.

Thu Dec 4 20:42:44 PST 2014

On 12/03/2014 05:42 PM, Kenneth Graunke wrote:
> BRW_NEW_VERTICES is flagged every time we draw a primitive.  Having
> the brw_vs_prog atom depend on BRW_NEW_VERTICES meant that we had to
> compute the VS program key and do a program cache lookup for every
> single primitive.  This is painfully expensive.
> 
> The workaround bit computation is almost entirely based on the vertex
> attribute arrays (brw->vb.inputs[i]), which are set by brw_merge_inputs.
> The only thing it uses the VS program for is to see which VS inputs are
> actually read.  brw_merge_inputs() happens once per primitive, and can
> safely look at the currently bound vertex program, as it doesn't change
> in the middle of a draw.
> 
> This patch moves the workaround bit computation to brw_merge_inputs(),
> right after assigning brw->vb.inputs[i], and stores the previous WA bit
> values in the context.  If they've actually changed from the last draw
> (which is uncommon), we signal that we need a new vertex program,
> causing brw_vs_prog to compute a new key.
> 
> Logically, it might make sense to create a BRW_NEW_VS_WORKAROUND_BITS
> state flag, but I chose to use BRW_NEW_VERTEX_PROGRAM since it should
> work just as well, and already exists.
> 
> Improves performance in Gl32Batch7 by 15.5728% +/- 1.96978% on my
> Haswell GT3e system.  I'm told that Baytrail shows similar gains.

I got 9.08476% +/- 0.322829% (n=40) for 32-bit and 10.1315% +/-
0.315496% for 64-bit on my BYT using Fedora 20's Mesa CFLAGS.

Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>

> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h |  8 ++++++
>  src/mesa/drivers/dri/i965/brw_draw.c    | 46 +++++++++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_vs.c      | 42 +++---------------------------
>  3 files changed, 58 insertions(+), 38 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index ec4b3dd..ace29df 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1129,6 +1129,14 @@ struct brw_context
>         * the same VB packed over and over again.
>         */
>        unsigned int start_vertex_bias;
> +
> +      /**
> +       * Certain vertex attribute formats aren't natively handled by the
> +       * hardware and require special VS code to fix up their values.
> +       *
> +       * These bitfields indicate which workarounds are needed.
> +       */
> +      uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
>     } vb;
>  
>     struct {
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
> index 4c2802a..d2ea085 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -46,6 +46,7 @@
>  #include "brw_defines.h"
>  #include "brw_context.h"
>  #include "brw_state.h"
> +#include "brw_vs.h"
>  
>  #include "intel_batchbuffer.h"
>  #include "intel_buffers.h"
> @@ -281,6 +282,7 @@ static void brw_emit_prim(struct brw_context *brw,
>  static void brw_merge_inputs( struct brw_context *brw,
>  		       const struct gl_client_array *arrays[])
>  {
> +   const struct gl_context *ctx = &brw->ctx;
>     GLuint i;
>  
>     for (i = 0; i < brw->vb.nr_buffers; i++) {
> @@ -293,6 +295,50 @@ static void brw_merge_inputs( struct brw_context *brw,
>        brw->vb.inputs[i].buffer = -1;
>        brw->vb.inputs[i].glarray = arrays[i];
>     }
> +
> +   if (brw->gen < 8 && !brw->is_haswell) {
> +      struct gl_program *vp = &ctx->VertexProgram._Current->Base;
> +      /* Prior to Haswell, the hardware can't natively support GL_FIXED or
> +       * 2_10_10_10_REV vertex formats.  Set appropriate workaround flags.
> +       */
> +      for (i = 0; i < VERT_ATTRIB_MAX; i++) {
> +         if (!(vp->InputsRead & BITFIELD64_BIT(i)))
> +            continue;
> +
> +         uint8_t wa_flags = 0;
> +
> +         switch (brw->vb.inputs[i].glarray->Type) {
> +
> +         case GL_FIXED:
> +            wa_flags = brw->vb.inputs[i].glarray->Size;
> +            break;
> +
> +         case GL_INT_2_10_10_10_REV:
> +            wa_flags |= BRW_ATTRIB_WA_SIGN;
> +            /* fallthough */
> +
> +         case GL_UNSIGNED_INT_2_10_10_10_REV:
> +            if (brw->vb.inputs[i].glarray->Format == GL_BGRA)
> +               wa_flags |= BRW_ATTRIB_WA_BGRA;
> +
> +            if (brw->vb.inputs[i].glarray->Normalized)
> +               wa_flags |= BRW_ATTRIB_WA_NORMALIZE;
> +            else if (!brw->vb.inputs[i].glarray->Integer)
> +               wa_flags |= BRW_ATTRIB_WA_SCALE;
> +
> +            break;
> +         }
> +
> +         if (brw->vb.attrib_wa_flags[i] != wa_flags) {
> +            brw->vb.attrib_wa_flags[i] = wa_flags;
> +            /* New workaround flags means we need to compute a new program
> +             * key and look up the associated vertex shader.  Just flag that
> +             * we have a new shader program, since it triggers that effect.
> +             */
> +            brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
> +         }
> +      }
> +   }
>  }
>  
>  /**
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
> index 2f628e5..6dbac1e 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs.c
> @@ -449,42 +449,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
>     brw_populate_sampler_prog_key_data(ctx, prog, brw->vs.base.sampler_count,
>                                        &key.base.tex);
>  
> -   /* BRW_NEW_VERTICES */
> -   if (brw->gen < 8 && !brw->is_haswell) {
> -      /* Prior to Haswell, the hardware can't natively support GL_FIXED or
> -       * 2_10_10_10_REV vertex formats.  Set appropriate workaround flags.
> -       */
> -      for (i = 0; i < VERT_ATTRIB_MAX; i++) {
> -         if (!(vp->program.Base.InputsRead & BITFIELD64_BIT(i)))
> -            continue;
> -
> -         uint8_t wa_flags = 0;
> -
> -         switch (brw->vb.inputs[i].glarray->Type) {
> -
> -         case GL_FIXED:
> -            wa_flags = brw->vb.inputs[i].glarray->Size;
> -            break;
> -
> -         case GL_INT_2_10_10_10_REV:
> -            wa_flags |= BRW_ATTRIB_WA_SIGN;
> -            /* fallthough */
> -
> -         case GL_UNSIGNED_INT_2_10_10_10_REV:
> -            if (brw->vb.inputs[i].glarray->Format == GL_BGRA)
> -               wa_flags |= BRW_ATTRIB_WA_BGRA;
> -
> -            if (brw->vb.inputs[i].glarray->Normalized)
> -               wa_flags |= BRW_ATTRIB_WA_NORMALIZE;
> -            else if (!brw->vb.inputs[i].glarray->Integer)
> -               wa_flags |= BRW_ATTRIB_WA_SCALE;
> -
> -            break;
> -         }
> -
> -         key.gl_attrib_wa_flags[i] = wa_flags;
> -      }
> -   }
> +   /* BRW_NEW_VERTEX_PROGRAM */
> +   memcpy(key.gl_attrib_wa_flags, brw->vb.attrib_wa_flags,
> +          sizeof(brw->vb.attrib_wa_flags));
>  
>     if (!brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG,
>  			 &key, sizeof(key),
> @@ -521,8 +488,7 @@ const struct brw_tracked_state brw_vs_prog = {
>                 _NEW_POLYGON |
>                 _NEW_TEXTURE |
>                 _NEW_TRANSFORM,
> -      .brw   = BRW_NEW_VERTEX_PROGRAM |
> -               BRW_NEW_VERTICES,
> +      .brw   = BRW_NEW_VERTEX_PROGRAM,
>     },
>     .emit = brw_upload_vs_prog
>  };
>