[Mesa-dev] [PATCH v2 16/16] i965: Generate vs code using scalar backend for BDW+

Kenneth Graunke kenneth at whitecape.org
Fri Nov 14 15:22:39 PST 2014


On Thursday, November 13, 2014 04:28:22 PM Kristian Høgsberg wrote:
> With everything in place, we can now use the scalar backend compiler for
> vertex shaders on BDW+.  We make scalar vertex shaders the default on
> BDW+ but add a new vec4vs debug option to force the vec4 backend.
> 
> No piglit regressions.
> 
> Performance impact is minimal, I see a ~1.5 improvement on the T-Rex
> GLBenchmark case, but in general it's in the noise.  Some of our
> internal synthetic, vs bounded benchmarks show great improvement, 20%-40%
> in some cases, but real-world cases are mostly unaffected.
> 
> Signed-off-by: Kristian Høgsberg <krh at bitplanet.net>
> ---
>  src/mesa/drivers/dri/i965/brw_context.c  | 13 +++++++
>  src/mesa/drivers/dri/i965/brw_context.h  |  1 +
>  src/mesa/drivers/dri/i965/brw_shader.cpp | 17 +++++++--
>  src/mesa/drivers/dri/i965/brw_vec4.cpp   | 60 +++++++++++++++++++++++++-------
>  src/mesa/drivers/dri/i965/intel_debug.c  |  1 +
>  src/mesa/drivers/dri/i965/intel_debug.h  |  1 +
>  6 files changed, 78 insertions(+), 15 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index e1a994a..f56cfb2 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -557,6 +557,15 @@ brw_initialize_context_constants(struct brw_context *brw)
>     ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
>     ctx->Const.ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
>  
> +   if (brw->scalar_vs) {
> +      /* If we're using the scalar backend for vertex shaders, we need to
> +       * configure these accordingly.
> +       */
> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = false;

This is concerning.  Any shaders that use variable indexing of temporary
or output arrays will now get giant IF-ladders, which is horribly inefficient.
It seems like a regression in terms of functionality.

I don't know how important indirect output support is for VS, or how hard
that would be.  If we really support it in the 4x2 backend, I can't imagine
it'd be that hard in scalar one.

Supporting indirect access of temp arrays by spilling to scratch (like the
4x2 backend does) should be trivial, and would improve the fragment shader
backend considerably as well.  I'd like to see that happen before this lands.

OptimizeForAOS begs the question..."If we ever fall back to SIMD4x2, how do
we set different options?"  I think we should ignore that problem.  The
EmitNoIndirect* flags can be fixed by adding support to the scalar backend.
The only things we lose from OptimizeForAOS are Matt's vectorizer pass
(which I think we should run regardless of this flag), and the matrix
flipper (which isn't very important).

> +   }
> +
>     /* ARB_viewport_array */
>     if (brw->gen >= 7 && ctx->API == API_OPENGL_CORE) {
>        ctx->Const.MaxViewports = GEN7_NUM_VIEWPORTS;
> @@ -755,6 +764,10 @@ brwCreateContext(gl_api api,
>  
>     brw_process_driconf_options(brw);
>     brw_process_intel_debug_variable(brw);
> +
> +   if (brw->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
> +      brw->scalar_vs = true;
> +
>     brw_initialize_context_constants(brw);
>  
>     ctx->Const.ResetStrategy = notify_reset
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 463f3d2..f198103 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1067,6 +1067,7 @@ struct brw_context
>     bool has_pln;
>     bool no_simd8;
>     bool use_rep_send;
> +   bool scalar_vs;
>  
>     /**
>      * Some versions of Gen hardware don't do centroid interpolation correctly
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 3c78afd..26da729 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -71,6 +71,19 @@ brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
>     return true;
>  }
>  
> +static inline bool
> +is_scalar_shader_stage(struct brw_context *brw, int stage)
> +{
> +   switch (stage) {
> +   case MESA_SHADER_FRAGMENT:
> +      return true;
> +   case MESA_SHADER_VERTEX:
> +      return brw->scalar_vs;
> +   default:
> +      return false;
> +   }
> +}
> +
>  static void
>  brw_lower_packing_builtins(struct brw_context *brw,
>                             gl_shader_stage shader_type,
> @@ -91,7 +104,7 @@ brw_lower_packing_builtins(struct brw_context *brw,
>         * lowering is needed. For SOA code, the Half2x16 ops must be
>         * scalarized.
>         */
> -      if (shader_type == MESA_SHADER_FRAGMENT) {
> +      if (is_scalar_shader_stage(brw, shader_type)) {
>           ops |= LOWER_PACK_HALF_2x16_TO_SPLIT
>               |  LOWER_UNPACK_HALF_2x16_TO_SPLIT;
>        }
> @@ -179,7 +192,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
>        do {
>  	 progress = false;
>  
> -	 if (stage == MESA_SHADER_FRAGMENT) {
> +	 if (is_scalar_shader_stage(brw, stage)) {
>  	    brw_do_channel_expressions(shader->base.ir);
>  	    brw_do_vector_splitting(shader->base.ir);
>  	 }
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index 280db47..3e9cc23 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -22,6 +22,7 @@
>   */
>  
>  #include "brw_vec4.h"
> +#include "brw_fs.h"
>  #include "brw_cfg.h"
>  #include "brw_vs.h"
>  #include "brw_dead_control_flow.h"
> @@ -1863,6 +1864,7 @@ brw_vs_emit(struct brw_context *brw,
>  {
>     bool start_busy = false;
>     double start_time = 0;
> +   const unsigned *assembly = NULL;
>  
>     if (unlikely(brw->perf_debug)) {
>        start_busy = (brw->batch.last_bo &&
> @@ -1877,23 +1879,55 @@ brw_vs_emit(struct brw_context *brw,
>     if (unlikely(INTEL_DEBUG & DEBUG_VS))
>        brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
>  
> -   vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
> -   if (!v.run()) {
> -      if (prog) {
> -         prog->LinkStatus = false;
> -         ralloc_strcat(&prog->InfoLog, v.fail_msg);
> -      }
> +   if (prog && brw->gen >= 8 && brw->scalar_vs) {
> +      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
> +      if (!v.run_vs()) {
> +         if (prog) {
> +            prog->LinkStatus = false;
> +            ralloc_strcat(&prog->InfoLog, v.fail_msg);
> +         }
>  
> -      _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
> -                    v.fail_msg);
> +         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
> +                       v.fail_msg);
>  
> -      return NULL;
> +         return NULL;
> +      }
> +
> +      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
> +                     &c->vp->program.Base, v.runtime_check_aads_emit);
> +      if (INTEL_DEBUG & DEBUG_VS) {
> +         char *name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
> +                                      prog->Label ? prog->Label : "unnamed",
> +                                      prog->Name);
> +         g.enable_debug(name);
> +      }
> +      assembly = g.generate_assembly(v.cfg, NULL,
> +                                     final_assembly_size);

This doesn't compile.  You deleted the generate_assembly method in patch 2.

> +      if (brw->gen < 8)
> +         assembly = NULL;

Dead code.  You're in a brw->gen >= 8 block.

> +      if (assembly)
> +         prog_data->base.simd8 = true;
> +      c->base.last_scratch = v.last_scratch;
>     }

It looks like you're using scalar mode 100% of the time.  I think we should
consider falling back to SIMD4x2 if we spill.  I also think that it may be
required in order to support the maximum number of attributes or varyings,
but I'd have to double check that...

> -   const unsigned *assembly = NULL;
> -   vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
> -                    mem_ctx, INTEL_DEBUG & DEBUG_VS);
> -   assembly = g.generate_assembly(v.cfg, final_assembly_size);
> +   if (!assembly) {
> +      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
> +      if (!v.run()) {
> +         if (prog) {
> +            prog->LinkStatus = false;
> +            ralloc_strcat(&prog->InfoLog, v.fail_msg);
> +         }
> +
> +         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
> +                       v.fail_msg);
> +
> +         return NULL;
> +      }
> +
> +      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
> +                       mem_ctx, INTEL_DEBUG & DEBUG_VS);
> +      assembly = g.generate_assembly(v.cfg, final_assembly_size);
> +   }
>  
>     if (unlikely(brw->perf_debug) && shader) {
>        if (shader->compiled_once) {
> diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
> index a283357..6e729b5 100644
> --- a/src/mesa/drivers/dri/i965/intel_debug.c
> +++ b/src/mesa/drivers/dri/i965/intel_debug.c
> @@ -67,6 +67,7 @@ static const struct dri_debug_control debug_control[] = {
>     { "optimizer",   DEBUG_OPTIMIZER },
>     { "noann",       DEBUG_NO_ANNOTATION },
>     { "no8",         DEBUG_NO8 },
> +   { "vec4vs",      DEBUG_VEC4VS },
>     { NULL,    0 }
>  };
>  
> diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
> index e859be1..2f20616 100644
> --- a/src/mesa/drivers/dri/i965/intel_debug.h
> +++ b/src/mesa/drivers/dri/i965/intel_debug.h
> @@ -63,6 +63,7 @@ extern uint64_t INTEL_DEBUG;
>  #define DEBUG_OPTIMIZER           (1 << 27)
>  #define DEBUG_NO_ANNOTATION       (1 << 28)
>  #define DEBUG_NO8                 (1 << 29)
> +#define DEBUG_VEC4VS              (1 << 30)
>  
>  #ifdef HAVE_ANDROID_PLATFORM
>  #define LOG_TAG "INTEL-MESA"
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20141114/d3c69131/attachment.sig>


More information about the mesa-dev mailing list