[Mesa-dev] [PATCH v2 16/16] i965: Generate vs code using scalar backend for BDW+

Mon Dec 1 16:36:25 PST 2014

On Fri, Nov 14, 2014 at 3:22 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> On Thursday, November 13, 2014 04:28:22 PM Kristian Høgsberg wrote:
>> With everything in place, we can now use the scalar backend compiler for
>> vertex shaders on BDW+.  We make scalar vertex shaders the default on
>> BDW+ but add a new vec4vs debug option to force the vec4 backend.
>>
>> No piglit regressions.
>>
>> Performance impact is minimal, I see a ~1.5 improvement on the T-Rex
>> GLBenchmark case, but in general it's in the noise.  Some of our
>> internal synthetic, vs bounded benchmarks show great improvement, 20%-40%
>> in some cases, but real-world cases are mostly unaffected.
>>
>> Signed-off-by: Kristian Høgsberg <krh at bitplanet.net>
>> ---
>>  src/mesa/drivers/dri/i965/brw_context.c  | 13 +++++++
>>  src/mesa/drivers/dri/i965/brw_context.h  |  1 +
>>  src/mesa/drivers/dri/i965/brw_shader.cpp | 17 +++++++--
>>  src/mesa/drivers/dri/i965/brw_vec4.cpp   | 60 +++++++++++++++++++++++++-------
>>  src/mesa/drivers/dri/i965/intel_debug.c  |  1 +
>>  src/mesa/drivers/dri/i965/intel_debug.h  |  1 +
>>  6 files changed, 78 insertions(+), 15 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
>> index e1a994a..f56cfb2 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.c
>> +++ b/src/mesa/drivers/dri/i965/brw_context.c
>> @@ -557,6 +557,15 @@ brw_initialize_context_constants(struct brw_context *brw)
>>     ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
>>     ctx->Const.ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
>>
>> +   if (brw->scalar_vs) {
>> +      /* If we're using the scalar backend for vertex shaders, we need to
>> +       * configure these accordingly.
>> +       */
>> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
>> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
>> +      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = false;
>
> This is concerning.  Any shaders that use variable indexing of temporary
> or output arrays will now get giant IF-ladders, which is horribly inefficient.
> It seems like a regression in terms of functionality.
>
> I don't know how important indirect output support is for VS, or how hard
> that would be.  If we really support it in the 4x2 backend, I can't imagine
> it'd be that hard in scalar one.
>
> Supporting indirect access of temp arrays by spilling to scratch (like the
> 4x2 backend does) should be trivial, and would improve the fragment shader
> backend considerably as well.  I'd like to see that happen before this lands.
>
> OptimizeForAOS begs the question..."If we ever fall back to SIMD4x2, how do
> we set different options?"  I think we should ignore that problem.  The
> EmitNoIndirect* flags can be fixed by adding support to the scalar backend.
> The only things we lose from OptimizeForAOS are Matt's vectorizer pass
> (which I think we should run regardless of this flag), and the matrix
> flipper (which isn't very important).

The options here are the same as we set for the FS backend to match
the expectation of the fs_visitor.  That seems like a minimal starting
point for this feature.  While we may loose some optimizations when
switching to scalar, we also gain new ones that are not in the vec4
backend.  As I mention in the preface to this series, I never really
saw significant gains from this series, but in all the benchmarking I
did, I also didn't see any performance regressions.  There's room for
improvement and no performance regressions. If we find a serious
performance problem we can fix the issues above or flip the sense of
the vec4vs debug bit.

>> +   }
>> +
>>     /* ARB_viewport_array */
>>     if (brw->gen >= 7 && ctx->API == API_OPENGL_CORE) {
>>        ctx->Const.MaxViewports = GEN7_NUM_VIEWPORTS;
>> @@ -755,6 +764,10 @@ brwCreateContext(gl_api api,
>>
>>     brw_process_driconf_options(brw);
>>     brw_process_intel_debug_variable(brw);
>> +
>> +   if (brw->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
>> +      brw->scalar_vs = true;
>> +
>>     brw_initialize_context_constants(brw);
>>
>>     ctx->Const.ResetStrategy = notify_reset
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>> index 463f3d2..f198103 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -1067,6 +1067,7 @@ struct brw_context
>>     bool has_pln;
>>     bool no_simd8;
>>     bool use_rep_send;
>> +   bool scalar_vs;
>>
>>     /**
>>      * Some versions of Gen hardware don't do centroid interpolation correctly
>> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
>> index 3c78afd..26da729 100644
>> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
>> @@ -71,6 +71,19 @@ brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
>>     return true;
>>  }
>>
>> +static inline bool
>> +is_scalar_shader_stage(struct brw_context *brw, int stage)
>> +{
>> +   switch (stage) {
>> +   case MESA_SHADER_FRAGMENT:
>> +      return true;
>> +   case MESA_SHADER_VERTEX:
>> +      return brw->scalar_vs;
>> +   default:
>> +      return false;
>> +   }
>> +}
>> +
>>  static void
>>  brw_lower_packing_builtins(struct brw_context *brw,
>>                             gl_shader_stage shader_type,
>> @@ -91,7 +104,7 @@ brw_lower_packing_builtins(struct brw_context *brw,
>>         * lowering is needed. For SOA code, the Half2x16 ops must be
>>         * scalarized.
>>         */
>> -      if (shader_type == MESA_SHADER_FRAGMENT) {
>> +      if (is_scalar_shader_stage(brw, shader_type)) {
>>           ops |= LOWER_PACK_HALF_2x16_TO_SPLIT
>>               |  LOWER_UNPACK_HALF_2x16_TO_SPLIT;
>>        }
>> @@ -179,7 +192,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
>>        do {
>>        progress = false;
>>
>> -      if (stage == MESA_SHADER_FRAGMENT) {
>> +      if (is_scalar_shader_stage(brw, stage)) {
>>           brw_do_channel_expressions(shader->base.ir);
>>           brw_do_vector_splitting(shader->base.ir);
>>        }
>> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> index 280db47..3e9cc23 100644
>> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> @@ -22,6 +22,7 @@
>>   */
>>
>>  #include "brw_vec4.h"
>> +#include "brw_fs.h"
>>  #include "brw_cfg.h"
>>  #include "brw_vs.h"
>>  #include "brw_dead_control_flow.h"
>> @@ -1863,6 +1864,7 @@ brw_vs_emit(struct brw_context *brw,
>>  {
>>     bool start_busy = false;
>>     double start_time = 0;
>> +   const unsigned *assembly = NULL;
>>
>>     if (unlikely(brw->perf_debug)) {
>>        start_busy = (brw->batch.last_bo &&
>> @@ -1877,23 +1879,55 @@ brw_vs_emit(struct brw_context *brw,
>>     if (unlikely(INTEL_DEBUG & DEBUG_VS))
>>        brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
>>
>> -   vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
>> -   if (!v.run()) {
>> -      if (prog) {
>> -         prog->LinkStatus = false;
>> -         ralloc_strcat(&prog->InfoLog, v.fail_msg);
>> -      }
>> +   if (prog && brw->gen >= 8 && brw->scalar_vs) {
>> +      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
>> +      if (!v.run_vs()) {
>> +         if (prog) {
>> +            prog->LinkStatus = false;
>> +            ralloc_strcat(&prog->InfoLog, v.fail_msg);
>> +         }
>>
>> -      _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
>> -                    v.fail_msg);
>> +         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
>> +                       v.fail_msg);
>>
>> -      return NULL;
>> +         return NULL;
>> +      }
>> +
>> +      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
>> +                     &c->vp->program.Base, v.runtime_check_aads_emit);
>> +      if (INTEL_DEBUG & DEBUG_VS) {
>> +         char *name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
>> +                                      prog->Label ? prog->Label : "unnamed",
>> +                                      prog->Name);
>> +         g.enable_debug(name);
>> +      }
>> +      assembly = g.generate_assembly(v.cfg, NULL,
>> +                                     final_assembly_size);
>
> This doesn't compile.  You deleted the generate_assembly method in patch 2.

Yeah, fixed that locally.  I'll send out v3 once I finish the piglit run.
>
>> +      if (brw->gen < 8)
>> +         assembly = NULL;
>
> Dead code.  You're in a brw->gen >= 8 block.

Fixed.

>> +      if (assembly)
>> +         prog_data->base.simd8 = true;
>> +      c->base.last_scratch = v.last_scratch;
>>     }
>
> It looks like you're using scalar mode 100% of the time.  I think we should
> consider falling back to SIMD4x2 if we spill.  I also think that it may be
> required in order to support the maximum number of attributes or varyings,
> but I'd have to double check that...

Yes, I didn't write this with the expectation that we'd fall back on
gen8+.  I guess I'll run this on shader-db and see how many vertex
shaders start spilling, but at least the max attribute limit isn't a
problem.

Kristian

>> -   const unsigned *assembly = NULL;
>> -   vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
>> -                    mem_ctx, INTEL_DEBUG & DEBUG_VS);
>> -   assembly = g.generate_assembly(v.cfg, final_assembly_size);
>> +   if (!assembly) {
>> +      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
>> +      if (!v.run()) {
>> +         if (prog) {
>> +            prog->LinkStatus = false;
>> +            ralloc_strcat(&prog->InfoLog, v.fail_msg);
>> +         }
>> +
>> +         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
>> +                       v.fail_msg);
>> +
>> +         return NULL;
>> +      }
>> +
>> +      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
>> +                       mem_ctx, INTEL_DEBUG & DEBUG_VS);
>> +      assembly = g.generate_assembly(v.cfg, final_assembly_size);
>> +   }
>>
>>     if (unlikely(brw->perf_debug) && shader) {
>>        if (shader->compiled_once) {
>> diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
>> index a283357..6e729b5 100644
>> --- a/src/mesa/drivers/dri/i965/intel_debug.c
>> +++ b/src/mesa/drivers/dri/i965/intel_debug.c
>> @@ -67,6 +67,7 @@ static const struct dri_debug_control debug_control[] = {
>>     { "optimizer",   DEBUG_OPTIMIZER },
>>     { "noann",       DEBUG_NO_ANNOTATION },
>>     { "no8",         DEBUG_NO8 },
>> +   { "vec4vs",      DEBUG_VEC4VS },
>>     { NULL,    0 }
>>  };
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
>> index e859be1..2f20616 100644
>> --- a/src/mesa/drivers/dri/i965/intel_debug.h
>> +++ b/src/mesa/drivers/dri/i965/intel_debug.h
>> @@ -63,6 +63,7 @@ extern uint64_t INTEL_DEBUG;
>>  #define DEBUG_OPTIMIZER           (1 << 27)
>>  #define DEBUG_NO_ANNOTATION       (1 << 28)
>>  #define DEBUG_NO8                 (1 << 29)
>> +#define DEBUG_VEC4VS              (1 << 30)
>>
>>  #ifdef HAVE_ANDROID_PLATFORM
>>  #define LOG_TAG "INTEL-MESA"