[Mesa-dev] [PATCH 8/7] i965: Accurately bail on SIMD16 compiles.

Wed Mar 12 02:37:29 PDT 2014

This patch is:

Reviewed-by: Chris Forbes <chrisf at ijw.co.nz>

On Wed, Mar 12, 2014 at 9:28 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> Ideally, we'd like to never even attempt the SIMD16 compile if we could
> know ahead of time that it won't succeed---it's purely a waste of time.
> This is especially important for state-based recompiles, which happen at
> draw time.
>
> The fragment shader compiler has a number of checks like:
>
>    if (dispatch_width == 16)
>       fail("...some reason...");
>
> This patch introduces a new no16() function which replaces the above
> pattern.  In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
> Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
> issue a helpful performance warning if INTEL_DEBUG=perf is set.  (In
> SIMD16 mode, no16() calls fail(), for safety's sake.)
>
> The great part is that this is not a heuristic---if the flag is set, we
> know with 100% certainty that the SIMD16 compile would fail.  (It might
> fail anyway if we run out of registers, but it's always worth trying.)
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp         | 69 +++++++++++++++++++++++-----
>  src/mesa/drivers/dri/i965/brw_fs.h           |  4 ++
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 44 +++++++++---------
>  3 files changed, 83 insertions(+), 34 deletions(-)
>
> I forgot to send this one out...it applies on top of the previous 7 patches.
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 62848be..9ad80c5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -647,9 +647,8 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
>  }
>
>  void
> -fs_visitor::fail(const char *format, ...)
> +fs_visitor::vfail(const char *format, va_list va)
>  {
> -   va_list va;
>     char *msg;
>
>     if (failed)
> @@ -657,9 +656,7 @@ fs_visitor::fail(const char *format, ...)
>
>     failed = true;
>
> -   va_start(va, format);
>     msg = ralloc_vasprintf(mem_ctx, format, va);
> -   va_end(va);
>     msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
>
>     this->fail_msg = msg;
> @@ -669,6 +666,49 @@ fs_visitor::fail(const char *format, ...)
>     }
>  }
>
> +void
> +fs_visitor::fail(const char *format, ...)
> +{
> +   va_list va;
> +
> +   va_start(va, format);
> +   vfail(format, va);
> +   va_end(va);
> +}
> +
> +/**
> + * Mark this program as impossible to compile in SIMD16 mode.
> + *
> + * During the SIMD8 compile (which happens first), we can detect and flag
> + * things that are unsupported in SIMD16 mode, so the compiler can skip
> + * the SIMD16 compile altogether.
> + *
> + * During a SIMD16 compile (if one happens anyway), this just calls fail().
> + */
> +void
> +fs_visitor::no16(const char *format, ...)
> +{
> +   va_list va;
> +
> +   va_start(va, format);
> +
> +   if (dispatch_width == 16) {
> +      vfail(format, va);
> +      return;
> +   }
> +
> +   simd16_unsupported = true;
> +
> +   if (INTEL_DEBUG & DEBUG_PERF) {
> +      if (no16_msg)
> +         ralloc_vasprintf_append(&no16_msg, format, va);
> +      else
> +         no16_msg = ralloc_vasprintf(mem_ctx, format, va);
> +   }
> +
> +   va_end(va);
> +}
> +
>  fs_inst *
>  fs_visitor::emit(enum opcode opcode)
>  {
> @@ -1356,8 +1396,8 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
>     switch (opcode) {
>     case SHADER_OPCODE_INT_QUOTIENT:
>     case SHADER_OPCODE_INT_REMAINDER:
> -      if (brw->gen >= 7 && dispatch_width == 16)
> -        fail("SIMD16 INTDIV unsupported\n");
> +      if (brw->gen >= 7)
> +        no16("SIMD16 INTDIV unsupported\n");
>        break;
>     case SHADER_OPCODE_POW:
>        break;
> @@ -3504,13 +3544,18 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
>     exec_list *simd16_instructions = NULL;
>     fs_visitor v2(brw, c, prog, fp, 16);
>     if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
> -      /* Try a SIMD16 compile */
> -      v2.import_uniforms(&v);
> -      if (!v2.run()) {
> -         perf_debug("SIMD16 shader failed to compile, falling back to "
> -                    "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
> +      if (!v.simd16_unsupported) {
> +         /* Try a SIMD16 compile */
> +         v2.import_uniforms(&v);
> +         if (!v2.run()) {
> +            perf_debug("SIMD16 shader failed to compile, falling back to "
> +                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
> +         } else {
> +            simd16_instructions = &v2.instructions;
> +         }
>        } else {
> -         simd16_instructions = &v2.instructions;
> +         perf_debug("SIMD16 shader unsupported, falling back to "
> +                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
>        }
>     }
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 9de1f3a..0d064f6 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -381,7 +381,9 @@ public:
>     void insert_gen4_send_dependency_workarounds();
>     void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
>     void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
> +   void vfail(const char *msg, va_list args);
>     void fail(const char *msg, ...);
> +   void no16(const char *msg, ...);
>     void lower_uniform_pull_constant_loads();
>
>     void push_force_uncompressed();
> @@ -541,6 +543,8 @@ public:
>
>     bool failed;
>     char *fail_msg;
> +   bool simd16_unsupported;
> +   char *no16_msg;
>
>     /* Result of last visit() method. */
>     fs_reg result;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index cd90e23..a832268 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -462,8 +462,8 @@ fs_visitor::visit(ir_expression *ir)
>           * FINISHME: Emit just the MUL if we know an operand is small
>           * enough.
>           */
> -        if (brw->gen >= 7 && dispatch_width == 16)
> -           fail("SIMD16 explicit accumulator operands unsupported\n");
> +        if (brw->gen >= 7)
> +           no16("SIMD16 explicit accumulator operands unsupported\n");
>
>          struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
>
> @@ -475,8 +475,8 @@ fs_visitor::visit(ir_expression *ir)
>        }
>        break;
>     case ir_binop_imul_high: {
> -      if (brw->gen >= 7 && dispatch_width == 16)
> -         fail("SIMD16 explicit accumulator operands unsupported\n");
> +      if (brw->gen >= 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
>
>        struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
>
> @@ -490,8 +490,8 @@ fs_visitor::visit(ir_expression *ir)
>        emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
>        break;
>     case ir_binop_carry: {
> -      if (brw->gen >= 7 && dispatch_width == 16)
> -         fail("SIMD16 explicit accumulator operands unsupported\n");
> +      if (brw->gen >= 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
>
>        struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
>
> @@ -500,8 +500,8 @@ fs_visitor::visit(ir_expression *ir)
>        break;
>     }
>     case ir_binop_borrow: {
> -      if (brw->gen >= 7 && dispatch_width == 16)
> -         fail("SIMD16 explicit accumulator operands unsupported\n");
> +      if (brw->gen >= 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
>
>        struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
>
> @@ -1290,8 +1290,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
>        next.reg_offset++;
>        break;
>     case ir_txd: {
> -      if (dispatch_width == 16)
> -        fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
> +      no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
>
>        /* Load dPdx and the coordinate together:
>         * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
> @@ -1364,8 +1363,8 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
>        break;
>     case ir_tg4:
>        if (has_nonconstant_offset) {
> -         if (ir->shadow_comparitor && dispatch_width == 16)
> -            fail("Gen7 does not support gather4_po_c in SIMD16 mode.");
> +         if (ir->shadow_comparitor)
> +            no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
>
>           /* More crazy intermixing */
>           ir->offset->accept(this);
> @@ -1464,8 +1463,8 @@ fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
>          0
>        };
>
> +      no16("rectangle scale uniform setup not supported on SIMD16\n");
>        if (dispatch_width == 16) {
> -        fail("rectangle scale uniform setup not supported on SIMD16\n");
>          return coordinate;
>        }
>
> @@ -2183,8 +2182,8 @@ fs_visitor::try_replace_with_sel()
>  void
>  fs_visitor::visit(ir_if *ir)
>  {
> -   if (brw->gen < 6 && dispatch_width == 16) {
> -      fail("Can't support (non-uniform) control flow on SIMD16\n");
> +   if (brw->gen < 6) {
> +      no16("Can't support (non-uniform) control flow on SIMD16\n");
>     }
>
>     /* Don't point the annotation at the if statement, because then it plus
> @@ -2226,8 +2225,8 @@ fs_visitor::visit(ir_if *ir)
>  void
>  fs_visitor::visit(ir_loop *ir)
>  {
> -   if (brw->gen < 6 && dispatch_width == 16) {
> -      fail("Can't support (non-uniform) control flow on SIMD16\n");
> +   if (brw->gen < 6) {
> +      no16("Can't support (non-uniform) control flow on SIMD16\n");
>     }
>
>     this->base_ir = NULL;
> @@ -2725,9 +2724,10 @@ fs_visitor::emit_fb_writes()
>     bool do_dual_src = this->dual_src_output.file != BAD_FILE;
>     bool src0_alpha_to_render_target = false;
>
> -   if (dispatch_width == 16 && do_dual_src) {
> -      fail("GL_ARB_blend_func_extended not yet supported in SIMD16.");
> -      do_dual_src = false;
> +   if (do_dual_src) {
> +      no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
> +      if (dispatch_width == 16)
> +         do_dual_src = false;
>     }
>
>     /* From the Sandy Bridge PRM, volume 4, page 198:
> @@ -2778,13 +2778,13 @@ fs_visitor::emit_fb_writes()
>        nr += reg_width;
>
>     if (c->source_depth_to_render_target) {
> -      if (brw->gen == 6 && dispatch_width == 16) {
> +      if (brw->gen == 6) {
>          /* For outputting oDepth on gen6, SIMD8 writes have to be
>           * used.  This would require SIMD8 moves of each half to
>           * message regs, kind of like pre-gen5 SIMD16 FB writes.
>           * Just bail on doing so for now.
>           */
> -        fail("Missing support for simd16 depth writes on gen6\n");
> +        no16("Missing support for simd16 depth writes on gen6\n");
>        }
>
>        if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
> --
> 1.9.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev