[Mesa-dev] [PATCH] radeonsi: eliminate trivial constant VS outputs

Edmondo Tommasina edmondo.tommasina at gmail.com
Wed Oct 19 19:40:28 UTC 2016


Hi Marek

Tested-by: Edmondo Tommasina <edmondo.tommasina at gmail.com>

I tested the patch with Witcher 2.

Thanks
edmondo


On Tue, Oct 18, 2016 at 6:28 PM, Marek Olšák <maraeo at gmail.com> wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> These constant value VS PARAM exports:
> - 0,0,0,0
> - 0,0,0,1
> - 1,1,1,0
> - 1,1,1,1
> can be loaded into PS inputs using the DEFAULT_VAL field, and the VS exports
> can be removed from the IR to save export & parameter memory.
>
> After LLVM optimizations, analyze the IR to see which exports are equal to
> the ones listed above (or undef) and remove them if they are.
>
> Targeted use cases:
> - All DX9 eON ports always clear 10 VS outputs to 0.0 even if most of them
>   are unused by PS (such as Witcher 2 below).
> - VS output arrays with unused elements that the GLSL compiler can't
>   eliminate (such as Batman below).
>
> The shader-db deltas are quite interesting:
> (not from upstream si-report.py; it won't be upstreamed)
>
> PERCENTAGE DELTAS    Shaders PARAM exports (affected only)
> batman_arkham_origins    589  -67.17 %
> bioshock-infinite       1769   -0.47 %
> dirt-showdown            548   -2.68 %
> dota2                   1747   -3.36 %
> f1-2015                  776   -4.94 %
> left_4_dead_2           1762   -0.07 %
> metro_2033_redux        2670   -0.43 %
> portal                   474   -0.22 %
> talos_principle          324   -3.63 %
> warsow                   176   -2.20 %
> witcher2                1040  -73.78 %
> ----------------------------------------
> All affected             991  -65.37 %  ... 9681 -> 3353
> ----------------------------------------
> Total                  26725  -10.82 %  ... 58490 -> 52162
> ---
>  src/gallium/drivers/radeonsi/si_shader.c        | 154 ++++++++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_shader.h        |  11 ++
>  src/gallium/drivers/radeonsi/si_state_shaders.c |  17 ++-
>  3 files changed, 180 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index a361418..7fc1df4 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -6593,20 +6593,167 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
>         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
>         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
>         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
>
>         bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
>         bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
>         bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
>         bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
>  }
>
> +/* Return true if the PARAM export has been eliminated. */
> +static bool si_eliminate_const_output(struct si_shader_context *ctx,
> +                                     LLVMValueRef inst, unsigned offset)
> +{
> +       struct si_shader *shader = ctx->shader;
> +       unsigned num_outputs = shader->selector->info.num_outputs;
> +       double v[4];
> +       unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
> +
> +       for (i = 0; i < 4; i++) {
> +               LLVMBool loses_info;
> +               LLVMValueRef p = LLVMGetOperand(inst, 5 + i);
> +               if (!LLVMIsConstant(p))
> +                       return false;
> +
> +               /* It's a constant expression. Undef outputs are eliminated too. */
> +               if (LLVMIsUndef(p))
> +                       v[i] = 0;
> +               else
> +                       v[i] = LLVMConstRealGetDouble(p, &loses_info);
> +
> +               if (v[i] != 0 && v[i] != 1)
> +                       return false;
> +       }
> +
> +       /* Only certain combinations of 0 and 1 can be eliminated. */
> +       if (v[0] == 0 && v[1] == 0 && v[2] == 0)
> +               default_val = v[3] == 0 ? 0 : 1;
> +       else if (v[0] == 1 && v[1] == 1 && v[2] == 1)
> +               default_val = v[3] == 0 ? 2 : 3;
> +       else
> +               return false;
> +
> +       /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
> +       LLVMInstructionEraseFromParent(inst);
> +
> +       /* Change OFFSET to DEFAULT_VAL. */
> +       for (i = 0; i < num_outputs; i++) {
> +               if (shader->info.vs_output_param_offset[i] == offset) {
> +                       shader->info.vs_output_param_offset[i] =
> +                               EXP_PARAM_DEFAULT_VAL_0000 + default_val;
> +                       break;
> +               }
> +       }
> +       return true;
> +}
> +
> +struct si_vs_exports {
> +       unsigned num;
> +       unsigned offset[SI_MAX_VS_OUTPUTS];
> +       LLVMValueRef inst[SI_MAX_VS_OUTPUTS];
> +};
> +
> +static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
> +{
> +       struct si_shader *shader = ctx->shader;
> +       struct tgsi_shader_info *info = &shader->selector->info;
> +       LLVMBasicBlockRef bb;
> +       struct si_vs_exports exports;
> +       bool removed_any = false;
> +
> +       exports.num = 0;
> +
> +       if ((ctx->type == PIPE_SHADER_VERTEX &&
> +            (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
> +           (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.tes.as_es))
> +               return;
> +
> +       /* Process all LLVM instructions. */
> +       bb = LLVMGetFirstBasicBlock(ctx->radeon_bld.main_fn);
> +       while (bb) {
> +               LLVMValueRef inst = LLVMGetFirstInstruction(bb);
> +
> +               while (inst) {
> +                       LLVMValueRef cur = inst;
> +                       inst = LLVMGetNextInstruction(inst);
> +
> +                       if (LLVMGetInstructionOpcode(cur) != LLVMCall)
> +                               continue;
> +
> +                       LLVMValueRef callee = LLVMGetCalledValue(cur);
> +                       LLVMValueKind kind = LLVMGetValueKind(callee);
> +
> +                       if (kind != LLVMFunctionValueKind)
> +                               continue;
> +
> +                       const char *name = LLVMGetValueName(callee);
> +                       unsigned num_args = LLVMCountParams(callee);
> +
> +                       /* Check if this is an export instruction. */
> +                       if (num_args != 9 || strcmp(name, "llvm.SI.export"))
> +                               continue;
> +
> +                       LLVMValueRef arg = LLVMGetOperand(cur, 3);
> +                       unsigned target = LLVMConstIntGetZExtValue(arg);
> +
> +                       if (target < V_008DFC_SQ_EXP_PARAM)
> +                               continue;
> +
> +                       target -= V_008DFC_SQ_EXP_PARAM;
> +
> +                       /* Eliminate constant value PARAM exports. */
> +                       if (si_eliminate_const_output(ctx, cur, target)) {
> +                               removed_any = true;
> +                       } else {
> +                               exports.offset[exports.num] = target;
> +                               exports.inst[exports.num] = cur;
> +                               exports.num++;
> +                       }
> +               }
> +               bb = LLVMGetNextBasicBlock(bb);
> +       }
> +
> +       /* Remove holes in export memory due to removed PARAM exports.
> +        * This is done by renumbering all PARAM exports.
> +        */
> +       if (removed_any) {
> +               ubyte current_offset[SI_MAX_VS_OUTPUTS];
> +               unsigned new_count = 0;
> +               unsigned out, i;
> +
> +               /* Make a copy of the offsets. We need the old version while
> +                * we are modifying some of them. */
> +               assert(sizeof(current_offset) ==
> +                      sizeof(shader->info.vs_output_param_offset));
> +               memcpy(current_offset, shader->info.vs_output_param_offset,
> +                      sizeof(current_offset));
> +
> +               for (i = 0; i < exports.num; i++) {
> +                       unsigned offset = exports.offset[i];
> +
> +                       for (out = 0; out < info->num_outputs; out++) {
> +                               if (current_offset[out] != offset)
> +                                       continue;
> +
> +                               LLVMSetOperand(exports.inst[i], 3,
> +                                              LLVMConstInt(ctx->i32,
> +                                                           V_008DFC_SQ_EXP_PARAM + new_count, 0));
> +                               shader->info.vs_output_param_offset[out] = new_count;
> +                               new_count++;
> +                               break;
> +                       }
> +               }
> +               shader->info.nr_param_exports = new_count;
> +       }
> +}
> +
>  int si_compile_tgsi_shader(struct si_screen *sscreen,
>                            LLVMTargetMachineRef tm,
>                            struct si_shader *shader,
>                            bool is_monolithic,
>                            struct pipe_debug_callback *debug)
>  {
>         struct si_shader_selector *sel = shader->selector;
>         struct si_shader_context ctx;
>         struct lp_build_tgsi_context *bld_base;
>         LLVMModuleRef mod;
> @@ -6616,20 +6763,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>          * conversion fails. */
>         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
>             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
>                 tgsi_dump(sel->tokens, 0);
>                 si_dump_streamout(&sel->so);
>         }
>
>         si_init_shader_ctx(&ctx, sscreen, shader, tm);
>         ctx.is_monolithic = is_monolithic;
>
> +       memset(shader->info.vs_output_param_offset, 0xff,
> +              sizeof(shader->info.vs_output_param_offset));
> +
>         shader->info.uses_instanceid = sel->info.uses_instanceid;
>
>         bld_base = &ctx.radeon_bld.soa.bld_base;
>         ctx.radeon_bld.load_system_value = declare_system_value;
>
>         switch (ctx.type) {
>         case PIPE_SHADER_VERTEX:
>                 ctx.radeon_bld.load_input = declare_input_vs;
>                 if (shader->key.vs.as_ls)
>                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
> @@ -6701,20 +6851,24 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>
>         /* Dump LLVM IR before any optimization passes */
>         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
>             r600_can_dump_shader(&sscreen->b, ctx.type))
>                 LLVMDumpModule(mod);
>
>         radeon_llvm_finalize_module(
>                 &ctx.radeon_bld,
>                 r600_extra_shader_checks(&sscreen->b, ctx.type));
>
> +       /* Post-optimization transformations. */
> +       si_eliminate_const_vs_outputs(&ctx);
> +
> +       /* Compile to bytecode. */
>         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
>                             mod, debug, ctx.type, "TGSI shader");
>         if (r) {
>                 fprintf(stderr, "LLVM failed to compile shader\n");
>                 goto out;
>         }
>
>         radeon_llvm_dispose(&ctx.radeon_bld);
>
>         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index b07210c..6c7a05f 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -408,20 +408,31 @@ struct si_shader_config {
>         unsigned                        spilled_vgprs;
>         unsigned                        lds_size;
>         unsigned                        spi_ps_input_ena;
>         unsigned                        spi_ps_input_addr;
>         unsigned                        float_mode;
>         unsigned                        scratch_bytes_per_wave;
>         unsigned                        rsrc1;
>         unsigned                        rsrc2;
>  };
>
> +enum {
> +       /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
> +       EXP_PARAM_OFFSET_0 = 0,
> +       EXP_PARAM_OFFSET_31 = 31,
> +       /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
> +       EXP_PARAM_DEFAULT_VAL_0000 = 64,
> +       EXP_PARAM_DEFAULT_VAL_0001,
> +       EXP_PARAM_DEFAULT_VAL_1110,
> +       EXP_PARAM_DEFAULT_VAL_1111,
> +};
> +
>  /* GCN-specific shader info. */
>  struct si_shader_info {
>         ubyte                   vs_output_param_offset[SI_MAX_VS_OUTPUTS];
>         ubyte                   num_input_sgprs;
>         ubyte                   num_input_vgprs;
>         char                    face_vgpr_index;
>         bool                    uses_instanceid;
>         ubyte                   nr_pos_exports;
>         ubyte                   nr_param_exports;
>  };
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index d339b84..c030ed9 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -1562,36 +1562,49 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
>         pipe_mutex_destroy(sel->mutex);
>         free(sel->tokens);
>         free(sel);
>  }
>
>  static unsigned si_get_ps_input_cntl(struct si_context *sctx,
>                                      struct si_shader *vs, unsigned name,
>                                      unsigned index, unsigned interpolate)
>  {
>         struct tgsi_shader_info *vsinfo = &vs->selector->info;
> -       unsigned j, ps_input_cntl = 0;
> +       unsigned j, offset, ps_input_cntl = 0;
>
>         if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
>             (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
>                 ps_input_cntl |= S_028644_FLAT_SHADE(1);
>
>         if (name == TGSI_SEMANTIC_PCOORD ||
>             (name == TGSI_SEMANTIC_TEXCOORD &&
>              sctx->sprite_coord_enable & (1 << index))) {
>                 ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
>         }
>
>         for (j = 0; j < vsinfo->num_outputs; j++) {
>                 if (name == vsinfo->output_semantic_name[j] &&
>                     index == vsinfo->output_semantic_index[j]) {
> -                       ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[j]);
> +                       offset = vs->info.vs_output_param_offset[j];
> +
> +                       if (offset <= EXP_PARAM_OFFSET_31) {
> +                               /* The input is loaded from parameter memory. */
> +                               ps_input_cntl |= S_028644_OFFSET(offset);
> +                       } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
> +                               /* The input is a DEFAULT_VAL constant. */
> +                               assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 &&
> +                                      offset <= EXP_PARAM_DEFAULT_VAL_1111);
> +
> +                               offset -= EXP_PARAM_DEFAULT_VAL_0000;
> +                               ps_input_cntl = S_028644_OFFSET(0x20) |
> +                                               S_028644_DEFAULT_VAL(offset);
> +                       }
>                         break;
>                 }
>         }
>
>         if (name == TGSI_SEMANTIC_PRIMID)
>                 /* PrimID is written after the last output. */
>                 ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
>         else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
>                 /* No corresponding output found, load defaults into input.
>                  * Don't set any other bits.
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list