[Mesa-dev] [PATCH 10/11] nir/lower_vec_to_movs: Coalesce movs on-the-fly when possible

Tue Sep 15 07:00:08 PDT 2015

On 09/10/2015 02:50 AM, Jason Ekstrand wrote:
> The old pass blindly inserted a bunch of moves into the shader with no
> concern for whether or not it was really needed.  This adds code to try and
> coalesce into the destination of the instruction providing the value.
> 
> Shader-db results for vec4 shaders on Haswell:
> 
>    total instructions in shared programs: 1801527 -> 1778849 (-1.26%)
>    instructions in affected programs:     1205644 -> 1182966 (-1.88%)
>    helped:                                9817
>    HURT:                                  1334
> 
> This approach is heavily based on a different patch by Eduardo Lima Mitev
> <elima at igalia.com>.  Eduardo's patch did this in a separate pass as opposed
> to integrating it into nir_lower_vec_to_movs.
> 
> Cc: Eduardo Lima Mitev <elima at igalia.com>

Implementing this pass on partial SSA form does make things easier.

Also, having nir_instr_rewrite_dest() helps too. I tried adding a
similar method to NIR, but soon got frustrated and found out that adding
a new instruction and removing the old one was easier :).

This patch is:

Reviewed-by: Eduardo Lima Mitev <elima at igalia.com>

I think this is the last patch that was pending review from this series.
The patch 11/11 (aka, the fdot hack), has been obsoleted by your patches
10.1, 10.2 and 11 in the v2 of this series. I will review those next.

> ---
>  src/glsl/nir/nir_lower_vec_to_movs.c | 85 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 85 insertions(+)
> 
> diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
> index ed8ec9b..0ebf3e3 100644
> --- a/src/glsl/nir/nir_lower_vec_to_movs.c
> +++ b/src/glsl/nir/nir_lower_vec_to_movs.c
> @@ -84,6 +84,88 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
>     return mov->dest.write_mask;
>  }
>  
> +/* Attempts to coalesce the "move" from the given source of the vec to the
> + * destination of the instruction generating the value. If, for whatever
> + * reason, we cannot coalesce the mmove, it does nothing and returns 0.  We
> + * can then call insert_mov as normal.
> + */
> +static unsigned
> +try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
> +{
> +   assert(start_idx < nir_op_infos[vec->op].num_inputs);
> +
> +   /* We will only even try if the source is SSA */
> +   if (!vec->src[start_idx].src.is_ssa)
> +      return 0;
> +
> +   assert(vec->src[start_idx].src.ssa);
> +
> +   /* If we are going to do a reswizzle, then the vecN operation must be the
> +    * only use of the source value.  We also can't have any source modifiers.
> +    */
> +   nir_foreach_use(vec->src[start_idx].src.ssa, src) {
> +      if (src->parent_instr != &vec->instr)
> +         return 0;
> +
> +      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
> +      if (alu_src->abs || alu_src->negate)
> +         return 0;
> +   }
> +
> +   if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
> +      return 0;
> +
> +   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
> +      return 0;
> +
> +   nir_alu_instr *src_alu =
> +      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
> +
> +   /* We only care about being able to re-swizzle the instruction if it is
> +    * something that we can reswizzle.  It must be per-component.
> +    */
> +   if (nir_op_infos[src_alu->op].output_size != 0)
> +      return 0;
> +
> +   /* If we are going to reswizzle the instruction, we can't have any
> +    * non-per-component sources either.
> +    */
> +   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
> +      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
> +         return 0;
> +
> +   /* Stash off all of the ALU instruction's swizzles. */
> +   uint8_t swizzles[4][4];
> +   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
> +      for (unsigned i = 0; i < 4; i++)
> +         swizzles[j][i] = src_alu->src[j].swizzle[i];
> +
> +   unsigned write_mask = 0;
> +   for (unsigned i = start_idx; i < 4; i++) {
> +      if (!(vec->dest.write_mask & (1 << i)))
> +         continue;
> +
> +      if (!vec->src[i].src.is_ssa ||
> +          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
> +         continue;
> +
> +      /* At this point, the give vec source matchese up with the ALU
> +       * instruction so we can re-swizzle that component to match.
> +       */
> +      write_mask |= 1 << i;
> +      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
> +         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
> +
> +      /* Clear the no longer needed vec source */
> +      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
> +   }
> +
> +   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
> +   src_alu->dest.write_mask = write_mask;
> +
> +   return write_mask;
> +}
> +
>  static bool
>  lower_vec_to_movs_block(nir_block *block, void *void_state)
>  {
> @@ -141,6 +223,9 @@ lower_vec_to_movs_block(nir_block *block, void *void_state)
>              continue;
>  
>           if (!(finished_write_mask & (1 << i)))
> +            finished_write_mask |= try_coalesce(vec, i, state->shader);
> +
> +         if (!(finished_write_mask & (1 << i)))
>              finished_write_mask |= insert_mov(vec, i, state->shader);
>        }
>  
>