Mesa (master): nir/lower_vec_to_movs: don't vectorize unsupported ops

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Jan 11 13:31:36 UTC 2021


Module: Mesa
Branch: master
Commit: faaba0d6afe0c5f6985345c7c6226435658d196a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=faaba0d6afe0c5f6985345c7c6226435658d196a

Author: Erico Nunes <nunes.erico at gmail.com>
Date:   Sun Aug 30 15:07:23 2020 +0200

nir/lower_vec_to_movs: don't vectorize unsupported ops

If coalescing the instruction would vectorize it but the target doesn't
support vectorizing that op, skip coalescing.
Reuse the callbacks from alu_to_scalar to describe which ops should not
be vectorized.

Signed-off-by: Erico Nunes <nunes.erico at gmail.com>
Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>
Reviewed-by: Eric Anholt <eric at anholt.net>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6506>

---

 src/compiler/nir/nir.h                       | 11 ++++++++-
 src/compiler/nir/nir_lower_vec_to_movs.c     | 37 +++++++++++++++++++++++-----
 src/gallium/auxiliary/nir/nir_to_tgsi.c      |  2 +-
 src/gallium/drivers/freedreno/a2xx/ir2_nir.c |  2 +-
 src/gallium/drivers/lima/lima_program.c      | 13 +++++++++-
 src/intel/compiler/brw_nir.c                 |  2 +-
 src/panfrost/midgard/midgard_compile.c       |  2 +-
 7 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 81450cf62a2..f4963ef7060 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4088,6 +4088,14 @@ static inline bool should_print_nir(nir_shader *shader) { return false; }
  */
 typedef bool (*nir_instr_filter_cb)(const nir_instr *, const void *);
 
+/** An instruction filtering callback with writemask
+ *
+ * Returns true if the instruction should be processed with the associated
+ * writemask and false otherwise.
+ */
+typedef bool (*nir_instr_writemask_filter_cb)(const nir_instr *,
+                                              unsigned writemask, const void *);
+
 /** A simple instruction lowering callback
  *
  * Many instruction lowering passes can be written as a simple function which
@@ -4457,7 +4465,8 @@ bool nir_lower_variable_initializers(nir_shader *shader,
                                      nir_variable_mode modes);
 
 bool nir_move_vec_src_uses_to_dest(nir_shader *shader);
-bool nir_lower_vec_to_movs(nir_shader *shader);
+bool nir_lower_vec_to_movs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
+                           const void *_data);
 void nir_lower_alpha_test(nir_shader *shader, enum compare_func func,
                           bool alpha_to_one,
                           const gl_state_index16 *alpha_ref_state_tokens);
diff --git a/src/compiler/nir/nir_lower_vec_to_movs.c b/src/compiler/nir/nir_lower_vec_to_movs.c
index 29ce7e508a2..3efe709b39f 100644
--- a/src/compiler/nir/nir_lower_vec_to_movs.c
+++ b/src/compiler/nir/nir_lower_vec_to_movs.c
@@ -28,6 +28,11 @@
 #include "nir.h"
 #include "nir_builder.h"
 
+struct vec_to_movs_data {
+   nir_instr_writemask_filter_cb cb;
+   const void *data;
+};
+
 /*
  * Implements a simple pass that lowers vecN instructions to a series of
  * moves with partial writes.
@@ -119,8 +124,10 @@ has_replicated_dest(nir_alu_instr *alu)
  * can then call insert_mov as normal.
  */
 static unsigned
-try_coalesce(nir_alu_instr *vec, unsigned start_idx)
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, void *_data)
 {
+   struct vec_to_movs_data *data = _data;
+
    assert(start_idx < nir_op_infos[vec->op].num_inputs);
 
    /* We will only even try if the source is SSA */
@@ -178,6 +185,7 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx)
       for (unsigned i = 0; i < 4; i++)
          swizzles[j][i] = src_alu->src[j].swizzle[i];
 
+   /* Generate the final write mask */
    unsigned write_mask = 0;
    for (unsigned i = start_idx; i < 4; i++) {
       if (!(vec->dest.write_mask & (1 << i)))
@@ -187,10 +195,21 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx)
           vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
          continue;
 
-      /* At this point, the give vec source matchese up with the ALU
+      write_mask |= 1 << i;
+   }
+
+   /* If the instruction would be vectorized but the backend
+    * doesn't support vectorizing this op, abort. */
+   if (data->cb && !data->cb(&src_alu->instr, write_mask, data->data))
+      return 0;
+
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(write_mask & (1 << i)))
+         continue;
+
+      /* At this point, the given vec source matches up with the ALU
        * instruction so we can re-swizzle that component to match.
        */
-      write_mask |= 1 << i;
       if (has_replicated_dest(src_alu)) {
          /* Since the destination is a single replicated value, we don't need
           * to do any reswizzling
@@ -266,7 +285,7 @@ nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data)
        * vecN had an SSA destination.
        */
       if (vec_had_ssa_dest && !(finished_write_mask & (1 << i)))
-         finished_write_mask |= try_coalesce(vec, i);
+         finished_write_mask |= try_coalesce(vec, i, data);
 
       if (!(finished_write_mask & (1 << i)))
          finished_write_mask |= insert_mov(vec, i, b->shader);
@@ -279,11 +298,17 @@ nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data)
 }
 
 bool
-nir_lower_vec_to_movs(nir_shader *shader)
+nir_lower_vec_to_movs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
+                      const void *_data)
 {
+   struct vec_to_movs_data data = {
+      .cb = cb,
+      .data = _data,
+   };
+
    return nir_shader_instructions_pass(shader,
                                        nir_lower_vec_to_movs_instr,
                                        nir_metadata_block_index |
                                        nir_metadata_dominance,
-                                       NULL);
+                                       &data);
 }
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index 88ba7d4bec2..587c06074be 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -2622,7 +2622,7 @@ nir_to_tgsi(struct nir_shader *s,
               nir_lower_float_source_mods |
               nir_lower_int_source_mods); /* no doubles */
    NIR_PASS_V(s, nir_convert_from_ssa, true);
-   NIR_PASS_V(s, nir_lower_vec_to_movs);
+   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);
 
    /* locals_to_regs will leave dead derefs that are good to clean up. */
    NIR_PASS_V(s, nir_lower_locals_to_regs);
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
index 4f25ad90688..be80e4bc504 100644
--- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -1111,7 +1111,7 @@ ir2_nir_compile(struct ir2_context *ctx, bool binning)
 	OPT_V(ctx->nir, nir_convert_from_ssa, true);
 
 	OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
-	OPT_V(ctx->nir, nir_lower_vec_to_movs);
+	OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);
 
 	OPT_V(ctx->nir, nir_opt_dce);
 
diff --git a/src/gallium/drivers/lima/lima_program.c b/src/gallium/drivers/lima/lima_program.c
index 30a3f527181..5d58750eff1 100644
--- a/src/gallium/drivers/lima/lima_program.c
+++ b/src/gallium/drivers/lima/lima_program.c
@@ -191,6 +191,17 @@ lima_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
    return false;
 }
 
+static bool
+lima_vec_to_movs_filter_cb(const nir_instr *instr, unsigned writemask,
+                           const void *data)
+{
+   assert(writemask > 0);
+   if (util_bitcount(writemask) == 1)
+      return true;
+
+   return !lima_alu_to_scalar_filter_cb(instr, data);
+}
+
 void
 lima_program_optimize_fs_nir(struct nir_shader *s,
                              struct nir_lower_tex_options *tex_options)
@@ -252,7 +263,7 @@ lima_program_optimize_fs_nir(struct nir_shader *s,
    NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
 
    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
-   NIR_PASS_V(s, nir_lower_vec_to_movs);
+   NIR_PASS_V(s, nir_lower_vec_to_movs, lima_vec_to_movs_filter_cb, NULL);
 
    NIR_PASS_V(s, lima_nir_duplicate_load_uniforms);
    NIR_PASS_V(s, lima_nir_duplicate_load_inputs);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 0b319d6afac..16ae9ccc04f 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1183,7 +1183,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    if (!is_scalar) {
       OPT(nir_move_vec_src_uses_to_dest);
-      OPT(nir_lower_vec_to_movs);
+      OPT(nir_lower_vec_to_movs, NULL, NULL);
    }
 
    OPT(nir_opt_dce);
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 631aae293b0..253cdedc53f 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -348,7 +348,7 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
 
         /* We are a vector architecture; write combine where possible */
         NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
-        NIR_PASS(progress, nir, nir_lower_vec_to_movs);
+        NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
 
         NIR_PASS(progress, nir, nir_opt_dce);
 }
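
For backends picking up the new parameter, the following is a minimal,
hypothetical sketch of how the callback can be wired up, following the same
pattern as the lima hunk above. The my_backend_* names are invented for
illustration and are not part of this commit; the example policy (keeping
transcendentals scalar) is likewise only an assumption.

/* Stand-in for the filter such a backend would already pass to
 * nir_lower_alu_to_scalar(): returning true means "keep this op scalar". */
#include "nir.h"
#include "util/bitscan.h"

static bool
my_backend_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu((nir_instr *)instr);

   /* Example policy: transcendentals run on a scalar unit. */
   return alu->op == nir_op_frcp || alu->op == nir_op_frsq;
}

static bool
my_backend_vec_to_movs_filter_cb(const nir_instr *instr, unsigned writemask,
                                 const void *data)
{
   /* Coalescing into a single enabled channel never vectorizes anything,
    * so it is always safe. */
   if (util_bitcount(writemask) == 1)
      return true;

   /* Otherwise only coalesce (i.e. vectorize) ops the backend supports. */
   return !my_backend_alu_to_scalar_filter_cb(instr, data);
}

/* In the backend's lowering pipeline, after nir_convert_from_ssa():
 *
 *    NIR_PASS_V(s, nir_lower_vec_to_movs,
 *               my_backend_vec_to_movs_filter_cb, NULL);
 */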


