Mesa (master): nir_to_tgsi: Store directly to TGSI outputs when possible.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Jan 20 20:35:27 UTC 2021


Module: Mesa
Branch: master
Commit: f79f382c81f884bc217ad6efd8c7b3c8bbacdb01
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f79f382c81f884bc217ad6efd8c7b3c8bbacdb01

Author: Eric Anholt <eric at anholt.net>
Date:   Wed Dec  9 15:49:16 2020 -0800

nir_to_tgsi: Store directly to TGSI outputs when possible.

Saves emitting a MOV at the end of the program to store the output.

softpipe glmark2 -b buffer +9.73451% +/- 3.17924% (n=6)
softpipe glmark2 -b build +5.57621% +/- 1.35074% (n=9)

Reviewed-by: Matt Turner <mattst88 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8023>

---

 src/gallium/auxiliary/nir/nir_to_tgsi.c | 227 ++++++++++++++++++++------------
 1 file changed, 144 insertions(+), 83 deletions(-)

diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index 5a9c83648df..fbc96b35160 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -162,6 +162,124 @@ ntt_tgsi_var_usage_mask(const struct nir_variable *var)
                               glsl_type_is_64bit(type_without_array));
 }
 
+static struct ureg_dst
+ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
+{
+   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+   int base = nir_intrinsic_base(instr);
+   *frac = nir_intrinsic_component(instr);
+   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
+   /* The TGSI output declared for this store; returned with its writemask set. */
+   struct ureg_dst out;
+   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+      if (semantics.location == FRAG_RESULT_COLOR)
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
+
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(semantics.location,
+                                       &semantic_name, &semantic_index);
+      semantic_index += semantics.dual_source_blend_index;
+
+      switch (semantics.location) {
+      case FRAG_RESULT_DEPTH:
+         *frac = 2; /* the depth write goes to the .z channel in TGSI */
+         break;
+      case FRAG_RESULT_STENCIL:
+         *frac = 1; /* the stencil write goes to the .y channel */
+         break;
+      default:
+         break;
+      }
+
+      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+   } else {
+      unsigned semantic_name, semantic_index;
+
+      ntt_get_gl_varying_semantic(c, semantics.location,
+                                  &semantic_name, &semantic_index);
+
+      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
+                                                instr->num_components,
+                                                is_64);
+      uint32_t gs_streams = semantics.gs_streams;
+      for (int i = 0; i < 4; i++) {
+         if (!(usage_mask & (1 << i)))
+            gs_streams &= ~(0x3 << 2 * i); /* drop the 2 stream bits of unused channel i */
+      }
+
+      /* No driver appears to use array_id of outputs. */
+      unsigned array_id = 0;
+
+      /* This bit is lost in the i/o semantics, but it's unused in in-tree
+       * drivers.
+       */
+      bool invariant = false;
+
+      out = ureg_DECL_output_layout(c->ureg,
+                                    semantic_name, semantic_index,
+                                    gs_streams,
+                                    base,
+                                    usage_mask,
+                                    array_id,
+                                    semantics.num_slots,
+                                    invariant);
+   }
+   /* Position the NIR write mask on the TGSI channels selected by *frac. */
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+
+   if (is_64) {
+      write_mask = ntt_64bit_write_mask(write_mask);
+      if (*frac >= 2)
+         write_mask = write_mask << 2;
+   } else {
+      write_mask = write_mask << *frac;
+   }
+   return ureg_writemask(out, write_mask);
+}
+
+/* If this reg or SSA def has exactly one use, and that use is a store_output
+ * with a constant offset, point *dst at the TGSI output so that store_output
+ * need not emit its own MOV.  Only returns true for VS/FS and component 0.
+ */
+static bool
+ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
+                             struct list_head *uses, struct list_head *if_uses)
+{
+   *dst = ureg_dst_undef();
+
+   switch (c->s->info.stage) {
+   case MESA_SHADER_FRAGMENT:
+   case MESA_SHADER_VERTEX:
+      break;
+   default:
+      /* tgsi_exec (at least) requires that output stores happen per vertex
+       * emitted, you don't get to reuse a previous output value for the next
+       * vertex.
+       */
+      return false;
+   }
+
+   if (!list_is_empty(if_uses) || !list_is_singular(uses))
+      return false;
+   /* The sole user must be a store_output intrinsic with a constant offset. */
+   nir_src *src = list_first_entry(uses, nir_src, use_link);
+
+   if (src->parent_instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
+   if (intr->intrinsic != nir_intrinsic_store_output ||
+       !nir_src_is_const(intr->src[1])) {
+      return false;
+   }
+   /* Declare the output and fold the constant offset into its register index. */
+   uint32_t frac;
+   *dst = ntt_store_output_decl(c, intr, &frac);
+   dst->Index += nir_src_as_uint(intr->src[1]);
+   /* NOTE: when frac != 0 we return false but *dst still refers to the output. */
+   return frac == 0;
+}
+
 static void
 ntt_setup_inputs(struct ntt_compile *c)
 {
@@ -297,16 +415,18 @@ ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
       struct ureg_dst decl;
       if (nir_reg->num_array_elems == 0) {
          uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
-         if (nir_reg->bit_size == 64) {
-            if (nir_reg->num_components > 2) {
-               fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
-                       nir_reg->num_components, nir_reg->index);
+         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
+            if (nir_reg->bit_size == 64) {
+               if (nir_reg->num_components > 2) {
+                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
+                        nir_reg->num_components, nir_reg->index);
+               }
+
+               write_mask = ntt_64bit_write_mask(write_mask);
             }
 
-            write_mask = ntt_64bit_write_mask(write_mask);
+            decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
          }
-
-         decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
       } else {
          decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
                                           true);
@@ -455,13 +575,15 @@ ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
 static struct ureg_dst *
 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
 {
-   struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
-
    uint32_t writemask = BITSET_MASK(ssa->num_components);
    if (ssa->bit_size == 64)
       writemask = ntt_64bit_write_mask(writemask);
 
-   c->ssa_temp[ssa->index] = ureg_writemask(temp, writemask);
+   struct ureg_dst dst;
+   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
+      dst = ureg_DECL_temporary(c->ureg);
+
+   c->ssa_temp[ssa->index] = ureg_writemask(dst, writemask);
 
    return &c->ssa_temp[ssa->index];
 }
@@ -997,7 +1119,9 @@ ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
 {
    if (nir_src_is_const(src)) {
       return ureg_src_dimension(usrc, nir_src_as_uint(src));
-   } else {
+   }
+   else
+   {
       return ureg_src_dimension_indirect(usrc,
                                          ntt_reladdr(c, ntt_get_src(c, src)),
                                          0);
@@ -1445,71 +1569,19 @@ ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
 static void
 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
 {
-   /* TODO: When making an SSA def's storage, we should check if it's only
-    * used as the source of a store_output and point it at our
-    * TGSI_FILE_OUTPUT instead of generating the extra MOV here.
-    */
-   uint32_t base = nir_intrinsic_base(instr);
    struct ureg_src src = ntt_get_src(c, instr->src[0]);
-   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
-   struct ureg_dst out;
-   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
-   uint32_t frac = nir_intrinsic_component(instr);
-
-   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-      if (semantics.location == FRAG_RESULT_COLOR)
-         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
-
-      unsigned semantic_name, semantic_index;
-      tgsi_get_gl_frag_result_semantic(semantics.location,
-                                       &semantic_name, &semantic_index);
-      semantic_index += semantics.dual_source_blend_index;
 
-      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
-
-      switch (semantics.location) {
-      case FRAG_RESULT_DEPTH:
-         frac = 2; /* z write is the to the .z channel in TGSI */
-         break;
-      case FRAG_RESULT_STENCIL:
-         frac = 1;
-         break;
-      default:
-         break;
-      }
-   } else {
-      unsigned semantic_name, semantic_index;
-
-      ntt_get_gl_varying_semantic(c, semantics.location,
-                                  &semantic_name, &semantic_index);
-
-      uint32_t usage_mask = ntt_tgsi_usage_mask(frac,
-                                                instr->num_components,
-                                                is_64);
-      uint32_t gs_streams = semantics.gs_streams;
-      for (int i = 0; i < 4; i++) {
-         if (!(usage_mask & (1 << i)))
-            gs_streams &= ~(0x3 << 2 * i);
-      }
-
-      /* No driver appears to use array_id of outputs. */
-      unsigned array_id = 0;
-
-      /* This bit is lost in the i/o semantics, but it's unused in in-tree
-       * drivers.
+   if (src.File == TGSI_FILE_OUTPUT) {
+      /* If our src is the output file, that's an indication that we were able
+       * to emit the output stores in the generating instructions and we have
+       * nothing to do here.
        */
-      bool invariant = false;
-
-      out = ureg_DECL_output_layout(c->ureg,
-                                    semantic_name, semantic_index,
-                                    gs_streams,
-                                    base,
-                                    usage_mask,
-                                    array_id,
-                                    semantics.num_slots,
-                                    invariant);
+      return;
    }
 
+   uint32_t frac;
+   struct ureg_dst out = ntt_store_output_decl(c, instr, &frac);
+
    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
@@ -1517,24 +1589,13 @@ ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
    }
 
-   unsigned write_mask = nir_intrinsic_write_mask(instr);
-
-   if (is_64) {
-      write_mask = ntt_64bit_write_mask(write_mask);
-      if (frac >= 2)
-         write_mask = write_mask << 2;
-   } else {
-      write_mask = write_mask << frac;
-   }
-
    uint8_t swizzle[4] = { 0, 0, 0, 0 };
    for (int i = frac; i <= 4; i++) {
-      if (write_mask & (1 << i))
+      if (out.WriteMask & (1 << i))
          swizzle[i] = i - frac;
    }
 
    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-   out = ureg_writemask(out, write_mask);
 
    ureg_MOV(c->ureg, out, src);
    ntt_reladdr_dst_put(c, out);



More information about the mesa-commit mailing list