Mesa (main): pan/va: Mark last register reads

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Jun 21 22:41:34 UTC 2022


Module: Mesa
Branch: main
Commit: 4b7e337b45da6ac773a4e210391c67ef7e426ab8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=4b7e337b45da6ac773a4e210391c67ef7e426ab8

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Wed Jun 15 16:52:39 2022 -0400

pan/va: Mark last register reads

On Valhall, register reads may be marked as "last" [1]. Setting the last flag
promises the hardware that the value of the register is no longer required. This
may enable hardware optimizations. In particular, it may permit the hardware to
avoid register file writes if a write to the marked register is still in the
forwarding buffer. This may improve power efficiency.

In principle, this is trivial: run liveness analysis and mark killed sources,
like we would in an SSA-based register allocator. In practice, there are a few
wrinkles to avoid hazards around staging registers and 64-bit register pairs,
requiring some additional data flow analysis and fix ups. However, nothing here
is particularly "hard", and all the ideas are already in use for the Bifrost
scheduler and the Bifrost/Valhall scoreboard analyses.

[1] In Mesa's compiler, this is called discard for historical reasons.

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17091>

---

 src/panfrost/bifrost/bifrost_compile.c      |   1 +
 src/panfrost/bifrost/meson.build            |   1 +
 src/panfrost/bifrost/valhall/va_compiler.h  |   1 +
 src/panfrost/bifrost/valhall/va_mark_last.c | 213 ++++++++++++++++++++++++++++
 4 files changed, 216 insertions(+)

diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 5852c7acf48..4eea93f529a 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -5099,6 +5099,7 @@ bi_compile_variant_nir(nir_shader *nir,
                 va_assign_slots(ctx);
                 va_insert_flow_control_nops(ctx);
                 va_merge_flow(ctx);
+                va_mark_last(ctx);
         } else {
                 bi_schedule(ctx);
                 bi_assign_scoreboard(ctx);
diff --git a/src/panfrost/bifrost/meson.build b/src/panfrost/bifrost/meson.build
index 5a679a6c952..a91d07efff1 100644
--- a/src/panfrost/bifrost/meson.build
+++ b/src/panfrost/bifrost/meson.build
@@ -52,6 +52,7 @@ libpanfrost_bifrost_files = files(
   'valhall/va_lower_isel.c',
   'valhall/va_lower_split_64bit.c',
   'valhall/va_optimize.c',
+  'valhall/va_mark_last.c',
   'valhall/va_merge_flow.c',
   'valhall/va_pack.c',
   'valhall/va_perf.c',
diff --git a/src/panfrost/bifrost/valhall/va_compiler.h b/src/panfrost/bifrost/valhall/va_compiler.h
index c6f841c39b3..22aaea77c53 100644
--- a/src/panfrost/bifrost/valhall/va_compiler.h
+++ b/src/panfrost/bifrost/valhall/va_compiler.h
@@ -43,6 +43,7 @@ void va_lower_isel(bi_instr *I);
 void va_assign_slots(bi_context *ctx);
 void va_insert_flow_control_nops(bi_context *ctx);
 void va_merge_flow(bi_context *ctx);
+void va_mark_last(bi_context *ctx);
 uint64_t va_pack_instr(const bi_instr *I);
 
 static inline unsigned
diff --git a/src/panfrost/bifrost/valhall/va_mark_last.c b/src/panfrost/bifrost/valhall/va_mark_last.c
new file mode 100644
index 00000000000..46ddeb76cd0
--- /dev/null
+++ b/src/panfrost/bifrost/valhall/va_mark_last.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2022 Collabora Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "va_compiler.h"
+#include "valhall_enums.h"
+
+/*
+ * Valhall sources may be marked as the last use of a register, according
+ * to the following rules:
+ *
+ * 1. The last use of a register should be marked allowing the hardware
+ *    to elide register writes.
+ * 2. Staging sources may be read at any time before the asynchronous
+ *    instruction completes. If a register is used as both a staging source and
+ *    a regular source, the regular source cannot be marked until the program
+ *    waits for the asynchronous instruction.
+ * 3. Marking a register pair marks both registers in the pair.
+ *
+ * Last use information follows immediately from (post-RA) liveness analysis:
+ * a register is dead immediately after its last use.
+ *
+ * Staging information follows from scoreboard analysis: do not mark registers
+ * that are read by a pending asynchronous instruction. Note that the Valhall
+ * scoreboard analysis does not track reads, so we handle that with our own
+ * (simplified) scoreboard analysis.
+ *
+ * Register pairs are marked conservatively: if either register in a pair cannot
+ * be marked, do not mark either register.
+ */
+
+/*
+ * Build a bitmask of the registers that I reads through its staging sources.
+ * Bit r is set when register r lies inside the range read by some non-null
+ * staging source of I. Instructions without such sources yield 0.
+ */
+static uint64_t
+bi_staging_read_mask(const bi_instr *I)
+{
+   uint64_t regs = 0;
+
+   bi_foreach_src(I, s) {
+      /* Only non-null staging sources contribute to the mask */
+      if (!bi_is_staging_src(I, s) || bi_is_null(I->src[s]))
+         continue;
+
+      assert(I->src[s].type == BI_INDEX_REGISTER);
+
+      /* The source covers nr consecutive registers starting at its value */
+      unsigned nr = bi_count_read_registers(I, s);
+      regs |= BITFIELD64_MASK(nr) << I->src[s].value;
+   }
+
+   return regs;
+}
+
+/*
+ * Check whether instruction I writes register reg, i.e. whether reg lies in
+ * the register range covered by any non-null destination of I.
+ */
+static bool
+bi_writes_reg(const bi_instr *I, unsigned reg)
+{
+   bi_foreach_dest(I, d) {
+      if (!bi_is_null(I->dest[d])) {
+         assert(I->dest[d].type == BI_INDEX_REGISTER);
+
+         unsigned base = I->dest[d].value;
+         unsigned nr = bi_count_write_registers(I, d);
+
+         /* Destination d covers registers [base, base + nr) */
+         if (base <= reg && (reg - base) < nr)
+            return true;
+      }
+   }
+
+   return false;
+}
+
+/*
+ * Report whether the flow control value `flow` waits on scoreboard slot
+ * `slot`. Returns 1 if it does and 0 otherwise.
+ */
+static unsigned
+waits_on_slot(enum va_flow flow, unsigned slot)
+{
+   /* These two encodings count as a wait regardless of the slot queried */
+   if (flow == VA_FLOW_WAIT || flow == VA_FLOW_WAIT0126)
+      return 1;
+
+   /* Any other encoding only qualifies if it is a wait-or-none value... */
+   if (!va_flow_is_wait_or_none(flow))
+      return 0;
+
+   /* ...with the bit for this slot set */
+   return (flow & BITFIELD_BIT(slot)) != 0;
+}
+
+/*
+ * Advance the staging read scoreboard across instruction I: record the
+ * registers I reads through staging sources against I's slot, then clear
+ * every slot that I's flow control waits on.
+ */
+static void
+scoreboard_update(struct bi_scoreboard_state *st, const bi_instr *I)
+{
+   /* Staging reads issued by I become pending on its slot */
+   uint64_t issued = bi_staging_read_mask(I);
+   st->read[I->slot] |= issued;
+
+   /* A wait clears everything tracked on the slot, including I's own reads */
+   for (unsigned slot = 0; slot < VA_NUM_GENERAL_SLOTS; ++slot) {
+      if (!waits_on_slot(I->flow, slot))
+         continue;
+
+      st->read[slot] = 0;
+   }
+}
+
+/*
+ * Forward data flow analysis over the shader's control flow graph. For every
+ * block it computes the per-slot masks of staging registers that may still be
+ * read at block entry (scoreboard_in) and at block exit (scoreboard_out).
+ * Both fields are reset for every block before the analysis starts.
+ */
+static void
+va_analyze_scoreboard_reads(bi_context *ctx)
+{
+   u_worklist worklist;
+   bi_worklist_init(ctx, &worklist);
+
+   /* Seed the worklist with every block, clearing stale results as we go */
+   bi_foreach_block(ctx, block) {
+      bi_worklist_push_tail(&worklist, block);
+
+      block->scoreboard_in = (struct bi_scoreboard_state){ 0 };
+      block->scoreboard_out = (struct bi_scoreboard_state){ 0 };
+   }
+
+   /* Iterate until no block's exit state changes, popping from the head */
+   while (!u_worklist_is_empty(&worklist)) {
+      bi_block *cur = bi_worklist_pop_head(&worklist);
+
+      /* Entry state accumulates the exit states of all predecessors */
+      bi_foreach_predecessor(cur, pred) {
+         for (unsigned slot = 0; slot < VA_NUM_GENERAL_SLOTS; ++slot)
+            cur->scoreboard_in.read[slot] |= (*pred)->scoreboard_out.read[slot];
+      }
+
+      /* Exit state: apply the per-instruction update across the block */
+      struct bi_scoreboard_state out = cur->scoreboard_in;
+
+      bi_foreach_instr_in_block(cur, I)
+         scoreboard_update(&out, I);
+
+      /* Successors only need another visit if the exit state changed */
+      bool changed = memcmp(&out, &cur->scoreboard_out, sizeof(out)) != 0;
+      cur->scoreboard_out = out;
+
+      if (changed) {
+         bi_foreach_successor(cur, succ)
+            bi_worklist_push_tail(&worklist, succ);
+      }
+   }
+
+   u_worklist_fini(&worklist);
+}
+
+/*
+ * Set the discard ("last use") flag on register sources throughout the shader.
+ *
+ * Runs post-RA liveness and the staging read scoreboard analysis over the
+ * whole shader, then processes each block in two passes:
+ *
+ *  1. Backwards from the block's live-out set: every register source is
+ *     marked if the registers it reads are dead after the instruction, or if
+ *     its first register is written by the same instruction.
+ *  2. Forwards from the block's incoming scoreboard state: the mark is cleared
+ *     on staging sources, on sources whose register is still read by a
+ *     pending staging access, and on 64-bit pairs where only one half ended
+ *     up marked.
+ */
+void
+va_mark_last(bi_context *ctx)
+{
+   /* Analyze the shader globally */
+   bi_postra_liveness(ctx);
+   va_analyze_scoreboard_reads(ctx);
+
+   bi_foreach_block(ctx, block) {
+      /* Registers live after the instruction currently being visited */
+      uint64_t live = block->reg_live_out;
+
+      /* Mark all last uses */
+      bi_foreach_instr_in_block_rev(block, I) {
+         bi_foreach_src(I, s) {
+            if (I->src[s].type != BI_INDEX_REGISTER)
+               continue;
+
+            /* Bitmask of the nr consecutive registers read by this source */
+            unsigned nr = bi_count_read_registers(I, s);
+            uint64_t mask = BITFIELD64_MASK(nr) << I->src[s].value;
+
+            /* If the register is dead after this instruction, it's the last
+             * use. This assignment overwrites any previous value of the flag.
+             */
+            I->src[s].discard = (live & mask) == 0;
+
+            /* If the register is overwritten this cycle, it is implicitly
+             * discarded, but that won't show up in the liveness analysis.
+             * Only the first register of the source is tested here.
+             */
+            I->src[s].discard |= bi_writes_reg(I, I->src[s].value);
+         }
+
+         /* Step the live set across I for the preceding instruction
+          * (presumably yields the set live before I -- confirm against
+          * bi_postra_liveness_ins).
+          */
+         live = bi_postra_liveness_ins(live, I);
+      }
+
+      /* Replay the scoreboard from the entry state computed by the analysis */
+      struct bi_scoreboard_state st = block->scoreboard_in;
+
+      bi_foreach_instr_in_block(block, I) {
+         /* Unmark registers read by a pending async instruction */
+         bi_foreach_src(I, s) {
+            if (!I->src[s].discard)
+               continue;
+
+            assert(I->src[s].type == BI_INDEX_REGISTER);
+
+            /* Union of the reads pending on slots 0-2.
+             * NOTE(review): hard-codes three slots whereas the analysis loops
+             * up to VA_NUM_GENERAL_SLOTS -- confirm the two agree.
+             */
+            uint64_t pending_regs = st.read[0] | st.read[1] | st.read[2];
+            bool pending = (pending_regs & BITFIELD64_BIT(I->src[s].value));
+
+            /* Staging sources are never left marked */
+            if (bi_is_staging_src(I, s) || pending)
+               I->src[s].discard = false;
+         }
+
+         /* Unmark register pairs where one half must be preserved */
+         bi_foreach_src(I, s) {
+            /* Only look for "real" architectural registers */
+            if (s >= 3)
+               break;
+
+            /* Source s + 1 is treated as the other half of the pair
+             * (presumably 64-bit sources occupy two consecutive source
+             * slots by this point -- confirm).
+             */
+            if (va_src_info(I->op, s).size == VA_SIZE_64) {
+               bool both_discard = I->src[s].discard && I->src[s + 1].discard;
+
+               I->src[s + 0].discard = both_discard;
+               I->src[s + 1].discard = both_discard;
+            }
+         }
+
+         /* Updated last, so I's own staging reads are not yet pending while
+          * I's sources are checked above.
+          */
+         scoreboard_update(&st, I);
+      }
+   }
+}



More information about the mesa-commit mailing list