Mesa (main): pan/bi: Implement basic scoreboarding pass

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Feb 22 17:10:15 UTC 2022


Module: Mesa
Branch: main
Commit: c81c022e666d13ff5a38895295f068f1469a4b62
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c81c022e666d13ff5a38895295f068f1469a4b62

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Thu Dec 23 11:09:42 2021 -0500

pan/bi: Implement basic scoreboarding pass

Extend our existing bi_scoreboard infrastructure with a simple data flow
analysis pass that calculates which dependency slots need waiting. We
still lack a heuristic for selecting dependency slots.

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14298>

---

 src/panfrost/bifrost/bi_pack.c       |   6 +
 src/panfrost/bifrost/bi_schedule.c   |   3 -
 src/panfrost/bifrost/bi_scoreboard.c | 264 +++++++++++++++++++++++++++++++----
 3 files changed, 244 insertions(+), 29 deletions(-)

diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c
index c5dbdebd2ee..58a8cb823e2 100644
--- a/src/panfrost/bifrost/bi_pack.c
+++ b/src/panfrost/bifrost/bi_pack.c
@@ -36,6 +36,12 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
         unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
         dependency_wait |= next_2 ? next_2->dependencies : 0;
 
+        /* Signal barriers (slot #7) immediately. This is not optimal but good
+         * enough. Doing better requires extending the IR and scheduler.
+         */
+        if (clause->message_type == BIFROST_MESSAGE_BARRIER)
+                dependency_wait |= BITFIELD_BIT(7);
+
         bool staging_barrier = next_1 ? next_1->staging_barrier : false;
         staging_barrier |= next_2 ? next_2->staging_barrier : 0;
 
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index 33e32114d38..c0901eb840c 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -1860,9 +1860,6 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint
         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
         clause->block = block;
 
-        /* TODO: scoreboard assignment post-sched */
-        clause->dependencies |= (1 << 0);
-
         /* We emit in reverse and emitted to the back of the tuples array, so
          * move it up front for easy indexing */
         memmove(clause->tuples,
diff --git a/src/panfrost/bifrost/bi_scoreboard.c b/src/panfrost/bifrost/bi_scoreboard.c
index 484f8a71718..dd5054d776d 100644
--- a/src/panfrost/bifrost/bi_scoreboard.c
+++ b/src/panfrost/bifrost/bi_scoreboard.c
@@ -54,55 +54,267 @@
  */
 
 #define BI_NUM_GENERAL_SLOTS 6
+#define BI_NUM_SLOTS 8
+#define BI_NUM_REGISTERS 64
+#define BI_SLOT_SERIAL 0 /* arbitrary */
 
-/* A model for the state of the scoreboard */
+/*
+ * Due to the crude scoreboarding we do, we need to serialize varying loads and
+ * memory access. Identify these instructions here.
+ */
+static bool
+bi_should_serialize(bi_instr *I)
+{
+        /* Although nominally on the attribute unit, image loads have the same
+         * coherency requirements as general memory loads. Serialize them for
+         * now until we can do something more clever.
+         */
+        if (I->op == BI_OPCODE_LD_ATTR_TEX)
+                return true;
 
-struct bi_scoreboard_state {
-        /* TODO: what do we track here for a heuristic? */
-};
+        switch (bi_opcode_props[I->op].message) {
+        case BIFROST_MESSAGE_VARYING:
+        case BIFROST_MESSAGE_LOAD:
+        case BIFROST_MESSAGE_STORE:
+        case BIFROST_MESSAGE_ATOMIC:
+                return true;
+        default:
+                return false;
+        }
+}
 
 /* Given a scoreboard model, choose a slot for a clause wrapping a given
  * message passing instruction. No side effects. */
 
 static unsigned
-bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message)
+bi_choose_scoreboard_slot(bi_instr *message)
 {
-        /* A clause that does not produce a message must use slot #0 */
-        if (!message)
-                return 0;
-
-        switch (message->op) {
         /* ATEST, ZS_EMIT must be issued with slot #0 */
-        case BI_OPCODE_ATEST:
-        case BI_OPCODE_ZS_EMIT:
+        if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
                 return 0;
 
         /* BARRIER must be issued with slot #7 */
-        case BI_OPCODE_BARRIER:
+        if (message->op == BI_OPCODE_BARRIER)
                 return 7;
 
-        default:
-                break;
-        }
+        /* For now, make serialization easy */
+        if (bi_should_serialize(message))
+                return BI_SLOT_SERIAL;
 
-        /* TODO: Use a heuristic */
         return 0;
 }
 
+static uint64_t
+bi_read_mask(bi_instr *I, bool staging_only)
+{
+        uint64_t mask = 0;
+
+        if (staging_only && !bi_opcode_props[I->op].sr_read)
+                return mask;
+
+        bi_foreach_src(I, s) {
+                if (I->src[s].type == BI_INDEX_REGISTER) {
+                        unsigned reg = I->src[s].value;
+                        unsigned count = bi_count_read_registers(I, s);
+
+                        mask |= (BITFIELD64_MASK(count) << reg);
+                }
+
+                if (staging_only)
+                        break;
+        }
+
+        return mask;
+}
+
+static uint64_t
+bi_write_mask(bi_instr *I)
+{
+        uint64_t mask = 0;
+
+        bi_foreach_dest(I, d) {
+                if (bi_is_null(I->dest[d])) continue;
+
+                assert(I->dest[d].type == BI_INDEX_REGISTER);
+
+                unsigned reg = I->dest[d].value;
+                unsigned count = bi_count_write_registers(I, d);
+
+                mask |= (BITFIELD64_MASK(count) << reg);
+        }
+
+        /* Instructions like AXCHG.i32 unconditionally both read and write
+         * staging registers. Even if we discard the result, the write still
+         * happens logically and needs to be included in our calculations.
+         * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
+         * certain circumstances; this does not require consideration.
+         */
+        if (bi_opcode_props[I->op].sr_write && bi_is_null(I->dest[0]) &&
+            !bi_is_null(I->src[0])) {
+
+                unsigned reg = I->src[0].value;
+                unsigned count = bi_count_write_registers(I, 0);
+
+                mask |= (BITFIELD64_MASK(count) << reg);
+        }
+
+        return mask;
+}
+
+/* Update the scoreboard model to assign an instruction to a given slot */
+
+static void
+bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
+{
+        bi_instr *I = clause->message;
+        unsigned slot = clause->scoreboard_id;
+
+        if (!I)
+                return;
+
+        st->read[slot] |= bi_read_mask(I, true);
+
+        if (bi_opcode_props[I->op].sr_write)
+                st->write[slot] |= bi_write_mask(I);
+}
+
+/* Adds a dependency on each slot writing any specified register */
+
+static void
+bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
+{
+        for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
+                if (!(st->write[slot] & regmask))
+                        continue;
+
+                st->write[slot] = 0;
+                st->read[slot] = 0;
+
+                clause->dependencies |= BITFIELD_BIT(slot);
+        }
+}
+
+static void
+bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
+{
+        for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
+                if (!(st->read[slot] & regmask))
+                        continue;
+
+                st->read[slot] = 0;
+                clause->staging_barrier = true;
+        }
+}
+
+/* Sets the dependencies for a given clause, updating the model */
+
+static void
+bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
+{
+        bi_foreach_instr_in_clause(block, clause, I) {
+                uint64_t read = bi_read_mask(I, false);
+                uint64_t written = bi_write_mask(I);
+
+                /* Read-after-write; write-after-write */
+                bi_depend_on_writers(clause, st, read | written);
+
+                /* Write-after-read */
+                bi_set_staging_barrier(clause, st, written);
+        }
+
+        /* LD_VAR instructions must be serialized per-quad. Just always depend
+         * on any LD_VAR instructions. This isn't optimal, but doing better
+         * requires divergence-aware data flow analysis.
+         *
+         * Similarly, memory loads/stores need to be synchronized. For now,
+         * force them to be serialized. This is not optimal.
+         */
+        if (clause->message && bi_should_serialize(clause->message))
+                clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
+
+        /* Barriers must wait on all slots to flush existing work. It might be
+         * possible to skip this with more information about the barrier. For
+         * now, be conservative.
+         */
+        if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
+                clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
+}
+
+static bool
+scoreboard_block_update(bi_block *blk)
+{
+        bool progress = false;
+
+        /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
+        bi_foreach_predecessor(blk, pred) {
+                for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
+                        blk->scoreboard_in.read[i] |= pred->scoreboard_out.read[i];
+                        blk->scoreboard_in.write[i] |= pred->scoreboard_out.write[i];
+                }
+        }
+
+        struct bi_scoreboard_state state = blk->scoreboard_in;
+
+        /* Assign locally */
+
+        bi_foreach_clause_in_block(blk, clause) {
+                bi_set_dependencies(blk, clause, &state);
+                bi_push_clause(&state, clause);
+        }
+
+        /* To figure out progress, diff scoreboard_out */
+
+        for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
+                progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
+
+        blk->scoreboard_out = state;
+
+        return progress;
+}
+
 void
 bi_assign_scoreboard(bi_context *ctx)
 {
-        struct bi_scoreboard_state st = {};
-
-        /* Assign slots */
+        /* First, assign slots. */
         bi_foreach_block(ctx, block) {
                 bi_foreach_clause_in_block(block, clause) {
-                        unsigned slot = bi_choose_scoreboard_slot(&st, clause->message);
-                        clause->scoreboard_id = slot;
-
-                        bi_clause *next = bi_next_clause(ctx, block, clause);
-                        if (next)
-                                next->dependencies |= (1 << slot);
+                        if (clause->message) {
+                                unsigned slot = bi_choose_scoreboard_slot(clause->message);
+                                clause->scoreboard_id = slot;
+                        }
                 }
         }
+
+        /* Next, perform forward data flow analysis to calculate dependencies */
+        /* Set of bi_block */
+        struct set *work_list = _mesa_set_create(NULL,
+                        _mesa_hash_pointer,
+                        _mesa_key_pointer_equal);
+
+        struct set *visited = _mesa_set_create(NULL,
+                        _mesa_hash_pointer,
+                        _mesa_key_pointer_equal);
+
+        /* Initialize the work list with the first block */
+        struct set_entry *cur;
+
+        cur = _mesa_set_add(work_list, bi_start_block(&ctx->blocks));
+
+        /* Iterate the work list */
+        do {
+                bi_block *blk = (struct bi_block *) cur->key;
+                _mesa_set_remove(work_list, cur);
+
+                bool progress = scoreboard_block_update(blk);
+
+                if (progress || !_mesa_set_search(visited, blk)) {
+                        bi_foreach_successor(blk, pred)
+                                _mesa_set_add(work_list, pred);
+                }
+
+                _mesa_set_add(visited, blk);
+        } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
+
+        _mesa_set_destroy(visited, NULL);
+        _mesa_set_destroy(work_list, NULL);
 }



More information about the mesa-commit mailing list