Mesa (main): pan/bi: Calculate dependency graph when bundling

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Jul 12 23:44:38 UTC 2021


Module: Mesa
Branch: main
Commit: 599662205020df1bc919d48a9809251dbd366714
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=599662205020df1bc919d48a9809251dbd366714

Author: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
Date:   Wed Jan 20 21:02:12 2021 -0500

pan/bi: Calculate dependency graph when bundling

Code is ported from Midgard, modified to be scalar, post-RA, and to put
the arrays on the worklist instead of the instruction to save memory.
This enables out-of-order scheduling.

total tuples in shared programs: 128691 -> 125304 (-2.63%)
tuples in affected programs: 114091 -> 110704 (-2.97%)
helped: 844
HURT: 377
helped stats (abs) min: 1.0 max: 150.0 x̄: 4.88 x̃: 3
helped stats (rel) min: 0.30% max: 26.42% x̄: 5.56% x̃: 4.35%
HURT stats (abs)   min: 1.0 max: 8.0 x̄: 1.94 x̃: 1
HURT stats (rel)   min: 0.20% max: 33.33% x̄: 6.84% x̃: 3.23%
95% mean confidence interval for tuples value: -3.16 -2.38
95% mean confidence interval for tuples %-change: -2.19% -1.27%
Tuples are helped.

total clauses in shared programs: 27579 -> 26059 (-5.51%)
clauses in affected programs: 20606 -> 19086 (-7.38%)
helped: 941
HURT: 39
helped stats (abs) min: 1.0 max: 21.0 x̄: 1.66 x̃: 1
helped stats (rel) min: 0.69% max: 44.44% x̄: 10.48% x̃: 9.09%
HURT stats (abs)   min: 1.0 max: 2.0 x̄: 1.15 x̃: 1
HURT stats (rel)   min: 1.89% max: 10.00% x̄: 4.73% x̃: 4.55%
95% mean confidence interval for clauses value: -1.63 -1.47
95% mean confidence interval for clauses %-change: -10.27% -9.48%
Clauses are helped.

total cycles in shared programs: 12262.54 -> 12154.79 (-0.88%)
cycles in affected programs: 2210.54 -> 2102.79 (-4.87%)
helped: 374
HURT: 56
helped stats (abs) min: 0.041665999999999315 max: 6.25 x̄: 0.30 x̃: 0
helped stats (rel) min: 0.42% max: 26.00% x̄: 6.90% x̃: 7.14%
HURT stats (abs)   min: 0.041665999999999315 max: 0.5833319999999986 x̄: 0.11 x̃: 0
HURT stats (rel)   min: 0.16% max: 100.00% x̄: 55.17% x̃: 50.00%
95% mean confidence interval for cycles value: -0.29 -0.21
95% mean confidence interval for cycles %-change: -1.37% 3.73%
Inconclusive result (%-change mean confidence interval includes 0).

total arith in shared programs: 4852.29 -> 4658.13 (-4.00%)
arith in affected programs: 4525.17 -> 4331 (-4.29%)
helped: 1112
HURT: 166
helped stats (abs) min: 0.041665999999999315 max: 6.25 x̄: 0.19 x̃: 0
helped stats (rel) min: 0.42% max: 33.33% x̄: 6.59% x̃: 5.36%
HURT stats (abs)   min: 0.041665999999999315 max: 0.5833319999999986 x̄: 0.07 x̃: 0
HURT stats (rel)   min: 0.16% max: 100.00% x̄: 25.05% x̃: 2.40%
95% mean confidence interval for arith value: -0.17 -0.14
95% mean confidence interval for arith %-change: -3.44% -1.51%
Arith are helped.

total quadwords in shared programs: 117141 -> 111394 (-4.91%)
quadwords in affected programs: 104390 -> 98643 (-5.51%)
helped: 1245
HURT: 76
helped stats (abs) min: 1.0 max: 69.0 x̄: 4.74 x̃: 4
helped stats (rel) min: 0.28% max: 35.00% x̄: 7.88% x̃: 6.45%
HURT stats (abs)   min: 1.0 max: 8.0 x̄: 2.01 x̃: 1
HURT stats (rel)   min: 0.20% max: 10.00% x̄: 3.52% x̃: 4.25%
95% mean confidence interval for quadwords value: -4.61 -4.09
95% mean confidence interval for quadwords %-change: -7.56% -6.88%
Quadwords are helped.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10961>

---

 src/panfrost/bifrost/bi_schedule.c     | 181 +++++++++++++++++++++++++++++++--
 src/panfrost/bifrost/bifrost.h         |   1 +
 src/panfrost/bifrost/bifrost_compile.c |   3 +-
 3 files changed, 177 insertions(+), 8 deletions(-)

diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index e4099174004..468a9868085 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -38,6 +38,13 @@ struct bi_worklist {
 
         /* Bitset of instructions in the block ready for scheduling */
         BITSET_WORD *worklist;
+
+        /* The backwards dependency graph. nr_dependencies is the number of
+         * unscheduled instructions that must still be scheduled after (before)
+         * this instruction. dependents are which instructions need to be
+         * scheduled before (after) this instruction. */
+        unsigned *dep_counts;
+        BITSET_WORD **dependents;
 };
 
 /* State of a single tuple and clause under construction */
@@ -149,6 +156,138 @@ bi_supports_dtsel(bi_instr *ins)
         }
 }
 
+/* Adds an edge to the dependency graph */
+
+static void
+bi_push_dependency(unsigned parent, unsigned child,
+                BITSET_WORD **dependents, unsigned *dep_counts)
+{
+        if (!BITSET_TEST(dependents[parent], child)) {
+                BITSET_SET(dependents[parent], child);
+                dep_counts[child]++;
+        }
+}
+
+static void
+add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
+                BITSET_WORD **dependents, unsigned *dep_counts)
+{
+        assert(index < 64);
+        util_dynarray_foreach(table + index, unsigned, parent)
+                bi_push_dependency(*parent, child, dependents, dep_counts);
+}
+
+static void
+mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
+{
+        assert(index < 64);
+        util_dynarray_append(&table[index], unsigned, parent);
+}
+
+static bool
+bi_is_sched_barrier(bi_instr *I)
+{
+        switch (I->op) {
+        case BI_OPCODE_BARRIER:
+        case BI_OPCODE_DISCARD_F32:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static void
+bi_create_dependency_graph(struct bi_worklist st, bool inorder)
+{
+        struct util_dynarray last_read[64], last_write[64];
+
+        for (unsigned i = 0; i < 64; ++i) {
+                util_dynarray_init(&last_read[i], NULL);
+                util_dynarray_init(&last_write[i], NULL);
+        }
+
+        /* Initialize dependency graph */
+        for (unsigned i = 0; i < st.count; ++i) {
+                st.dependents[i] =
+                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
+
+                st.dep_counts[i] = 0;
+        }
+
+        unsigned prev_msg = ~0;
+
+        /* Populate dependency graph */
+        for (signed i = st.count - 1; i >= 0; --i) {
+                bi_instr *ins = st.instructions[i];
+
+                bi_foreach_src(ins, s) {
+                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
+                        unsigned count = bi_count_read_registers(ins, s);
+
+                        for (unsigned c = 0; c < count; ++c)
+                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
+                }
+
+                /* Keep message-passing ops in order. (This pass only cares
+                 * about bundling; reordering of message-passing instructions
+                 * happens during earlier scheduling.) */
+
+                if (bi_message_type_for_instr(ins)) {
+                        if (prev_msg != ~0)
+                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
+
+                        prev_msg = i;
+                }
+
+                /* Handle schedule barriers, adding All the deps */
+                if (inorder || bi_is_sched_barrier(ins)) {
+                        for (unsigned j = 0; j < st.count; ++j) {
+                                if (i == j) continue;
+
+                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
+                                                st.dependents, st.dep_counts);
+                        }
+                }
+
+                bi_foreach_dest(ins, d) {
+                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
+                        unsigned dest = ins->dest[d].value;
+
+                        unsigned count = bi_count_write_registers(ins, d);
+
+                        for (unsigned c = 0; c < count; ++c) {
+                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
+                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
+                                mark_access(last_write, dest + c, i);
+                        }
+                }
+
+                bi_foreach_src(ins, s) {
+                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
+
+                        unsigned count = bi_count_read_registers(ins, s);
+
+                        for (unsigned c = 0; c < count; ++c) 
+                                mark_access(last_read, ins->src[s].value + c, i);
+                }
+        }
+
+        /* If there is a branch, all instructions depend on it, as interblock
+         * execution must be purely in-order */
+
+        bi_instr *last = st.instructions[st.count - 1];
+        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
+                for (signed i = st.count - 2; i >= 0; --i)
+                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
+        }
+
+        /* Free the intermediate structures */
+        for (unsigned i = 0; i < 64; ++i) {
+                util_dynarray_fini(&last_read[i]);
+                util_dynarray_fini(&last_write[i]);
+        }
+}
+
 /* Scheduler pseudoinstruction lowerings to enable instruction pairings.
  * Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
  */
@@ -275,14 +414,23 @@ bi_flatten_block(bi_block *block, unsigned *len)
  */
 
 static struct bi_worklist
-bi_initialize_worklist(bi_block *block)
+bi_initialize_worklist(bi_block *block, bool inorder)
 {
         struct bi_worklist st = { };
         st.instructions = bi_flatten_block(block, &st.count);
 
-        if (st.count) {
-                st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
-                BITSET_SET(st.worklist, st.count - 1);
+        if (!st.count)
+                return st;
+
+        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
+        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
+
+        bi_create_dependency_graph(st, inorder);
+        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
+
+        for (unsigned i = 0; i < st.count; ++i) {
+                if (st.dep_counts[i] == 0)
+                        BITSET_SET(st.worklist, i);
         }
 
         return st;
@@ -291,6 +439,8 @@ bi_initialize_worklist(bi_block *block)
 static void
 bi_free_worklist(struct bi_worklist st)
 {
+        free(st.dep_counts);
+        free(st.dependents);
         free(st.instructions);
         free(st.worklist);
 }
@@ -298,8 +448,24 @@ bi_free_worklist(struct bi_worklist st)
 static void
 bi_update_worklist(struct bi_worklist st, unsigned idx)
 {
-        if (idx >= 1)
-                BITSET_SET(st.worklist, idx - 1);
+        assert(st.dep_counts[idx] == 0);
+
+        if (!st.dependents[idx])
+                return;
+
+        /* Iterate each dependent to remove one dependency (`done`),
+         * adding dependents to the worklist where possible. */
+
+        unsigned i;
+        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
+                assert(st.dep_counts[i] != 0);
+                unsigned new_deps = --st.dep_counts[i];
+
+                if (new_deps == 0)
+                        BITSET_SET(st.worklist, i);
+        }
+
+        free(st.dependents[idx]);
 }
 
 /* To work out the back-to-back flag, we need to detect branches and
@@ -1568,7 +1734,8 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
         list_inithead(&block->clauses);
 
         /* Copy list to dynamic array */
-        struct bi_worklist st = bi_initialize_worklist(block);
+        struct bi_worklist st = bi_initialize_worklist(block,
+                        bifrost_debug & BIFROST_DBG_INORDER);
 
         if (!st.count) {
                 bi_free_worklist(st);
diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h
index be3a2d0cb33..436404f0be0 100644
--- a/src/panfrost/bifrost/bifrost.h
+++ b/src/panfrost/bifrost/bifrost.h
@@ -36,6 +36,7 @@
 #define BIFROST_DBG_VERBOSE     0x0008
 #define BIFROST_DBG_INTERNAL    0x0010
 #define BIFROST_DBG_NOSCHED     0x0020
+#define BIFROST_DBG_INORDER     0x0040
 
 extern int bifrost_debug;
 
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 3b577944a77..594d02d9b46 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -43,7 +43,8 @@ static const struct debug_named_value bifrost_debug_options[] = {
         {"shaderdb",  BIFROST_DBG_SHADERDB,	"Print statistics"},
         {"verbose",   BIFROST_DBG_VERBOSE,	"Disassemble verbosely"},
         {"internal",  BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
-        {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial scheduling"},
+        {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
+        {"inorder",   BIFROST_DBG_INORDER, 	"Force in-order bundling"},
         DEBUG_NAMED_VALUE_END
 };
 



More information about the mesa-commit mailing list