Mesa (master): lima/gp: Fix problem with complex moves

Thu Jul 18 13:11:07 UTC 2019

Module: Mesa
Branch: master
Commit: b178fdf486f3496270dd4ec12d9bc596d11433ad
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b178fdf486f3496270dd4ec12d9bc596d11433ad

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Sat May 11 18:43:30 2019 +0200

lima/gp: Fix problem with complex moves

When writing the scheduler, we forgot that you can't read the complex
unit in certain sources because it gets overwritten to 0 or 1. Fixing
this turned out to be possible without giving up and reducing
GPIR_VALUE_REG_NUM to 10, although it was difficult in a way I didn't
expect. There can be at most 4 next-max nodes that can't have moves
scheduled in the complex slot, so it actually isn't a problem for
getting the number of next-max nodes at 5 or lower. However, it is a
problem for stores. If a given node is a next-max node whose move cannot
go in the complex slot *and* is used by a store that we decide to
schedule, we have to reserve one of the non-complex slots for a move
instead of all the slots, or we can wind up in a situation where only
the complex slot is free and we fail the move. This means that we have
to add another term to the reservation logic, for stores whose children
cannot be in the complex slot.

Acked-by: Qiang Yu <yuq825 at gmail.com>

---

 src/gallium/drivers/lima/ir/gp/gpir.h      | 15 ++++--
 src/gallium/drivers/lima/ir/gp/instr.c     | 39 ++++++++++++++-
 src/gallium/drivers/lima/ir/gp/scheduler.c | 80 ++++++++++++++++++++++++++++--
 3 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index 6a1d533977b..de571c3c1c4 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -174,6 +174,7 @@ typedef struct gpir_node {
          bool ready;
          bool inserted;
          bool max_node, next_max_node;
+         bool complex_allowed;
       } sched;
       struct {
          int parent_index;
@@ -288,16 +289,22 @@ typedef struct gpir_instr {
     * (3) There is a store instruction scheduled, but not its child.
     *
     * The complex slot cannot be used for a move in case (1), since it only
-    * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
-    * ensure that we have enough space for all three, we maintain the
-    * following invariants:
+    * has a FIFO depth of 1, but it can be used for (2) as well as (3) as long
+    * as the uses aren't in certain slots. It turns out that we don't have to
+    * worry about nodes that can't use the complex slot for (2), since there
+    * are at most 4 uses 1 cycle ago that can't use the complex slot, but we
+    * do have to worry about (3). This means tracking stores whose children
+    * cannot be in the complex slot. In order to ensure that we have enough
+    * space for all three, we maintain the following invariants:
     *
     * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
     *       alu_num_slot_needed_by_max +
     *       alu_num_slot_needed_by_next_max
-    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
+    *       alu_num_slot_neede_by_non_cplx_store
     */
    int alu_num_slot_needed_by_store;
+   int alu_num_slot_needed_by_non_cplx_store;
    int alu_num_slot_needed_by_max;
    int alu_num_slot_needed_by_next_max;
 
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index 228d1459280..e07a2c9b7c2 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -85,10 +85,15 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    if (!gpir_instr_check_acc_same_op(instr, node, node->sched.pos))
       return false;
 
+   if (node->sched.next_max_node && !node->sched.complex_allowed &&
+       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
+      return false;
+
    int consume_slot = gpir_instr_get_consume_slot(instr, node);
    int non_cplx_consume_slot =
       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
    int store_reduce_slot = 0;
+   int non_cplx_store_reduce_slot = 0;
    int max_reduce_slot = node->sched.max_node ? 1 : 0;
    int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
 
@@ -100,6 +105,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          store_reduce_slot = 1;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            non_cplx_store_reduce_slot = 1;
          break;
       }
    }
@@ -118,7 +125,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    }
 
    int non_cplx_slot_difference =
-       instr->alu_num_slot_needed_by_max - max_reduce_slot -
+       instr->alu_num_slot_needed_by_max - max_reduce_slot +
+       instr->alu_num_slot_needed_by_non_cplx_store - non_cplx_store_reduce_slot -
        (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
    if (non_cplx_slot_difference > 0) {
       gpir_debug("failed %d because of alu slot\n", node->index);
@@ -131,6 +139,7 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    instr->alu_num_slot_free -= consume_slot;
    instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
    instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+   instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
    instr->alu_num_slot_needed_by_max -= max_reduce_slot;
    instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
    return true;
@@ -144,6 +153,8 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          instr->alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            instr->alu_num_slot_needed_by_non_cplx_store++;
          break;
       }
    }
@@ -296,7 +307,7 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
    }
 
    /* Check the invariants documented in gpir.h, similar to the ALU case.
-    * Since the only thing that changes is alu_num_slot_needed_by_store, we
+    * When the only thing that changes is alu_num_slot_needed_by_store, we
     * can get away with just checking the first one.
     */
    int slot_difference = instr->alu_num_slot_needed_by_store + 1
@@ -308,6 +319,25 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
       return false;
    }
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      /* The child of the store is already partially ready, and has a use one
+       * cycle ago that disqualifies it (or a move replacing it) from being
+       * put in the complex slot. Therefore we have to check the non-complex
+       * invariant.
+       */
+      int non_cplx_slot_difference =
+          instr->alu_num_slot_needed_by_max +
+          instr->alu_num_slot_needed_by_non_cplx_store + 1 -
+          instr->alu_non_cplx_slot_free;
+      if (non_cplx_slot_difference > 0) {
+         instr->non_cplx_slot_difference = non_cplx_slot_difference;
+         return false;
+      }
+
+      instr->alu_num_slot_needed_by_non_cplx_store++;
+   }
+
    instr->alu_num_slot_needed_by_store++;
 
 out:
@@ -346,6 +376,11 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
 
    instr->alu_num_slot_needed_by_store--;
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      instr->alu_num_slot_needed_by_non_cplx_store--;
+   }
+
 out:
    if (!instr->slots[other_slot])
       instr->store_content[component >> 1] = GPIR_INSTR_STORE_NONE;
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 60076daa3c4..7149e1dd213 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -640,6 +640,7 @@ static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
    move->node.sched.dist = node->sched.dist;
    move->node.sched.max_node = node->sched.max_node;
    move->node.sched.next_max_node = node->sched.next_max_node;
+   move->node.sched.complex_allowed = node->sched.complex_allowed;
 
    gpir_debug("create move %d for %d\n", move->node.index, node->index);
 
@@ -974,6 +975,7 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
       store->child = node;
       store->node.sched.max_node = false;
       store->node.sched.next_max_node = false;
+      store->node.sched.complex_allowed = false;
       store->node.sched.pos = -1;
       store->node.sched.instr = NULL;
       store->node.sched.inserted = false;
@@ -1086,6 +1088,44 @@ static int gpir_get_min_end_as_move(gpir_node *node)
    return min;
 }
 
+/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
+ * The hardware overwrites the add second sources with 0 and mul second
+ * sources with 1. This can be a problem if we need to insert more next-max
+ * moves but we only have values that can't use the complex unit for moves.
+ *
+ * Fortunately, we only need to insert a next-max move if there are more than
+ * 5 next-max nodes, but there are only 4 sources in the previous instruction
+ * that make values not complex-capable, which means there can be at most 4
+ * non-complex-capable values. Hence there will always be at least two values
+ * that can be rewritten to use a move in the complex slot. However, we have
+ * to be careful not to waste those values by putting both of them in a
+ * non-complex slot. This is handled for us by gpir_instr, which will reject
+ * such instructions. We just need to tell it which nodes can use complex, and
+ * it will do the accounting to figure out what is safe.
+ */
+
+static bool can_use_complex(gpir_node *node)
+{
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *succ = dep->succ;
+      if (succ->type != gpir_node_type_alu)
+         continue;
+
+      gpir_alu_node *alu = gpir_node_to_alu(succ);
+      if (alu->num_child >= 2 && alu->children[1] == node)
+         return false;
+
+      /* complex1 puts its third source in the fourth slot */
+      if (alu->node.op == gpir_op_complex1 && alu->children[2] == node)
+         return false;
+   }
+
+   return true;
+}
+
 /* Initialize node->sched.max_node and node->sched.next_max_node for every
  * input node on the ready list. We should only need to do this once per
  * instruction, at the beginning, since we never add max nodes to the ready
@@ -1104,6 +1144,8 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       int min_end_move = gpir_get_min_end_as_move(node);
       node->sched.max_node = (min_end_move == ctx->instr->index);
       node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
+      if (node->sched.next_max_node)
+         node->sched.complex_allowed = can_use_complex(node);
 
       if (node->sched.max_node)
          ctx->instr->alu_num_slot_needed_by_max++;
@@ -1120,6 +1162,7 @@ static void verify_max_nodes(sched_ctx *ctx)
    int alu_num_slot_needed_by_max = 0;
    int alu_num_slot_needed_by_next_max = -5;
    int alu_num_slot_needed_by_store = 0;
+   int alu_num_slot_needed_by_non_cplx_store = 0;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
       if (!gpir_is_input_node(node))
@@ -1129,15 +1172,20 @@ static void verify_max_nodes(sched_ctx *ctx)
          alu_num_slot_needed_by_max++;
       if (node->sched.next_max_node)
          alu_num_slot_needed_by_next_max++;
-      if (used_by_store(node, ctx->instr))
+      if (used_by_store(node, ctx->instr)) {
          alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            alu_num_slot_needed_by_non_cplx_store++;
+      }
    }
 
    assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
    assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
    assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
-   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
-   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
+          alu_num_slot_needed_by_non_cplx_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
 }
 
 static bool try_node(sched_ctx *ctx)
@@ -1207,6 +1255,22 @@ static void place_move(sched_ctx *ctx, gpir_node *node)
    assert(score != INT_MIN);
 }
 
+/* For next-max nodes, not every node can be offloaded to a move in the
+ * complex slot. If we run out of non-complex slots, then such nodes cannot
+ * have moves placed for them. There should always be sufficient
+ * complex-capable nodes so that this isn't a problem.
+ */
+static bool can_place_move(sched_ctx *ctx, gpir_node *node)
+{
+   if (!node->sched.next_max_node)
+      return true;
+
+   if (node->sched.complex_allowed)
+      return true;
+
+   return ctx->instr->alu_non_cplx_slot_free > 0;
+}
+
 static bool sched_move(sched_ctx *ctx)
 {
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
@@ -1250,6 +1314,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
              node->sched.ready) {
             bool skip = true;
@@ -1284,6 +1351,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node &&
              !(node->op == gpir_op_complex1 && node->sched.ready)) {
             place_move(ctx, node);
@@ -1298,6 +1368,9 @@ static bool sched_move(sched_ctx *ctx)
 
    if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node) {
             place_move(ctx, node);
             return true;
@@ -1546,6 +1619,7 @@ bool gpir_schedule_prog(gpir_compiler *comp)
          node->sched.physreg_store = NULL;
          node->sched.ready = false;
          node->sched.inserted = false;
+         node->sched.complex_allowed = false;
          node->sched.max_node = false;
          node->sched.next_max_node = false;
       }