Mesa (master): lima/ppir: rework select conditions

Sat May 9 13:52:02 UTC 2020

Module: Mesa
Branch: master
Commit: e622e010fd838eb30eab46800015516703b76f4d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e622e010fd838eb30eab46800015516703b76f4d

Author: Erico Nunes <nunes.erico at gmail.com>
Date:   Tue Apr 14 02:54:56 2020 +0200

lima/ppir: rework select conditions

This is yet another simple optimization that attempts to save the
insertion of an unnecessary mov for a large number of cases.
If the node outputting the condition for select satisfies a few
requirements (which are common in the case of comparison conditions),
it can just be changed to pipeline output and used directly.
In case of difficult corner cases, just fall back to the mov as before.

The sel_cond op is removed as the scheduler can be smart enough to place
nodes that output to ^fmul in the ALU_SCL_MUL slot, and as there can be
alu ops other than just mov.

Signed-off-by: Erico Nunes <nunes.erico at gmail.com>
Reviewed-by: Vasily Khoruzhick <anarsoul at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4632>

---

 src/gallium/drivers/lima/ir/pp/codegen.c |  3 --
 src/gallium/drivers/lima/ir/pp/instr.c   | 11 +++++-
 src/gallium/drivers/lima/ir/pp/lower.c   | 59 ++++++++++++++++++++++++++------
 src/gallium/drivers/lima/ir/pp/node.c    |  8 -----
 src/gallium/drivers/lima/ir/pp/ppir.h    | 14 ++++++--
 5 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/lima/ir/pp/codegen.c b/src/gallium/drivers/lima/ir/pp/codegen.c
index 4420ee30e05..55d3489c51c 100644
--- a/src/gallium/drivers/lima/ir/pp/codegen.c
+++ b/src/gallium/drivers/lima/ir/pp/codegen.c
@@ -276,9 +276,6 @@ static void ppir_codegen_encode_scl_mul(ppir_node *node, void *code)
    case ppir_op_mov:
       f->op = ppir_codegen_float_mul_op_mov;
       break;
-   case ppir_op_sel_cond:
-      f->op = ppir_codegen_float_mul_op_mov;
-      break;
    case ppir_op_max:
       f->op = ppir_codegen_float_mul_op_max;
       break;
diff --git a/src/gallium/drivers/lima/ir/pp/instr.c b/src/gallium/drivers/lima/ir/pp/instr.c
index 74b6e5fbaea..8e1bc95158d 100644
--- a/src/gallium/drivers/lima/ir/pp/instr.c
+++ b/src/gallium/drivers/lima/ir/pp/instr.c
@@ -221,10 +221,19 @@ bool ppir_instr_insert_node(ppir_instr *instr, ppir_node *node)
                continue;
          }
 
+         /* ^fmul dests (e.g. condition for select) can only be
+          * scheduled to ALU_SCL_MUL */
+         if (pos == PPIR_INSTR_SLOT_ALU_SCL_ADD) {
+            ppir_dest *dest = ppir_node_get_dest(node);
+            if (dest && dest->type == ppir_target_pipeline &&
+                dest->pipeline == ppir_pipeline_reg_fmul)
+            continue;
+         }
+
          if (pos == PPIR_INSTR_SLOT_ALU_SCL_MUL ||
              pos == PPIR_INSTR_SLOT_ALU_SCL_ADD) {
             ppir_dest *dest = ppir_node_get_dest(node);
-            if (!ppir_target_is_scaler(dest))
+            if (!ppir_target_is_scalar(dest))
                continue;
          }
 
diff --git a/src/gallium/drivers/lima/ir/pp/lower.c b/src/gallium/drivers/lima/ir/pp/lower.c
index b5b7c34c25f..6a088c3328d 100644
--- a/src/gallium/drivers/lima/ir/pp/lower.c
+++ b/src/gallium/drivers/lima/ir/pp/lower.c
@@ -212,23 +212,56 @@ static bool ppir_lower_texture(ppir_block *block, ppir_node *node)
    return true;
 }
 
-/* insert a move as the select condition to make sure it can
- * be inserted to select instr float mul slot
- */
+/* Check if the select condition and ensure it can be inserted to
+ * the scalar mul slot */
 static bool ppir_lower_select(ppir_block *block, ppir_node *node)
 {
    ppir_alu_node *alu = ppir_node_to_alu(node);
+   ppir_src *src0 = &alu->src[0];
+   ppir_src *src1 = &alu->src[1];
+   ppir_src *src2 = &alu->src[2];
+
+   /* If the condition is already an alu scalar whose only successor
+    * is the select node, just turn it into pipeline output. */
+   /* The (src2->node == cond) case is a tricky exception.
+    * The reason is that we must force cond to output to ^fmul -- but
+    * then it no longer writes to a register and it is impossible to
+    * reference ^fmul in src2. So in that exceptional case, also fall
+    * back to the mov. */
+   ppir_node *cond = src0->node;
+   if (cond &&
+       cond->type == ppir_node_type_alu &&
+       ppir_node_has_single_succ(cond) &&
+       ppir_target_is_scalar(ppir_node_get_dest(cond)) &&
+       ppir_node_schedulable_slot(cond, PPIR_INSTR_SLOT_ALU_SCL_MUL) &&
+       src2->node != cond) {
+
+      ppir_dest *cond_dest = ppir_node_get_dest(cond);
+      cond_dest->type = ppir_target_pipeline;
+      cond_dest->pipeline = ppir_pipeline_reg_fmul;
+
+      ppir_node_target_assign(src0, cond);
+
+      /* src1 could also be a reference from the same node as
+       * the condition, so update it in that case. */
+      if (src1->node && src1->node == cond)
+         ppir_node_target_assign(src1, cond);
 
-   ppir_node *move = ppir_node_create(block, ppir_op_sel_cond, -1, 0);
+      return true;
+   }
+
+   /* If the condition can't be used for any reason, insert a mov
+    * so that the condition can end up in ^fmul */
+   ppir_node *move = ppir_node_create(block, ppir_op_mov, -1, 0);
    if (!move)
       return false;
    list_addtail(&move->list, &node->list);
 
    ppir_alu_node *move_alu = ppir_node_to_alu(move);
-   ppir_src *move_src = move_alu->src, *src = alu->src;
-   move_src->type = src->type;
-   move_src->ssa = src->ssa;
-   move_src->swizzle[0] = src->swizzle[0];
+   ppir_src *move_src = move_alu->src;
+   move_src->type = src0->type;
+   move_src->ssa = src0->ssa;
+   move_src->swizzle[0] = src0->swizzle[0];
    move_alu->num_src = 1;
 
    ppir_dest *move_dest = &move_alu->dest;
@@ -236,7 +269,7 @@ static bool ppir_lower_select(ppir_block *block, ppir_node *node)
    move_dest->pipeline = ppir_pipeline_reg_fmul;
    move_dest->write_mask = 1;
 
-   ppir_node *pred = alu->src[0].node;
+   ppir_node *pred = src0->node;
    ppir_dep *dep = ppir_dep_for_pred(node, pred);
    if (dep)
       ppir_node_replace_pred(dep, move);
@@ -247,8 +280,12 @@ static bool ppir_lower_select(ppir_block *block, ppir_node *node)
    if (pred)
       ppir_node_add_dep(move, pred, ppir_dep_src);
 
-   src->swizzle[0] = 0;
-   ppir_node_target_assign(alu->src, move);
+   ppir_node_target_assign(src0, move);
+
+   /* src1 could also be a reference from the same node as
+    * the condition, so update it in that case. */
+   if (src1->node && src1->node == pred)
+      ppir_node_target_assign(src1, move);
 
    return true;
 }
diff --git a/src/gallium/drivers/lima/ir/pp/node.c b/src/gallium/drivers/lima/ir/pp/node.c
index b71cbc3763e..d0e97616f3e 100644
--- a/src/gallium/drivers/lima/ir/pp/node.c
+++ b/src/gallium/drivers/lima/ir/pp/node.c
@@ -225,14 +225,6 @@ const ppir_op_info ppir_op_infos[] = {
          PPIR_INSTR_SLOT_END
       },
    },
-   [ppir_op_sel_cond] = {
-      /* effectively mov, but must be scheduled only to
-       * PPIR_INSTR_SLOT_ALU_SCL_MUL */
-      .name = "sel_cond",
-      .slots = (int []) {
-         PPIR_INSTR_SLOT_ALU_SCL_MUL, PPIR_INSTR_SLOT_END
-      },
-   },
    [ppir_op_select] = {
       .name = "select",
       .slots = (int []) {
diff --git a/src/gallium/drivers/lima/ir/pp/ppir.h b/src/gallium/drivers/lima/ir/pp/ppir.h
index 59887a70ec9..821107302d7 100644
--- a/src/gallium/drivers/lima/ir/pp/ppir.h
+++ b/src/gallium/drivers/lima/ir/pp/ppir.h
@@ -54,7 +54,6 @@ typedef enum {
    ppir_op_normalize3,
    ppir_op_normalize4,
 
-   ppir_op_sel_cond,
    ppir_op_select,
 
    ppir_op_sin,
@@ -630,7 +629,7 @@ static inline int ppir_src_get_mask(ppir_src *src)
    return mask;
 }
 
-static inline bool ppir_target_is_scaler(ppir_dest *dest)
+static inline bool ppir_target_is_scalar(ppir_dest *dest)
 {
    switch (dest->type) {
    case ppir_target_ssa:
@@ -656,6 +655,17 @@ static inline bool ppir_target_is_scaler(ppir_dest *dest)
    }
 }
 
+static inline bool ppir_node_schedulable_slot(ppir_node *node,
+                                              enum ppir_instr_slot slot)
+{
+   int *slots = ppir_op_infos[node->op].slots;
+   for (int i = 0; slots[i] != PPIR_INSTR_SLOT_END; i++)
+      if (slots[i] == slot)
+         return true;
+
+   return false;
+}
+
 ppir_instr *ppir_instr_create(ppir_block *block);
 bool ppir_instr_insert_node(ppir_instr *instr, ppir_node *node);
 void ppir_instr_add_dep(ppir_instr *succ, ppir_instr *pred);