Mesa (master): vc4: Fix GPU hangs with >16 varying values.

Wed Aug 24 17:48:07 UTC 2016

Module: Mesa
Branch: master
Commit: 87a88f2daabfe14b12d447b3d96b9f8938c5cf03
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=87a88f2daabfe14b12d447b3d96b9f8938c5cf03

Author: Eric Anholt <eric at anholt.net>
Date:   Mon Aug 22 14:58:28 2016 -0700

vc4: Fix GPU hangs with >16 varying values.

Fixes glsl-routing in piglit and hangs in glbenchmark 2.0.2.

---

 src/gallium/drivers/vc4/vc4_qir.h      | 12 ++++++
 src/gallium/drivers/vc4/vc4_qpu_emit.c | 75 +++++++++++++++++++++++++---------
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 12b081b..6b48959 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -494,6 +494,18 @@ struct vc4_compile {
         uint32_t qpu_inst_size;
         uint32_t num_inputs;
 
+        /**
+         * Number of inputs from num_inputs remaining to be queued to the read
+         * FIFO in the VS/CS.
+         */
+        uint32_t num_inputs_remaining;
+
+        /** Inputs from the last VPM read setup not yet read by the VS/CS. */
+        uint32_t num_inputs_in_fifo;
+
+        /** Next offset in the VPM to read from in the VS/CS */
+        uint32_t vpm_read_offset;
+
         uint32_t program_id;
         uint32_t variant_id;
 };
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 6a10e1b..8249929 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -97,6 +97,60 @@ swap_file(struct qpu_reg *src)
 }
 
 /**
+ * Sets up the VPM read FIFO before we do any VPM read.
+ *
+ * VPM reads (vertex attribute input) and VPM writes (varyings output) from
+ * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
+ * VPM block.  In the VS/CS (unlike in the FS), the block starts out
+ * uninitialized, and you need to emit setup to the block before any VPM
+ * reads/writes.
+ *
+ * VRI has a FIFO in each direction, with each FIFO able to hold four
+ * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
+ * writes go through the write FIFO.  The read/write setup values from QPU go
+ * through the write FIFO as well, with a sideband signal indicating that
+ * they're setup values.  Once a read setup reaches the other side of the
+ * FIFO, the VPM block will start asynchronously reading vertex attributes and
+ * filling the read FIFO -- that way hopefully the QPU doesn't have to block
+ * on reads later.
+ *
+ * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
+ * time, which is 4 vec4s.  If more than that is being read (since we support
+ * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
+ *
+ * The existence of the FIFO makes it seem like you should be able to emit
+ * both setups for the 5-8 attribute cases and then do all the attribute
+ * reads.  However, once the setup value makes it to the other end of the
+ * write FIFO, it will immediately update the VPM block's setup register.
+ * That updated setup register would be used for read FIFO fills from then on,
+ * breaking whatever remaining VPM values were supposed to be read into the
+ * read FIFO from the previous attribute set.
+ *
+ * As a result, we need to emit the read setup, pull every VPM read value from
+ * that setup, and only then emit the second setup if applicable.
+ */
+static void
+setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
+{
+        if (c->num_inputs_in_fifo) {
+                c->num_inputs_in_fifo--; /* covered by the last setup */
+                return;
+        }
+        /* Nothing pending from an earlier setup: start a new batch (<= 16). */
+        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
+        /* 16 & 0xf == 0; presumably the HW reads a 0 count as 16 -- confirm. */
+        queue(block,
+              qpu_load_imm_ui(qpu_vrsetup(),
+                              c->vpm_read_offset |
+                              0x00001a00 |
+                              ((c->num_inputs_in_fifo & 0xf) << 20)));
+        c->num_inputs_remaining -= c->num_inputs_in_fifo;
+        c->vpm_read_offset += c->num_inputs_in_fifo;
+        /* Count this read's own value; NOTE(review): wraps if batch was 0. */
+        c->num_inputs_in_fifo--;
+}
+
+/**
  * This is used to resolve the fact that we might register-allocate two
  * different operands of an instruction to the same physical register file
  * even though instructions have only one field for the register file source
@@ -268,6 +322,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                                 assert(src[i].addr <= 47);
                                 break;
                         case QFILE_VPM:
+                                setup_for_vpm_read(c, block);
                                 assert((int)qinst->src[i].index >=
                                        last_vpm_read_index);
                                 (void)last_vpm_read_index;
@@ -484,31 +539,13 @@ void
 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 {
         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
-        uint32_t inputs_remaining = c->num_inputs;
-        uint32_t vpm_read_fifo_count = 0;
-        uint32_t vpm_read_offset = 0;
         struct qblock *start_block = list_first_entry(&c->blocks,
                                                       struct qblock, link);
 
         switch (c->stage) {
         case QSTAGE_VERT:
         case QSTAGE_COORD:
-                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
-                 * load up to 16 dwords (4 vec4s) per vertex.
-                 */
-                while (inputs_remaining) {
-                        uint32_t num_entries = MIN2(inputs_remaining, 16);
-                        queue(start_block,
-                              qpu_load_imm_ui(qpu_vrsetup(),
-                                              vpm_read_offset |
-                                              0x00001a00 |
-                                              ((num_entries & 0xf) << 20)));
-                        inputs_remaining -= num_entries;
-                        vpm_read_offset += num_entries;
-                        vpm_read_fifo_count++;
-                }
-                assert(vpm_read_fifo_count <= 4);
-
+                c->num_inputs_remaining = c->num_inputs;
                 queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                 break;
         case QSTAGE_FRAG: