[Mesa-dev] [PATCH] squash: Fix up VPM read optimization.

Thu Mar 10 20:21:52 UTC 2016

- There's no reason there would be only 64 operations that read from the
  output of a mov from VPM, so we might smash the stack (fixes etqw trace)

- Fixes segfault where we assumed that a single-use temp had a def (fixes
  2 piglit tests)

- We need to only mark progress when we actually did the optimization, or
  we'll infinite loop (0ad trace).

- Misc style fixes.

- No reordering sampler instructions (fixes a glean test)

shader-db results:
total instructions in shared programs: 78513 -> 78071 (-0.56%)
instructions in affected programs:     10406 -> 9964 (-4.25%)
total estimated cycles in shared programs: 234674 -> 234274 (-0.17%)
estimated cycles in affected programs:     35188 -> 34788 (-1.14%)
---

Varad, here's what I came up with trying to test your patch.  If these
changes look good to you, I can squash them in and push.

 src/gallium/drivers/vc4/vc4_opt_vpm.c | 45 +++++++++++++++++------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c
index 277b345..a4ee6af 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c
@@ -40,10 +40,8 @@ qir_opt_vpm(struct vc4_compile *c)
 
         bool progress = false;
         struct qinst *vpm_writes[64] = { 0 };
-        struct qinst *vpm_reads[64] = { 0 };
         uint32_t use_count[c->num_temps];
         uint32_t vpm_write_count = 0;
-        uint32_t vpm_read_count = 0;
         memset(&use_count, 0, sizeof(use_count));
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
@@ -59,24 +57,14 @@ qir_opt_vpm(struct vc4_compile *c)
                         if (inst->src[i].file == QFILE_TEMP) {
                                 uint32_t temp = inst->src[i].index;
                                 use_count[temp]++;
-
-                                struct qinst *mov = c->defs[temp];
-                                if (!mov ||
-                                    (mov->op != QOP_MOV &&
-                                    mov->op != QOP_FMOV &&
-                                    mov->op != QOP_MMOV)) {
-                                        continue;
-                                }
-
-                                if (mov->src[0].file == QFILE_VPM)
-                                        vpm_reads[vpm_read_count++] = inst;
                         }
                 }
         }
 
-        for (int i = 0; i < vpm_read_count; i++) {
-                struct qinst *inst = vpm_reads[i];
-
+        /* For instructions reading from a temporary that contains a VPM read
+         * result, try to move the instruction up in place of the VPM read.
+         */
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (!inst || qir_is_multi_instruction(inst))
                         continue;
 
@@ -84,21 +72,32 @@ qir_opt_vpm(struct vc4_compile *c)
                         continue;
 
                 if (qir_has_side_effects(c, inst) ||
-                    qir_has_side_effect_reads(c, inst))
+                    qir_has_side_effect_reads(c, inst) ||
+                    qir_is_tex(inst))
                         continue;
 
                 for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {
-                        if(inst->src[j].file != QFILE_TEMP)
+                        if (inst->src[j].file != QFILE_TEMP)
                                 continue;
 
                         uint32_t temp = inst->src[j].index;
+
+                        /* Since VPM reads pull from a FIFO, we only get to
+                         * read each VPM entry once (unless we reset the read
+                         * pointer).  That means we can't copy-propagate a VPM
+                         * read to multiple locations.
+                         */
                         if (use_count[temp] != 1)
                                 continue;
 
                         struct qinst *mov = c->defs[temp];
-
-                        if (mov->src[0].file != QFILE_VPM)
+                        if (!mov ||
+                            (mov->op != QOP_MOV &&
+                             mov->op != QOP_FMOV &&
+                             mov->op != QOP_MMOV) ||
+                            mov->src[0].file != QFILE_VPM) {
                                 continue;
+                        }
 
                         uint32_t temps = 0;
                         for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) {
@@ -109,15 +108,15 @@ qir_opt_vpm(struct vc4_compile *c)
                         /* The instruction is safe to reorder if its other
                          * sources are independent of previous instructions
                          */
-                        if (temps == 1 ) {
+                        if (temps == 1) {
                                 list_del(&inst->link);
                                 inst->src[j] = mov->src[0];
                                 list_replace(&mov->link, &inst->link);
                                 c->defs[temp] = NULL;
                                 free(mov);
+                                progress = true;
+                                break;
                         }
-
-                        progress = true;
                 }
         }
 
-- 
2.7.0