<div dir="ltr">On 10 March 2016 at 15:21, Eric Anholt <span dir="ltr"><<a href="mailto:eric@anholt.net" target="_blank">eric@anholt.net</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">- There's no reason there would be only 64 operations that read from the<br>
  output of a mov from VPM, so we might smash the stack (fixes etqw trace)<br>
<br>
- Fixes segfault where we assumed that a single-use temp had a def (fixes<br>
  2 piglit tests)<br>
<br>
- We need to only mark progress when we actually did the optimization, or<br>
  we'll infinite loop (0ad trace).<br>
<br>
- Misc style fixes.<br>
<br>
- No reordering sampler instructions (fixes a glean test)<br>
<br>
shader-db results:<br>
total instructions in shared programs: 78513 -> 78071 (-0.56%)<br>
instructions in affected programs:     10406 -> 9964 (-4.25%)<br>
total estimated cycles in shared programs: 234674 -> 234274 (-0.17%)<br>
estimated cycles in affected programs:     35188 -> 34788 (-1.14%)<br>
---<br>
<br>
Varad, here's what I came up with trying to test your patch.  If these<br>
changes look good to you, I can squash them in and push.<br>
<br>
 src/gallium/drivers/vc4/vc4_opt_vpm.c | 45 +++++++++++++++++------------------<br>
 1 file changed, 22 insertions(+), 23 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c<br>
index 277b345..a4ee6af 100644<br>
--- a/src/gallium/drivers/vc4/vc4_opt_vpm.c<br>
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c<br>
@@ -40,10 +40,8 @@ qir_opt_vpm(struct vc4_compile *c)<br>
<br>
         bool progress = false;<br>
         struct qinst *vpm_writes[64] = { 0 };<br>
-        struct qinst *vpm_reads[64] = { 0 };<br>
         uint32_t use_count[c->num_temps];<br>
         uint32_t vpm_write_count = 0;<br>
-        uint32_t vpm_read_count = 0;<br>
         memset(&use_count, 0, sizeof(use_count));<br>
<br>
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {<br>
@@ -59,24 +57,14 @@ qir_opt_vpm(struct vc4_compile *c)<br>
                         if (inst->src[i].file == QFILE_TEMP) {<br>
                                 uint32_t temp = inst->src[i].index;<br>
                                 use_count[temp]++;<br>
-<br>
-                                struct qinst *mov = c->defs[temp];<br>
-                                if (!mov ||<br>
-                                    (mov->op != QOP_MOV &&<br>
-                                    mov->op != QOP_FMOV &&<br>
-                                    mov->op != QOP_MMOV)) {<br>
-                                        continue;<br>
-                                }<br>
-<br>
-                                if (mov->src[0].file == QFILE_VPM)<br>
-                                        vpm_reads[vpm_read_count++] = inst;<br>
                         }<br>
                 }<br>
         }<br>
<br>
-        for (int i = 0; i < vpm_read_count; i++) {<br>
-                struct qinst *inst = vpm_reads[i];<br>
-<br>
+        /* For instructions reading from a temporary that contains a VPM read<br>
+         * result, try to move the instruction up in place of the VPM read.<br>
+         */<br>
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {<br>
                 if (!inst || qir_is_multi_instruction(inst))<br>
                         continue;<br>
<br>
@@ -84,21 +72,32 @@ qir_opt_vpm(struct vc4_compile *c)<br>
                         continue;<br>
<br>
                 if (qir_has_side_effects(c, inst) ||<br>
-                    qir_has_side_effect_reads(c, inst))<br>
+                    qir_has_side_effect_reads(c, inst) ||<br>
+                    qir_is_tex(inst))<br>
                         continue;<br>
<br>
                 for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {<br>
-                        if(inst->src[j].file != QFILE_TEMP)<br>
+                        if (inst->src[j].file != QFILE_TEMP)<br>
                                 continue;<br>
<br>
                         uint32_t temp = inst->src[j].index;<br>
+<br>
+                        /* Since VPM reads pull from a FIFO, we only get to<br>
+                         * read each VPM entry once (unless we reset the read<br>
+                         * pointer).  That means we can't copy-propagate a VPM<br>
+                         * read to multiple locations.<br>
+                         */<br>
                         if (use_count[temp] != 1)<br>
                                 continue;<br>
<br>
                         struct qinst *mov = c->defs[temp];<br>
-<br>
-                        if (mov->src[0].file != QFILE_VPM)<br>
+                        if (!mov ||<br>
+                            (mov->op != QOP_MOV &&<br>
+                             mov->op != QOP_FMOV &&<br>
+                             mov->op != QOP_MMOV) ||<br>
+                            mov->src[0].file != QFILE_VPM) {<br>
                                 continue;<br>
+                        }<br>
<br>
                         uint32_t temps = 0;<br>
                         for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) {<br>
@@ -109,15 +108,15 @@ qir_opt_vpm(struct vc4_compile *c)<br>
                         /* The instruction is safe to reorder if its other<br>
                          * sources are independent of previous instructions<br>
                          */<br>
-                        if (temps == 1 ) {<br>
+                        if (temps == 1) {<br>
                                 list_del(&inst->link);<br>
                                 inst->src[j] = mov->src[0];<br>
                                 list_replace(&mov->link, &inst->link);<br>
                                 c->defs[temp] = NULL;<br>
                                 free(mov);<br>
+                                progress = true;<br>
+                                break;<br>
                         }<br>
-<br>
-                        progress = true;<br>
                 }<br>
         }<br>
<span class=""><font color="#888888"><br>
--<br>
2.7.0<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div><div class="gmail_extra">Varad, Eric,<br></div><div class="gmail_extra"><br>I tested the set of patches [0], [1] plus this one for piglit regressions on RPi2 hw.<br><br></div><div class="gmail_extra">Two regressions caused by [0] and [1] remain:<br></div><div class="gmail_extra">- spec@!opengl 1.5@draw-vertices<br></div><div class="gmail_extra">- spec@!opengl 1.5@draw-vertices-user<br></div><div class="gmail_extra"><br>[0] <a href="https://patchwork.freedesktop.org/patch/76069/">https://patchwork.freedesktop.org/patch/76069/</a><br>[1] <a href="https://patchwork.freedesktop.org/patch/76070/">https://patchwork.freedesktop.org/patch/76070/</a><br><br></div><div class="gmail_extra">Regards,<br></div><div class="gmail_extra">Rhys<br></div></div>