<div dir="ltr">Looks good. R-B<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Jun 1, 2016 at 5:21 PM, Francisco Jerez <span dir="ltr"><<a href="mailto:currojerez@riseup.net" target="_blank">currojerez@riseup.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">Skipping the temporary allocation and copy instructions is easy (just<br>
return dst), but the conditions used to find out whether the copy can<br>
be optimized out safely without breaking the program are rather<br>
complex: The destination must be exactly one component of at most the<br>
execution width of the lowered instruction, and all source regions of<br>
the instruction must be either fully disjoint from the destination or<br>
be aligned with it group by group.<br>
<br>
</span>v2: Don't handle partial source-destination overlap for simplicity<br>
(Jason). No instruction count regressions with respect to v1 in<br>
either shader-db or the few FP64 shader_runner test-cases with<br>
partial overlap I've checked manually.<br>
---<br>
src/mesa/drivers/dri/i965/brw_fs.cpp | 55 ++++++++++++++++++++++++++++++++++++<br>
1 file changed, 55 insertions(+)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
index 00d937e..bfae1d7 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
@@ -5078,6 +5078,52 @@ emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,<br>
<span class=""> }<br>
<br>
/**<br>
+ * Return true if splitting out the group of channels of instruction \p inst<br>
+ * given by lbld.group() requires allocating a temporary for the destination<br>
+ * of the lowered instruction and copying the data back to the original<br>
+ * destination region.<br>
</span><span class="">+ */<br>
+static inline bool<br>
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)<br>
+{<br>
+ /* If the instruction writes more than one component we'll have to shuffle<br>
+ * the results of multiple lowered instructions in order to make sure that<br>
+ * they end up arranged correctly in the original destination region.<br>
+ */<br>
+ if (inst->regs_written * REG_SIZE ><br>
+ inst->dst.component_size(inst->exec_size))<br>
+ return true;<br>
+<br>
+ /* If the lowered execution size is larger than the original the result of<br>
+ * the instruction won't fit in the original destination, so we'll have to<br>
+ * allocate a temporary in any case.<br>
+ */<br>
+ if (lbld.dispatch_width() > inst->exec_size)<br>
+ return true;<br>
+<br>
+ for (unsigned i = 0; i < inst->sources; i++) {<br>
</span><span class="">+ /* If we already made a copy of the source for other reasons there won't<br>
+ * be any overlap with the destination.<br>
+ */<br>
</span>+ if (needs_src_copy(lbld, inst, i))<br>
+ continue;<br>
+<br>
+ /* In order to keep the logic simple we emit a copy whenever the<br>
+ * destination region doesn't exactly match an overlapping source, which<br>
+ * may point at the source and destination not being aligned group by<br>
+ * group which could cause one of the lowered instructions to overwrite<br>
+ * the data read from the same source by other lowered instructions.<br>
+ */<br>
+ if (regions_overlap(inst->dst, inst->regs_written * REG_SIZE,<br>
+ inst->src[i], inst->regs_read(i) * REG_SIZE) &&<br>
+ !inst->dst.equals(inst->src[i]))<br>
<span class="">+ return true;<br>
+ }<br>
+<br>
</span><span class="">+ return false;<br>
+}<br>
+<br>
+/**<br>
* Insert data from a packed temporary into the channel group given by<br>
* lbld.group() of the destination region of instruction \p inst and return<br>
* the temporary as result. If any copy instructions are required they will<br>
</span>@@ -5097,6 +5143,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)<br>
<span class=""> const fs_reg dst = horiz_offset(inst->dst, lbld.group());<br>
const unsigned dst_size = inst->regs_written * REG_SIZE /<br>
inst->dst.component_size(inst->exec_size);<br>
+<br>
+ if (needs_dst_copy(lbld, inst)) {<br>
const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);<br>
<br>
if (inst->predicate) {<br>
</span>@@ -5114,6 +5162,13 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)<br>
<div class="HOEnZb"><div class="h5"> .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));<br>
<br>
return tmp;<br>
+<br>
+ } else {<br>
+ /* No need to allocate a temporary for the lowered instruction, just<br>
+ * take the right group of channels from the original region.<br>
+ */<br>
+ return dst;<br>
+ }<br>
}<br>
<br>
bool<br>
--<br>
2.7.3<br>
<br>
</div></div></blockquote></div><br></div>