<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Sep 16, 2016 at 3:03 PM, Francisco Jerez <span dir="ltr"><<a href="mailto:currojerez@riseup.net" target="_blank">currojerez@riseup.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">The eliminate_find_live_channel optimization eliminates<br>
FIND_LIVE_CHANNEL instructions in cases where control flow is known to<br>
be uniform, and replaces them with 'MOV 0', which in turn unblocks<br>
subsequent elimination of the BROADCAST instruction frequently used on<br>
the result of FIND_LIVE_CHANNEL. This is however not correct in<br>
per-sample fragment shader dispatch because the PSD can dispatch a<br>
fully unlit sample under certain conditions. Disable the optimization<br>
in that case.<br>
---<br>
src/mesa/drivers/dri/i965/brw_<wbr>compiler.h | 41 ++++++++++++++++++++++++++++++<wbr>++<br>
src/mesa/drivers/dri/i965/brw_<wbr>fs.cpp | 8 +++++++<br>
src/mesa/drivers/dri/i965/brw_<wbr>vec4.cpp | 8 +++++++<br>
3 files changed, 57 insertions(+)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/<wbr>brw_compiler.h b/src/mesa/drivers/dri/i965/<wbr>brw_compiler.h<br>
index 84d3dde..1429875 100644<br>
--- a/src/mesa/drivers/dri/i965/<wbr>brw_compiler.h<br>
+++ b/src/mesa/drivers/dri/i965/<wbr>brw_compiler.h<br>
@@ -868,6 +868,47 @@ encode_slm_size(unsigned gen, uint32_t bytes)<br>
return slm_size;<br>
}<br>
<br>
+/**<br>
+ * Return true if the given shader stage is dispatched contiguously by the<br>
+ * relevant fixed function starting from channel 0 of the SIMD thread, which<br>
+ * implies that the dispatch mask of a thread can be assumed to have the form<br>
+ * '2^n - 1' for some n.<br>
+ */<br>
+static inline bool<br>
+brw_stage_has_packed_<wbr>dispatch(gl_shader_stage stage,<br>
+ const struct brw_stage_prog_data *prog_data)<br></blockquote><div><br></div><div>Thank you, thank you, thank you for making this a well-documented helper function!<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+{<br>
+ switch (stage) {<br>
+ case MESA_SHADER_FRAGMENT: {<br>
+ /* The PSD discards subspans coming in with no lit samples, which in the<br>
+ * per-pixel shading case implies that each subspan will either be fully<br>
+ * lit (due to the VMask being used to allow derivative computations),<br>
+ * or not dispatched at all. In per-sample dispatch mode individual<br>
+ * samples from the same subspan have a fixed relative location within<br>
+ * the SIMD thread, so dispatch of unlit samples cannot be avoided in<br>
+ * general and we should return false.<br>
+ */<br>
+ const struct brw_wm_prog_data *wm_prog_data =<br>
+ (const struct brw_wm_prog_data *)prog_data;<br>
+ return !wm_prog_data->persample_<wbr>dispatch;<br>
+ }<br>
+ case MESA_SHADER_COMPUTE:<br>
+ /* Compute shaders will be spawned with either a fully enabled dispatch<br>
+ * mask or with whatever bottom/right execution mask was given to the<br>
+ * GPGPU walker command to be used along the workgroup edges -- In both<br>
+ * cases the dispatch mask is required to be tightly packed for our<br>
+ * invocation index calculations to work.<br>
+ */<br>
+ return true;<br>
+ default:<br>
+ /* Most remaining fixed functions are limited to use a packed dispatch<br>
+ * mask due to the hardware representation of the dispatch mask as a<br>
+ * single counter representing the number of enabled channels.<br>
+ */<br>
+ return true;<br>
+ }<br>
+}<br>
+<br>
#ifdef __cplusplus<br>
} /* extern "C" */<br>
#endif<br>
diff --git a/src/mesa/drivers/dri/i965/<wbr>brw_fs.cpp b/src/mesa/drivers/dri/i965/<wbr>brw_fs.cpp<br>
index bb65077..32f7ae2 100644<br>
--- a/src/mesa/drivers/dri/i965/<wbr>brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/<wbr>brw_fs.cpp<br>
@@ -2835,6 +2835,14 @@ fs_visitor::eliminate_find_<wbr>live_channel()<br>
bool progress = false;<br>
unsigned depth = 0;<br>
<br>
+ if (!brw_stage_has_packed_<wbr>dispatch(stage, stage_prog_data)) {<br>
+ /* The optimization below assumes that channel zero is live on thread<br>
+ * dispatch, which may not be the case if the fixed function dispatches<br>
+ * threads sparsely.<br>
+ */<br>
+ return progress;<br></blockquote><div><br></div><div>Maybe just return false?<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ }<br>
+<br>
foreach_block_and_inst_safe(<wbr>block, fs_inst, inst, cfg) {<br>
switch (inst->opcode) {<br>
case BRW_OPCODE_IF:<br>
diff --git a/src/mesa/drivers/dri/i965/<wbr>brw_vec4.cpp b/src/mesa/drivers/dri/i965/<wbr>brw_vec4.cpp<br>
index 58c8a8a..d5bb82b 100644<br>
--- a/src/mesa/drivers/dri/i965/<wbr>brw_vec4.cpp<br>
+++ b/src/mesa/drivers/dri/i965/<wbr>brw_vec4.cpp<br>
@@ -1291,6 +1291,14 @@ vec4_visitor::eliminate_find_<wbr>live_channel()<br>
bool progress = false;<br>
unsigned depth = 0;<br>
<br>
+ if (!brw_stage_has_packed_<wbr>dispatch(stage, stage_prog_data)) {<br>
+ /* The optimization below assumes that channel zero is live on thread<br>
+ * dispatch, which may not be the case if the fixed function dispatches<br>
+ * threads sparsely.<br>
+ */<br>
+ return progress;<br></blockquote><div><br></div><div>Same here.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ }<br>
+<br>
foreach_block_and_inst_safe(<wbr>block, vec4_instruction, inst, cfg) {<br>
switch (inst->opcode) {<br>
case BRW_OPCODE_IF:<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.9.0<br>
<br>
</font></span></blockquote></div><br></div></div>