<p dir="ltr"></p>
<p dir="ltr">On Sep 16, 2016 3:04 PM, "Francisco Jerez" &lt;<a href="mailto:currojerez@riseup.net">currojerez@riseup.net</a>&gt; wrote:<br>
><br>
> Not intended for upstream. Should cause a GPU hang if some thread is<br>
> executed with a non-contiguous dispatch mask breaking assumptions of<br>
> brw_stage_has_packed_dispatch(). Doesn't cause any CTS, DEQP or<br>
> Piglit regressions, while replacing brw_stage_has_packed_dispatch()<br>
> with a dummy implementation that unconditionally returns true on top<br>
> of this patch causes multiple GPU hangs.<br>
> ---<br>
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 17 +++++++++++++++++<br>
> src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 21 +++++++++++++++++++++<br>
> 2 files changed, 38 insertions(+)<br>
><br>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> index 042203d..b3eec49 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> @@ -33,6 +33,23 @@ using namespace brw::surface_access;<br>
> void<br>
> fs_visitor::emit_nir_code()<br>
> {<br>
> + if (brw_stage_has_packed_dispatch(stage, prog_data)) {</p>
<p dir="ltr">Mind adding "0 &amp;&amp;" and merging this patch, so that we remain aware of the issue, keep it building, and can easily test future hardware?</p>
<p dir="ltr">> + const fs_builder ubld = bld.exec_all().group(1, 0);<br>
> + const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);<br>
> + const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :<br>
> + brw_dmask_reg());<br>
> +<br>
> + ubld.ADD(tmp, mask, brw_imm_ud(1));<br>
> + ubld.AND(tmp, mask, tmp);<br>
> +<br>
> + /* This will loop forever if the dispatch mask doesn't have the expected<br>
> + * form '2^n-1', in which case tmp will be non-zero.<br>
> + */<br>
> + bld.emit(BRW_OPCODE_DO);<br>
> + bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);<br>
> + set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));<br>
> + }<br>
> +<br>
> /* emit the arrays used for inputs and outputs - load/store intrinsics will<br>
> * be converted to reads/writes of these arrays<br>
> */<br>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp<br>
> index ba3bbdf..9f7a1f0 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp<br>
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp<br>
> @@ -35,6 +35,27 @@ namespace brw {<br>
> void<br>
> vec4_visitor::emit_nir_code()<br>
> {<br>
> + if (brw_stage_has_packed_dispatch(stage, &prog_data->base)) {<br>
> + const dst_reg tmp = writemask(dst_reg(this, glsl_type::uint_type),<br>
> + WRITEMASK_X);<br>
> + const src_reg mask =<br>
> + brw_swizzle(retype(stride(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_STATE, 0),<br>
> + 0, 4, 1),<br>
> + BRW_REGISTER_TYPE_UD),<br>
> + BRW_SWIZZLE_ZZZZ);</p>
<p dir="ltr">Can we just do vec4_reg(brw_vmask_reg)?</p>
<p dir="ltr">> +<br>
> + emit(ADD(tmp, mask, brw_imm_ud(1)));<br>
> + emit(AND(tmp, mask, src_reg(tmp)));<br>
> +<br>
> + /* This will loop forever if the dispatch mask doesn't have the expected<br>
> + * form '2^n-1', in which case tmp will be non-zero.<br>
> + */<br>
> + emit(BRW_OPCODE_DO);<br>
> + emit(CMP(dst_null_ud(), src_reg(tmp), brw_imm_ud(0),<br>
> + BRW_CONDITIONAL_NZ));<br>
> + emit(BRW_OPCODE_WHILE)->predicate = BRW_PREDICATE_NORMAL;<br>
> + }<br>
> +<br>
> if (nir->num_uniforms > 0)<br>
> nir_setup_uniforms();<br>
><br>
> --<br>
> 2.9.0<br>
></p>