<div dir="ltr">Hi Francisco,<div>Thank you for reviewing!<br><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Apr 19, 2017 at 4:18 PM, Francisco Jerez <span dir="ltr"><<a href="mailto:currojerez@riseup.net" target="_blank">currojerez@riseup.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Hi Pam, looks good overall, a couple of comments below,<br>
<div><div class="h5"><br>
Plamena Manolova <<a href="mailto:plamena.manolova@intel.com">plamena.manolova@intel.com</a>> writes:<br>
<br>
> Adds suppport for ARB_fragment_shader_interlock. We achieve<br>
> the interlock and fragment ordering by issuing a memory fence<br>
> via sendc.<br>
><br>
> Signed-off-by: Plamena Manolova <<a href="mailto:plamena.manolova@intel.com">plamena.manolova@intel.com</a>><br>
> ---<br>
>  docs/features.txt                            |  2 +-<br>
>  docs/relnotes/17.1.0.html                    |  1 +<br>
>  src/intel/compiler/brw_eu.h                  |  4 +++<br>
>  src/intel/compiler/brw_eu_<wbr>defines.h          |  2 ++<br>
>  src/intel/compiler/brw_eu_<wbr>emit.c             | 47 ++++++++++++++++++++++++++++<br>
>  src/intel/compiler/brw_fs_<wbr>generator.cpp      |  4 +++<br>
>  src/intel/compiler/brw_fs_nir.<wbr>cpp            | 15 +++++++++<br>
>  src/intel/compiler/brw_shader.<wbr>cpp            |  4 +++<br>
>  src/mesa/drivers/dri/i965/<wbr>intel_extensions.c |  5 +++<br>
>  9 files changed, 83 insertions(+), 1 deletion(-)<br>
><br>
> diff --git a/docs/features.txt b/docs/features.txt<br>
> index 5f63632..a6237c0 100644<br>
> --- a/docs/features.txt<br>
> +++ b/docs/features.txt<br>
> @@ -281,7 +281,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve<br>
>    GL_ARB_cl_event                                       not started<br>
>    GL_ARB_compute_variable_group_<wbr>size                    DONE (nvc0, radeonsi)<br>
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)<br>
> -  GL_ARB_fragment_shader_<wbr>interlock                      not started<br>
> +  GL_ARB_fragment_shader_<wbr>interlock                      DONE (i965)<br>
>    GL_ARB_gl_spirv                                       not started<br>
>    GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)<br>
>    GL_ARB_indirect_parameters                            DONE (nvc0, radeonsi)<br>
> diff --git a/docs/relnotes/17.1.0.html b/docs/relnotes/17.1.0.html<br>
> index e7cfe38..1b2393f 100644<br>
> --- a/docs/relnotes/17.1.0.html<br>
> +++ b/docs/relnotes/17.1.0.html<br>
> @@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.<br>
><br>
>  <ul><br>
>  <li>OpenGL 4.2 on i965/ivb</li><br>
> +<li>GL_ARB_fragment_shader_<wbr>interlock on i965</li><br>
>  <li>GL_ARB_gpu_shader_fp64 on i965/ivybridge</li><br>
>  <li>GL_ARB_gpu_shader_int64 on i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe</li><br>
>  <li>GL_ARB_shader_ballot on nvc0, radeonsi</li><br>
> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h<br>
> index f422595..117cfae 100644<br>
> --- a/src/intel/compiler/brw_eu.h<br>
> +++ b/src/intel/compiler/brw_eu.h<br>
> @@ -480,6 +480,10 @@ brw_memory_fence(struct brw_codegen *p,<br>
>                   struct brw_reg dst);<br>
><br>
>  void<br>
> +brw_interlock(struct brw_codegen *p,<br>
> +              struct brw_reg dst);<br>
> +<br>
> +void<br>
>  brw_pixel_interpolator_query(<wbr>struct brw_codegen *p,<br>
>                               struct brw_reg dest,<br>
>                               struct brw_reg mrf,<br>
> diff --git a/src/intel/compiler/brw_eu_<wbr>defines.h b/src/intel/compiler/brw_eu_<wbr>defines.h<br>
> index 13a70f6..9eb5210 100644<br>
> --- a/src/intel/compiler/brw_eu_<wbr>defines.h<br>
> +++ b/src/intel/compiler/brw_eu_<wbr>defines.h<br>
> @@ -444,6 +444,8 @@ enum opcode {<br>
>      */<br>
>     SHADER_OPCODE_BROADCAST,<br>
><br>
> +   SHADER_OPCODE_INTERLOCK,<br>
> +<br>
>     VEC4_OPCODE_MOV_BYTES,<br>
>     VEC4_OPCODE_PACK_BYTES,<br>
>     VEC4_OPCODE_UNPACK_UNIFORM,<br>
> diff --git a/src/intel/compiler/brw_eu_<wbr>emit.c b/src/intel/compiler/brw_eu_<wbr>emit.c<br>
> index 231d6fd..52adf22 100644<br>
> --- a/src/intel/compiler/brw_eu_<wbr>emit.c<br>
> +++ b/src/intel/compiler/brw_eu_<wbr>emit.c<br>
> @@ -3403,6 +3403,53 @@ brw_memory_fence(struct brw_codegen *p,<br>
>  }<br>
><br>
>  void<br>
> +brw_interlock(struct brw_codegen *p,<br>
> +              struct brw_reg dst)<br>
> +{<br>
> +   const struct gen_device_info *devinfo = p->devinfo;<br>
> +   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;<br>
> +   struct brw_inst *insn;<br>
> +<br>
> +   brw_push_insn_state(p);<br>
> +   brw_set_default_mask_control(<wbr>p, BRW_MASK_DISABLE);<br>
> +   brw_set_default_exec_size(p, BRW_EXECUTE_1);<br>
> +   dst = vec1(dst);<br>
> +<br>
> +   /* Set dst as destination for dependency tracking, the MEMORY_FENCE<br>
> +    * message doesn't write anything back.<br>
> +    */<br>
> +   /* BRW_OPCODE_SENDC is what the interlock actually depends on */<br>
> +   insn = next_insn(p, BRW_OPCODE_SENDC);<br>
> +   dst = retype(dst, BRW_REGISTER_TYPE_UW);<br>
> +   brw_set_dest(p, insn, dst);<br>
> +   brw_set_src0(p, insn, dst);<br>
> +   /* Issuing a memory fence ensures the ordering of fragments */<br>
> +   brw_set_memory_fence_message(<wbr>p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,<br>
> +                                commit_enable);<br>
> +<br>
> +   if (devinfo->gen == 7 && !devinfo->is_haswell) {<br>
> +      /* IVB does typed surface access through the render cache, so we need to<br>
> +       * flush it too.  Use a different register so both flushes can be<br>
> +       * pipelined by the hardware.<br>
> +       */<br>
> +      insn = next_insn(p, BRW_OPCODE_SENDC);<br>
> +      brw_set_dest(p, insn, offset(dst, 1));<br>
> +      brw_set_src0(p, insn, offset(dst, 1));<br>
> +      brw_set_memory_fence_message(<wbr>p, insn, GEN6_SFID_DATAPORT_RENDER_<wbr>CACHE,<br>
> +                                   commit_enable);<br>
> +<br>
> +      /* Now write the response of the second message into the response of the<br>
> +       * first to trigger a pipeline stall -- This way future render and data<br>
> +       * cache messages will be properly ordered with respect to past data and<br>
> +       * render cache messages.<br>
> +       */<br>
> +      brw_MOV(p, dst, offset(dst, 1));<br>
> +   }<br>
> +<br>
> +   brw_pop_insn_state(p);<br>
> +}<br>
> +<br>
<br>
</div></div>AFICT this is basically a copy of brw_memory_fence() with s/SEND/SENDC/.<br>
It may be a good idea to add an opcode argument to brw_memory_fence()<br>
instead and drop brw_interlock() altogether so we don't have to worry<br>
about keeping the two barrier implementations in sync if e.g. fixes are<br>
applied in the future.<br>
<div><div class="h5"><br></div></div></blockquote><div><br></div><div>I'll make that change, it makes sense not to duplicate code unnecessarily.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="h5">
> +void<br>
>  brw_pixel_interpolator_query(<wbr>struct brw_codegen *p,<br>
>                               struct brw_reg dest,<br>
>                               struct brw_reg mrf,<br>
> diff --git a/src/intel/compiler/brw_fs_<wbr>generator.cpp b/src/intel/compiler/brw_fs_<wbr>generator.cpp<br>
> index a7f95cc..093c12d 100644<br>
> --- a/src/intel/compiler/brw_fs_<wbr>generator.cpp<br>
> +++ b/src/intel/compiler/brw_fs_<wbr>generator.cpp<br>
> @@ -2070,6 +2070,10 @@ fs_generator::generate_code(<wbr>const cfg_t *cfg, int dispatch_width)<br>
>           brw_memory_fence(p, dst);<br>
>           break;<br>
><br>
> +      case SHADER_OPCODE_INTERLOCK:<br>
> +         brw_interlock(p, dst);<br>
> +         break;<br>
> +<br>
>        case SHADER_OPCODE_FIND_LIVE_<wbr>CHANNEL: {<br>
>           const struct brw_reg mask =<br>
>              brw_stage_has_packed_dispatch(<wbr>devinfo, stage,<br>
> diff --git a/src/intel/compiler/brw_fs_<wbr>nir.cpp b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
> index 23cd4b7..9d6302c 100644<br>
> --- a/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
> +++ b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
> @@ -4138,6 +4138,21 @@ fs_visitor::nir_emit_<wbr>intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr<br>
>        break;<br>
>     }<br>
><br>
> +   case nir_intrinsic_begin_<wbr>invocation_interlock_ARB: {<br>
> +      const fs_builder ubld = bld.group(8, 0);<br>
> +      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_<wbr>UD, 2);<br>
> +<br>
> +      ubld.emit(SHADER_OPCODE_<wbr>INTERLOCK, tmp)->size_written = 2 *<br>
> +         REG_SIZE;<br>
> +<br>
> +      break;<br>
> +   }<br>
> +<br>
> +   case nir_intrinsic_end_invocation_<wbr>interlock_ARB: {<br>
> +      /* We don't need to do anything here */<br>
> +      break;<br>
> +   }<br>
> +<br>
>     default:<br>
>        unreachable("unknown intrinsic");<br>
>     }<br>
> diff --git a/src/intel/compiler/brw_<wbr>shader.cpp b/src/intel/compiler/brw_<wbr>shader.cpp<br>
> index 304b4ec..8b9e42f 100644<br>
> --- a/src/intel/compiler/brw_<wbr>shader.cpp<br>
> +++ b/src/intel/compiler/brw_<wbr>shader.cpp<br>
> @@ -290,6 +290,9 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)<br>
>        return "typed_surface_write_logical";<br>
>     case SHADER_OPCODE_MEMORY_FENCE:<br>
>        return "memory_fence";<br>
> +   case SHADER_OPCODE_INTERLOCK:<br>
> +      /* For an interlock we actually issue a memory fence via sendc. */<br>
> +      return "memory_fence";<br>
<br>
</div></div>Still because your "interlock" instruction has different semantics it<br>
will be useful for debugging to print what IR instruction was actually<br>
in the shader rather than lying to the user. :P<br>
<span class=""><br></span></blockquote><div> </div><div>I was actually a bit torn on whether "memory_fence" or "interlock" would make more sense. I'll make the change :)</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">
><br>
>     case SHADER_OPCODE_LOAD_PAYLOAD:<br>
>        return "load_payload";<br>
> @@ -989,6 +992,7 @@ backend_instruction::has_side_<wbr>effects() const<br>
>     case SHADER_OPCODE_TYPED_SURFACE_<wbr>WRITE:<br>
>     case SHADER_OPCODE_TYPED_SURFACE_<wbr>WRITE_LOGICAL:<br>
>     case SHADER_OPCODE_MEMORY_FENCE:<br>
> +   case SHADER_OPCODE_INTERLOCK:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8_<wbr>PER_SLOT:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8_<wbr>MASKED:<br>
> diff --git a/src/mesa/drivers/dri/i965/<wbr>intel_extensions.c b/src/mesa/drivers/dri/i965/<wbr>intel_extensions.c<br>
> index 0133fa1..830e530 100644<br>
> --- a/src/mesa/drivers/dri/i965/<wbr>intel_extensions.c<br>
> +++ b/src/mesa/drivers/dri/i965/<wbr>intel_extensions.c<br>
> @@ -297,4 +297,9 @@ intelInitExtensions(struct gl_context *ctx)<br>
>        ctx->Extensions.EXT_texture_<wbr>compression_s3tc = true;<br>
><br>
>     ctx->Extensions.ANGLE_texture_<wbr>compression_dxt = true;<br>
> +<br>
> +   if ((ctx->Extensions.ARB_shader_<wbr>image_load_store || ctx->Version >= 42) &&<br>
> +       ctx->Const.GLSLVersion >= 420) {<br>
> +      ctx->Extensions.ARB_fragment_<wbr>shader_interlock = true;<br>
<br>
</span>Since this is basically enabling it if and only if GL 4.2 is supported<br>
due to the GLSLVersion check, you could as well do a single<br>
unconditional assignment "ctx->Extensions.ARB_fragment_<wbr>shader_interlock<br>
= true" around intel_extensions.c:232 in the same block as the<br>
ARB_draw_indirect enable.<br>
<br></blockquote><div><br></div><div>I'll go ahead and do that.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
> +   }<br>
>  }<br>
<span class="HOEnZb"><font color="#888888">> --<br>
> 2.9.3<br>
</font></span></blockquote></div><br></div></div></div>