<div dir="auto">Thank you so much for reviewing these patches Curro! <div dir="auto">I'll make the suggested changes and resubmit.</div></div><br><div class="gmail_quote"><div dir="ltr">On Thu, 5 Apr 2018, 19:25 Francisco Jerez, <<a href="mailto:currojerez@riseup.net">currojerez@riseup.net</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Plamena Manolova <<a href="mailto:plamena.n.manolova@gmail.com" target="_blank" rel="noreferrer">plamena.n.manolova@gmail.com</a>> writes:<br>
<br>
> Adds suppport for ARB_fragment_shader_interlock. We achieve<br>
> the interlock and fragment ordering by issuing a memory fence<br>
> via sendc.<br>
><br>
> Signed-off-by: Plamena Manolova <<a href="mailto:plamena.n.manolova@gmail.com" target="_blank" rel="noreferrer">plamena.n.manolova@gmail.com</a>><br>
> ---<br>
>  docs/features.txt                            |  2 +-<br>
>  docs/relnotes/18.1.0.html                    |  1 +<br>
>  src/intel/compiler/brw_eu.h                  |  3 ++-<br>
>  src/intel/compiler/brw_eu_defines.h          |  2 ++<br>
>  src/intel/compiler/brw_eu_emit.c             |  7 ++++---<br>
>  src/intel/compiler/brw_fs_generator.cpp      |  7 ++++++-<br>
>  src/intel/compiler/brw_fs_nir.cpp            | 15 +++++++++++++++<br>
>  src/intel/compiler/brw_shader.cpp            |  4 ++++<br>
>  src/intel/compiler/brw_vec4_generator.cpp    |  2 +-<br>
>  src/mesa/drivers/dri/i965/intel_extensions.c |  1 +<br>
>  10 files changed, 37 insertions(+), 7 deletions(-)<br>
><br>
> diff --git a/docs/features.txt b/docs/features.txt<br>
> index 5eae34bf0d..a621251efd 100644<br>
> --- a/docs/features.txt<br>
> +++ b/docs/features.txt<br>
> @@ -297,7 +297,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve<br>
>    GL_ARB_cl_event                                       not started<br>
>    GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)<br>
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)<br>
> -  GL_ARB_fragment_shader_interlock                      not started<br>
> +  GL_ARB_fragment_shader_interlock                      DONE (i965)<br>
>    GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)<br>
>    GL_ARB_parallel_shader_compile                        not started, but Chia-I Wu did some related work in 2014<br>
>    GL_ARB_post_depth_coverage                            DONE (i965)<br>
> diff --git a/docs/relnotes/18.1.0.html b/docs/relnotes/18.1.0.html<br>
> index 1d5201717f..9d8e63855d 100644<br>
> --- a/docs/relnotes/18.1.0.html<br>
> +++ b/docs/relnotes/18.1.0.html<br>
> @@ -51,6 +51,7 @@ Note: some of the new features are only available with certain drivers.<br>
>  <li>GL_EXT_shader_framebuffer_fetch on i965 on desktop GL (GLES was already supported)</li><br>
>  <li>GL_EXT_shader_framebuffer_fetch_non_coherent on i965</li><br>
>  <li>Disk shader cache support for i965 enabled by default</li><br>
> +<li>GL_ARB_fragment_shader_interlock on i965</li><br>
>  </ul><br>
><br>
>  <h2>Bug fixes</h2><br>
> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h<br>
> index ca72666a55..b2c36d3ea1 100644<br>
> --- a/src/intel/compiler/brw_eu.h<br>
> +++ b/src/intel/compiler/brw_eu.h<br>
> @@ -509,7 +509,8 @@ brw_byte_scattered_write(struct brw_codegen *p,<br>
><br>
>  void<br>
>  brw_memory_fence(struct brw_codegen *p,<br>
> -                 struct brw_reg dst);<br>
> +                 struct brw_reg dst,<br>
> +                 uint32_t send_op);<br>
><br>
<br>
The new argument should probably be of type "enum opcode" in order to<br>
avoid losing type information.<br>
<br>
>  void<br>
>  brw_pixel_interpolator_query(struct brw_codegen *p,<br>
> diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h<br>
> index 332d627bc3..2980e98a58 100644<br>
> --- a/src/intel/compiler/brw_eu_defines.h<br>
> +++ b/src/intel/compiler/brw_eu_defines.h<br>
> @@ -480,6 +480,8 @@ enum opcode {<br>
><br>
>     SHADER_OPCODE_GET_BUFFER_SIZE,<br>
><br>
> +   SHADER_OPCODE_INTERLOCK,<br>
> +<br>
>     VEC4_OPCODE_MOV_BYTES,<br>
>     VEC4_OPCODE_PACK_BYTES,<br>
>     VEC4_OPCODE_UNPACK_UNIFORM,<br>
> diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c<br>
> index f039af56d0..6a57397a41 100644<br>
> --- a/src/intel/compiler/brw_eu_emit.c<br>
> +++ b/src/intel/compiler/brw_eu_emit.c<br>
> @@ -3285,7 +3285,8 @@ brw_set_memory_fence_message(struct brw_codegen *p,<br>
><br>
>  void<br>
>  brw_memory_fence(struct brw_codegen *p,<br>
> -                 struct brw_reg dst)<br>
> +                 struct brw_reg dst,<br>
> +                 uint32_t send_op)<br>
>  {<br>
>     const struct gen_device_info *devinfo = p->devinfo;<br>
>     const bool commit_enable =<br>
> @@ -3301,7 +3302,7 @@ brw_memory_fence(struct brw_codegen *p,<br>
>     /* Set dst as destination for dependency tracking, the MEMORY_FENCE<br>
>      * message doesn't write anything back.<br>
>      */<br>
> -   insn = next_insn(p, BRW_OPCODE_SEND);<br>
> +   insn = next_insn(p, send_op);<br>
>     dst = retype(dst, BRW_REGISTER_TYPE_UW);<br>
>     brw_set_dest(p, insn, dst);<br>
>     brw_set_src0(p, insn, dst);<br>
> @@ -3313,7 +3314,7 @@ brw_memory_fence(struct brw_codegen *p,<br>
>         * flush it too.  Use a different register so both flushes can be<br>
>         * pipelined by the hardware.<br>
>         */<br>
> -      insn = next_insn(p, BRW_OPCODE_SEND);<br>
> +      insn = next_insn(p, send_op);<br>
>        brw_set_dest(p, insn, offset(dst, 1));<br>
>        brw_set_src0(p, insn, offset(dst, 1));<br>
>        brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,<br>
> diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp<br>
> index 0c85eb8e1e..f099d092d1 100644<br>
> --- a/src/intel/compiler/brw_fs_generator.cpp<br>
> +++ b/src/intel/compiler/brw_fs_generator.cpp<br>
> @@ -2277,7 +2277,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)<br>
>           break;<br>
><br>
>        case SHADER_OPCODE_MEMORY_FENCE:<br>
> -         brw_memory_fence(p, dst);<br>
> +         brw_memory_fence(p, dst, BRW_OPCODE_SEND);<br>
> +         break;<br>
> +<br>
> +      case SHADER_OPCODE_INTERLOCK:<br>
> +         /* The interlock is basically a memory fence issued via sendc */<br>
> +         brw_memory_fence(p, dst, BRW_OPCODE_SENDC);<br>
>           break;<br>
><br>
>        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {<br>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp<br>
> index dbd2105f7e..c07201f889 100644<br>
> --- a/src/intel/compiler/brw_fs_nir.cpp<br>
> +++ b/src/intel/compiler/brw_fs_nir.cpp<br>
> @@ -4771,6 +4771,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr<br>
>        break;<br>
>     }<br>
><br>
> +   case nir_intrinsic_begin_invocation_interlock_ARB: {<br>
> +      const fs_builder ubld = bld.group(8, 0);<br>
> +      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);<br>
> +<br>
> +      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *<br>
> +         REG_SIZE;<br>
> +<br>
> +      break;<br>
> +   }<br>
> +<br>
> +   case nir_intrinsic_end_invocation_interlock_ARB: {<br>
> +      /* We don't need to do anything here */<br>
> +      break;<br>
> +   }<br>
> +<br>
>     default:<br>
>        unreachable("unknown intrinsic");<br>
>     }<br>
> diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp<br>
> index ffe8a7403d..750ef1e886 100644<br>
> --- a/src/intel/compiler/brw_shader.cpp<br>
> +++ b/src/intel/compiler/brw_shader.cpp<br>
> @@ -292,6 +292,9 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)<br>
>        return "typed_surface_write_logical";<br>
>     case SHADER_OPCODE_MEMORY_FENCE:<br>
>        return "memory_fence";<br>
> +   case SHADER_OPCODE_INTERLOCK:<br>
> +      /* For an interlock we actually issue a memory fence via sendc. */<br>
> +      return "interlock";<br>
><br>
>     case SHADER_OPCODE_BYTE_SCATTERED_READ:<br>
>        return "byte_scattered_read";<br>
> @@ -991,6 +994,7 @@ backend_instruction::has_side_effects() const<br>
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE:<br>
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:<br>
>     case SHADER_OPCODE_MEMORY_FENCE:<br>
> +   case SHADER_OPCODE_INTERLOCK:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:<br>
>     case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:<br>
> diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp<br>
> index a3ed8609a2..d6a864ce62 100644<br>
> --- a/src/intel/compiler/brw_vec4_generator.cpp<br>
> +++ b/src/intel/compiler/brw_vec4_generator.cpp<br>
> @@ -1904,7 +1904,7 @@ generate_code(struct brw_codegen *p,<br>
>           break;<br>
><br>
>        case SHADER_OPCODE_MEMORY_FENCE:<br>
> -         brw_memory_fence(p, dst);<br>
> +         brw_memory_fence(p, dst, BRW_OPCODE_SEND);<br>
>           break;<br>
><br>
>        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {<br>
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> index 73a6c73f53..f786e9917e 100644<br>
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> @@ -240,6 +240,7 @@ intelInitExtensions(struct gl_context *ctx)<br>
>           ctx->Extensions.ARB_transform_feedback2 = true;<br>
>           ctx->Extensions.ARB_transform_feedback3 = true;<br>
>           ctx->Extensions.ARB_transform_feedback_instanced = true;<br>
> +         ctx->Extensions.ARB_fragment_shader_interlock = true;<br>
><br>
<br>
I don't think there is any reason to make this conditional on<br>
can_do_pipelined_register_writes().  The extension could technically be<br>
implemented trivially all the way back to the original i965 (the<br>
implementation for Gen4-5 would in fact be a no-op because the hardware<br>
implicitly orders *all* fragment shader invocations with overlapping<br>
fragments), but because it would be kind of difficult to take advantage<br>
of the functionality on such hardware in any way it's probably more<br>
reasonable to enable it on Gen7+ only for the moment.<br>
<br>
With these fixed patch is:<br>
<br>
Reviewed-by: Francisco Jerez <<a href="mailto:currojerez@riseup.net" target="_blank" rel="noreferrer">currojerez@riseup.net</a>><br>
<br>
>           if (can_do_compute_dispatch(brw->screen) &&<br>
>               ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) {<br>
> --<br>
> 2.11.0<br>
</blockquote></div>