[Mesa-dev] [PATCH 2/2] i965: Add ARB_fragment_shader_interlock support.

Thu Apr 5 16:09:33 UTC 2018

Plamena Manolova <plamena.n.manolova at gmail.com> writes:

> Adds suppport for ARB_fragment_shader_interlock. We achieve
> the interlock and fragment ordering by issuing a memory fence
> via sendc.
>
> Signed-off-by: Plamena Manolova <plamena.n.manolova at gmail.com>
> ---
>  docs/features.txt                            |  2 +-
>  docs/relnotes/18.1.0.html                    |  1 +
>  src/intel/compiler/brw_eu.h                  |  3 ++-
>  src/intel/compiler/brw_eu_defines.h          |  2 ++
>  src/intel/compiler/brw_eu_emit.c             |  7 ++++---
>  src/intel/compiler/brw_fs_generator.cpp      |  7 ++++++-
>  src/intel/compiler/brw_fs_nir.cpp            | 15 +++++++++++++++
>  src/intel/compiler/brw_shader.cpp            |  4 ++++
>  src/intel/compiler/brw_vec4_generator.cpp    |  2 +-
>  src/mesa/drivers/dri/i965/intel_extensions.c |  1 +
>  10 files changed, 37 insertions(+), 7 deletions(-)
>
> diff --git a/docs/features.txt b/docs/features.txt
> index 5eae34bf0d..a621251efd 100644
> --- a/docs/features.txt
> +++ b/docs/features.txt
> @@ -297,7 +297,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
>    GL_ARB_cl_event                                       not started
>    GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
> -  GL_ARB_fragment_shader_interlock                      not started
> +  GL_ARB_fragment_shader_interlock                      DONE (i965)
>    GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
>    GL_ARB_parallel_shader_compile                        not started, but Chia-I Wu did some related work in 2014
>    GL_ARB_post_depth_coverage                            DONE (i965)
> diff --git a/docs/relnotes/18.1.0.html b/docs/relnotes/18.1.0.html
> index 1d5201717f..9d8e63855d 100644
> --- a/docs/relnotes/18.1.0.html
> +++ b/docs/relnotes/18.1.0.html
> @@ -51,6 +51,7 @@ Note: some of the new features are only available with certain drivers.
>  <li>GL_EXT_shader_framebuffer_fetch on i965 on desktop GL (GLES was already supported)</li>
>  <li>GL_EXT_shader_framebuffer_fetch_non_coherent on i965</li>
>  <li>Disk shader cache support for i965 enabled by default</li>
> +<li>GL_ARB_fragment_shader_interlock on i965</li>
>  </ul>
>  
>  <h2>Bug fixes</h2>
> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
> index ca72666a55..b2c36d3ea1 100644
> --- a/src/intel/compiler/brw_eu.h
> +++ b/src/intel/compiler/brw_eu.h
> @@ -509,7 +509,8 @@ brw_byte_scattered_write(struct brw_codegen *p,
>  
>  void
>  brw_memory_fence(struct brw_codegen *p,
> -                 struct brw_reg dst);
> +                 struct brw_reg dst,
> +                 uint32_t send_op);
>

The new argument should probably be of type "enum opcode" in order to
avoid losing type information.

>  void
>  brw_pixel_interpolator_query(struct brw_codegen *p,
> diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
> index 332d627bc3..2980e98a58 100644
> --- a/src/intel/compiler/brw_eu_defines.h
> +++ b/src/intel/compiler/brw_eu_defines.h
> @@ -480,6 +480,8 @@ enum opcode {
>  
>     SHADER_OPCODE_GET_BUFFER_SIZE,
>  
> +   SHADER_OPCODE_INTERLOCK,
> +
>     VEC4_OPCODE_MOV_BYTES,
>     VEC4_OPCODE_PACK_BYTES,
>     VEC4_OPCODE_UNPACK_UNIFORM,
> diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
> index f039af56d0..6a57397a41 100644
> --- a/src/intel/compiler/brw_eu_emit.c
> +++ b/src/intel/compiler/brw_eu_emit.c
> @@ -3285,7 +3285,8 @@ brw_set_memory_fence_message(struct brw_codegen *p,
>  
>  void
>  brw_memory_fence(struct brw_codegen *p,
> -                 struct brw_reg dst)
> +                 struct brw_reg dst,
> +                 uint32_t send_op)
>  {
>     const struct gen_device_info *devinfo = p->devinfo;
>     const bool commit_enable =
> @@ -3301,7 +3302,7 @@ brw_memory_fence(struct brw_codegen *p,
>     /* Set dst as destination for dependency tracking, the MEMORY_FENCE
>      * message doesn't write anything back.
>      */
> -   insn = next_insn(p, BRW_OPCODE_SEND);
> +   insn = next_insn(p, send_op);
>     dst = retype(dst, BRW_REGISTER_TYPE_UW);
>     brw_set_dest(p, insn, dst);
>     brw_set_src0(p, insn, dst);
> @@ -3313,7 +3314,7 @@ brw_memory_fence(struct brw_codegen *p,
>         * flush it too.  Use a different register so both flushes can be
>         * pipelined by the hardware.
>         */
> -      insn = next_insn(p, BRW_OPCODE_SEND);
> +      insn = next_insn(p, send_op);
>        brw_set_dest(p, insn, offset(dst, 1));
>        brw_set_src0(p, insn, offset(dst, 1));
>        brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
> diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
> index 0c85eb8e1e..f099d092d1 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -2277,7 +2277,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
>           break;
>  
>        case SHADER_OPCODE_MEMORY_FENCE:
> -         brw_memory_fence(p, dst);
> +         brw_memory_fence(p, dst, BRW_OPCODE_SEND);
> +         break;
> +
> +      case SHADER_OPCODE_INTERLOCK:
> +         /* The interlock is basically a memory fence issued via sendc */
> +         brw_memory_fence(p, dst, BRW_OPCODE_SENDC);
>           break;
>  
>        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> index dbd2105f7e..c07201f889 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -4771,6 +4771,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>        break;
>     }
>  
> +   case nir_intrinsic_begin_invocation_interlock_ARB: {
> +      const fs_builder ubld = bld.group(8, 0);
> +      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> +
> +      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
> +         REG_SIZE;
> +
> +      break;
> +   }
> +
> +   case nir_intrinsic_end_invocation_interlock_ARB: {
> +      /* We don't need to do anything here */
> +      break;
> +   }
> +
>     default:
>        unreachable("unknown intrinsic");
>     }
> diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
> index ffe8a7403d..750ef1e886 100644
> --- a/src/intel/compiler/brw_shader.cpp
> +++ b/src/intel/compiler/brw_shader.cpp
> @@ -292,6 +292,9 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
>        return "typed_surface_write_logical";
>     case SHADER_OPCODE_MEMORY_FENCE:
>        return "memory_fence";
> +   case SHADER_OPCODE_INTERLOCK:
> +      /* For an interlock we actually issue a memory fence via sendc. */
> +      return "interlock";
>  
>     case SHADER_OPCODE_BYTE_SCATTERED_READ:
>        return "byte_scattered_read";
> @@ -991,6 +994,7 @@ backend_instruction::has_side_effects() const
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE:
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
>     case SHADER_OPCODE_MEMORY_FENCE:
> +   case SHADER_OPCODE_INTERLOCK:
>     case SHADER_OPCODE_URB_WRITE_SIMD8:
>     case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
>     case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
> diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
> index a3ed8609a2..d6a864ce62 100644
> --- a/src/intel/compiler/brw_vec4_generator.cpp
> +++ b/src/intel/compiler/brw_vec4_generator.cpp
> @@ -1904,7 +1904,7 @@ generate_code(struct brw_codegen *p,
>           break;
>  
>        case SHADER_OPCODE_MEMORY_FENCE:
> -         brw_memory_fence(p, dst);
> +         brw_memory_fence(p, dst, BRW_OPCODE_SEND);
>           break;
>  
>        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
> index 73a6c73f53..f786e9917e 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -240,6 +240,7 @@ intelInitExtensions(struct gl_context *ctx)
>           ctx->Extensions.ARB_transform_feedback2 = true;
>           ctx->Extensions.ARB_transform_feedback3 = true;
>           ctx->Extensions.ARB_transform_feedback_instanced = true;
> +         ctx->Extensions.ARB_fragment_shader_interlock = true;
>  

I don't think there is any reason to make this conditional on
can_do_pipelined_register_writes().  The extension could technically be
implemented trivially all the way back to the original i965 (the
implementation for Gen4-5 would in fact be a no-op because the hardware
implicitly orders *all* fragment shader invocations with overlapping
fragments), but because it would be kind of difficult to take advantage
of the functionality on such hardware in any way it's probably more
reasonable to enable it on Gen7+ only for the moment.

With these fixed patch is:

Reviewed-by: Francisco Jerez <currojerez at riseup.net>

>           if (can_do_compute_dispatch(brw->screen) &&
>               ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) {
> -- 
> 2.11.0
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 227 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180405/2ff09a1f/attachment.sig>