[Mesa-dev] [PATCH 09/25] i965/gen7: Fix the untyped surface messages to deal with indirect surface access.

Tue Dec 31 08:28:42 PST 2013

On 2 December 2013 11:39, Francisco Jerez <currojerez at riseup.net> wrote:

> Change brw_untyped_atomic() and brw_untyped_surface_read() to take the
> surface index as a register instead of a constant, construct the
> message descriptor dynamically by OR'ing the surface index and other
> descriptor bits together and use the non-immediate variant of SEND to
> submit the surface message.
> ---
>  src/mesa/drivers/dri/i965/brw_eu.h               |  18 +-
>  src/mesa/drivers/dri/i965/brw_eu_emit.c          | 200
> +++++++++++++++--------
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |   7 +-
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |   7 +-
>  4 files changed, 147 insertions(+), 85 deletions(-)
>

There's something a bit hacky about the way brw_set_message_descriptor()
has always worked, and IMHO this patch makes that hackiness a lot more
troublesome.

The hackiness is: instead of computing the message descriptor we want as an
int32 and then passing it to brw_set_src1() using brw_imm_d(),
brw_set_message_descriptor() would pass brw_imm_d(0) to brw_set_src1(), and
then it would poke the proper message descriptor bits directly into the
brw_instruction::bits3 (which holds the immediate value used by the
instruction).

Previous to this patch, that hackiness was confined to just SEND
instructions.  But with this patch, brw_load_indirect_message_descriptor()
now does the same sort of hack to MOV and OR instructions in order to set
the msg_length, response_length, and header_present fields of the message
descriptor that's being dynamically computed.

I would much rather if we first refactored the code to deal with message
descriptors in a non-hacky way.  That is, instead of setting the immediate
value to 0 and then poking in the message descriptor bits, first compute
the correct message descriptor as an int32_t, and then store it in the SEND
instruction using brw_set_src1().  As part of this refactor, we would move
the message descriptor bitfield definitions out of brw_instruction::bits3
and into their own independent union.

Then, in this patch, instead of having
brw_load_indirect_message_descriptor() poke the constant parts of the
message descriptor into the OR or MOV instruction, it could just use the
new union to set the msg_length, response_length, and header_present fields
of the message descriptor, and then pass the resulting int32 value to
brw_MOV() or brw_OR() via brw_imm_d().  I think the resulting code would be
a lot easier to understand and maintain.

Additional comments below, though I'm not sure if they're all relevant
considering the refactor I'm suggesting above.

>
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h
> b/src/mesa/drivers/dri/i965/brw_eu.h
> index a6a65ca..45b421b 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -360,20 +360,20 @@ void brw_CMP(struct brw_compile *p,
>
>  void
>  brw_untyped_atomic(struct brw_compile *p,
> -                   struct brw_reg dest,
> +                   struct brw_reg dst,
>                     struct brw_reg mrf,
> -                   GLuint atomic_op,
> -                   GLuint bind_table_index,
> -                   GLuint msg_length,
> -                   GLuint response_length);
> +                   struct brw_reg surface,
> +                   unsigned atomic_op,
> +                   unsigned msg_length,
> +                   bool response_expected);
>
>  void
>  brw_untyped_surface_read(struct brw_compile *p,
> -                         struct brw_reg dest,
> +                         struct brw_reg dst,
>                           struct brw_reg mrf,
> -                         GLuint bind_table_index,
> -                         GLuint msg_length,
> -                         GLuint response_length);
> +                         struct brw_reg surface,
> +                         unsigned msg_length,
> +                         unsigned num_channels);
>
>  /***********************************************************************
>   * brw_eu_util.c:
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c
> b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> index cc093e0..b94a6d1 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> @@ -2527,23 +2527,87 @@ brw_svb_write(struct brw_compile *p,
>                              send_commit_msg); /* send_commit_msg */
>  }
>
> +static struct brw_instruction *
> +brw_load_indirect_message_descriptor(struct brw_compile *p,
> +                                     struct brw_reg dst,
> +                                     struct brw_reg src,
> +                                     unsigned msg_length,
> +                                     unsigned response_length,
> +                                     bool header_present)
> +{
> +   struct brw_instruction *insn;
> +
> +   brw_push_insn_state(p);
> +   brw_set_access_mode(p, BRW_ALIGN_1);
> +   brw_set_mask_control(p, BRW_MASK_DISABLE);
> +   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
> +
> +   if (src.file == BRW_IMMEDIATE_VALUE) {
> +      insn = brw_MOV(p, dst, brw_imm_ud(src.dw1.ud));
>

Why not just brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_UD))?

> +   } else {
> +      struct brw_reg tmp = suboffset(vec1(retype(src,
> BRW_REGISTER_TYPE_UD)),
> +                                     BRW_GET_SWZ(src.dw1.bits.swizzle,
> 0));
> +      insn = brw_OR(p, dst, tmp, brw_imm_ud(0));
> +   }
> +
> +   insn->bits3.generic_gen5.msg_length = msg_length;
> +   insn->bits3.generic_gen5.response_length = response_length;
> +   insn->bits3.generic_gen5.header_present = header_present;
> +
> +   brw_pop_insn_state(p);
> +
> +   return insn;
> +}
> +
> +static struct brw_instruction *
> +brw_send_indirect_message(struct brw_compile *p,
> +                          unsigned sfid,
> +                          struct brw_reg dst,
> +                          struct brw_reg mrf,
> +                          struct brw_reg desc)
> +{
> +   /* Due to a hardware limitation the message descriptor desc MUST be
> +    * stored in a0.0.  That means that there's only room for one
> +    * descriptor and the surface indices of different channels in the
> +    * same SIMD thread cannot diverge.  That's OK for the moment
> +    * because OpenGL requires image (and atomic counter) array
> +    * indexing to be dynamically uniform.
> +    */
>

A few points about this:

1. I would expect a comment like this to be followed by an assertion that
verifies that desc refers to address register 0.

2. Since the phrase "dynamically uniform" doesn't appear in
ARB_shader_image_load_store, a spec reference would be really good here.
Here's what I found:

GLSL 4.20 requires image (and atomic counter) array indexing to be
dynamically uniform.  Further, in the "changes from revision 9 of Version
4.20" section, it says "Correct inadvertently broad indexing by restricting
indexes of images and indexes of uniform blocks to being
dynamically-uniform integral expressions. This correction also applies to
earlier releases for uniform blocks (4.00 and 4.10)."  It seems reasonable
to assume that this correction should be applied to all implementations of
ARB_shader_image_load_store regardless of GLSL version.

3. Even with that clarified, I have a concern.  Consider the following
shader:

uniform int i;
uniform image2D images[4];

void main() {
   if (int(gl_FragCoord.x) % 2 != 0) {
      vec4 foo = imageLoad(images[i + 1], ivec2(0, 0));
      ...
   }
   ...
}

Is the expression "i + 1" (inside the "if" statement) allowed?  It is not
clear to me from the GLSL spec whether it's intended to be dynamically
uniform or not.  If it is intended to be allowed, then we have a problem,
because our code generator will only evaluate "i + 1" for fragments that
have int(gl_FragCoord.x) % 2 != 0.  It will leave garbage in the register
for any other fragments.  So when we compute the message descriptor for the
image load, if the fragment in SIMD channel 0 doesn't satisfy
int(gl_fragCoord.x) % 2 != 0, a0.0 will contain garbage, and the image load
will fail.

I have two ideas to address this:

(a) generate code that selects the first active SIMD channel and copies it
into a0.0.  Unfortunately this requires a nontrivial number of
instructions.  The best I can think of is something like this (assuming the
value we want to load is in register N):

   cmp.e.f0 (8) null tmp<8;8,1>UD tmp<8;8,1>UD (set f0 bit in all active
channels)
   fbl (1) tmp f0 {NoMask} (find first active channel)
   mul (1) tmp tmp 4 {NoMask} (compute byte offset of first active channel
within a register
   add (1) a0 tmp <32*N> {NoMask} (set a0 to point to first active channel
of register N)
   mov (1) tmp g[a0] {NoMask} (load tmp with the first active channel of
register N)

(b) during compilation, keep track of which expressions are dynamically
uniform.  When we emit code to compute a dynamically uniform value, emit it
with the {NoMask} option, so that all channels are computed.

The advantage of (a) is that it requires touching less Mesa code.  The
advantages of (b) are that it generates a more compact shader executable,
and it paves the way for more potential future optimizations (e.g. we could
use up less register space for dynamically uniform values, since we don't
have to compute them separately for each channel, and that would ease
register pressure).

> +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
> +
> +   brw_set_dest(p, insn, retype(dst, BRW_REGISTER_TYPE_UD));
> +   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
> +   brw_set_src1(p, insn, retype(desc, BRW_REGISTER_TYPE_UD));
> +
> +   /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
> +   insn->header.destreg__conditionalmod = sfid;
> +
> +   return insn;
> +}
> +
> +static unsigned
> +brw_surface_payload_size(struct brw_compile *p,
> +                         unsigned num_channels,
> +                         bool has_simd4x2,
> +                         bool has_simd16)
>

Some comments on this function would be helpful.  It wasn't clear on first
reading that has_simd4x2 and has_simd16 refer to the capabilities of the
hardware (e.g. has_simd4x2 is true iff the hardware has simd4x2 support for
the message being sent).  Also, it wasn't clear that this function is
computing the proper response length of the SEND message.

> +{
> +   if (has_simd4x2 && p->current->header.access_mode == BRW_ALIGN_16)
> +      return 1;
> +   else if (has_simd16 && p->compressed)
> +      return 2 * num_channels;
> +   else
> +      return num_channels;
> +}
> +
>  static void
>  brw_set_dp_untyped_atomic_message(struct brw_compile *p,
>                                    struct brw_instruction *insn,
> -                                  GLuint atomic_op,
> -                                  GLuint bind_table_index,
> -                                  GLuint msg_length,
> -                                  GLuint response_length,
> -                                  bool header_present)
> +                                  unsigned atomic_op,
> +                                  bool response_expected)
>  {
>     if (p->brw->is_haswell) {
> -      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
> -                                 msg_length, response_length,
> -                                 header_present, false);
> -
> -
> -      if (insn->header.access_mode == BRW_ALIGN_1) {
> -         if (insn->header.execution_size != BRW_EXECUTE_16)
> +      if (p->current->header.access_mode == BRW_ALIGN_1) {
> +         if (!p->compressed)
>              insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
>
>           insn->bits3.gen7_dp.msg_type =
> @@ -2554,95 +2618,90 @@ brw_set_dp_untyped_atomic_message(struct
> brw_compile *p,
>        }
>
>     } else {
> -      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
> -                                 msg_length, response_length,
> -                                 header_present, false);
> -
>        insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
>
> -      if (insn->header.execution_size != BRW_EXECUTE_16)
> +      if (!p->compressed)
>           insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
>     }
>
> -   if (response_length)
> +   if (response_expected)
>        insn->bits3.ud |= 1 << 13; /* Return data expected */
>
> -   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
>     insn->bits3.ud |= atomic_op << 8;
>  }
>
>  void
>  brw_untyped_atomic(struct brw_compile *p,
> -                   struct brw_reg dest,
> +                   struct brw_reg dst,
>                     struct brw_reg mrf,
> -                   GLuint atomic_op,
> -                   GLuint bind_table_index,
> -                   GLuint msg_length,
> -                   GLuint response_length) {
> -   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
> +                   struct brw_reg surface,
> +                   unsigned atomic_op,
> +                   unsigned msg_length,
> +                   bool response_expected)
> +{
> +   const unsigned sfid = (p->brw->is_haswell ?
> +                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
> +                          GEN7_SFID_DATAPORT_DATA_CACHE);
> +   const bool header_present = p->current->header.access_mode ==
> BRW_ALIGN_1;
> +   struct brw_reg desc = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
> +   struct brw_instruction *insn;
> +
> +   insn = brw_load_indirect_message_descriptor(
> +      p, desc, surface, msg_length,
> +      brw_surface_payload_size(p, response_expected, p->brw->is_haswell,
> true),
> +      header_present);
>

Passing surface directly into brw_load_indirect_message_descriptor isn't
safe, because the value in the "surface" register could potentially be
garbage, and brw_load_indirect_message_descriptor() forms the message
descriptor by OR'ing in additional flags.  That means that any bits in the
message descriptor could potentially become set, leading to a GPU hang.
Consider a shader like this:

uniform int i;
uniform image2D images[4];

void main() {
   vec4 foo = imageLoad(images[i], ivec2(0, 0));
   ...
}

If the value of "i" is out of range, then images[i] will cause a garbage
value to be loaded into the "surface" register.

To fix this, I'd recommend emitting an AND instruction to select just the
lower 8 bits (the binding table index) from the surface register.

>
> -   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
> -   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
> -   brw_set_src1(p, insn, brw_imm_d(0));
>     brw_set_dp_untyped_atomic_message(
> -      p, insn, atomic_op, bind_table_index, msg_length, response_length,
> -      insn->header.access_mode == BRW_ALIGN_1);
> +      p, insn, atomic_op, response_expected);
> +
> +   brw_send_indirect_message(p, sfid, dst, mrf, desc);
>  }
>
>  static void
>  brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
>                                          struct brw_instruction *insn,
> -                                        GLuint bind_table_index,
> -                                        GLuint msg_length,
> -                                        GLuint response_length,
> -                                        bool header_present)
> +                                        unsigned num_channels)
>  {
> -   const unsigned dispatch_width =
> -      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
> -   const unsigned num_channels = response_length / (dispatch_width / 8);
> -
> -   if (p->brw->is_haswell) {
> -      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
> -                                 msg_length, response_length,
> -                                 header_present, false);
> -
> -      insn->bits3.gen7_dp.msg_type =
> HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
> -   } else {
> -      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
> -                                 msg_length, response_length,
> -                                 header_present, false);
> -
> -      insn->bits3.gen7_dp.msg_type =
> GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
> -   }
> +   insn->bits3.gen7_dp.msg_type = (p->brw->is_haswell ?
> +
> HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
> +                                   GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ);
>
> -   if (insn->header.access_mode == BRW_ALIGN_1) {
> -      if (dispatch_width == 16)
> +   if (p->current->header.access_mode == BRW_ALIGN_1) {
> +      if (p->compressed)
>           insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
>        else
>           insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
> +   } else {
> +      insn->bits3.ud |= 0 << 12; /* SIMD4x2 mode */
>     }
>
> -   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
> -
>     /* Set mask of 32-bit channels to drop. */
>     insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
>  }
>
>  void
>  brw_untyped_surface_read(struct brw_compile *p,
> -                         struct brw_reg dest,
> +                         struct brw_reg dst,
>                           struct brw_reg mrf,
> -                         GLuint bind_table_index,
> -                         GLuint msg_length,
> -                         GLuint response_length)
> +                         struct brw_reg surface,
> +                         unsigned msg_length,
> +                         unsigned num_channels)
>  {
> -   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
> +   const unsigned sfid = (p->brw->is_haswell ?
> HSW_SFID_DATAPORT_DATA_CACHE_1 :
> +                          GEN7_SFID_DATAPORT_DATA_CACHE);
> +   const bool header_present = p->current->header.access_mode ==
> BRW_ALIGN_1;
> +   struct brw_reg desc = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
> +   struct brw_instruction *insn;
> +
> +   insn = brw_load_indirect_message_descriptor(
> +      p, desc, surface, msg_length,
> +      brw_surface_payload_size(p, num_channels, true, true),
> +      header_present);
>

The same situation with the "surface" register applies here.

>
> -   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
> -   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
>     brw_set_dp_untyped_surface_read_message(
> -      p, insn, bind_table_index, msg_length, response_length,
> -      insn->header.access_mode == BRW_ALIGN_1);
> +      p, insn, num_channels);
> +
> +   brw_send_indirect_message(p, sfid, dst, mrf, desc);
>  }
>
>  /**
> @@ -2681,8 +2740,13 @@ void brw_shader_time_add(struct brw_compile *p,
>                                        BRW_ARF_NULL, 0));
>     brw_set_src0(p, send, brw_vec1_reg(payload.file,
>                                        payload.nr, 0));
> -   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
> -                                     2 /* message length */,
> -                                     0 /* response length */,
> -                                     false /* header present */);
> +   brw_set_src1(p, send, brw_imm_ud(0));
> +   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
> +
> +   /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
> +   send->header.destreg__conditionalmod =
> +      (p->brw->is_haswell ? HSW_SFID_DATAPORT_DATA_CACHE_1 :
> +       GEN7_SFID_DATAPORT_DATA_CACHE);
> +   send->bits3.generic_gen5.msg_length = 2;
> +   send->bits3.ud |= surf_index;
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index 9c2c318..4eb651f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -1265,8 +1265,8 @@ fs_generator::generate_untyped_atomic(fs_inst *inst,
> struct brw_reg dst,
>           surf_index.type == BRW_REGISTER_TYPE_UD);
>
>     brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
> -                      atomic_op.dw1.ud, surf_index.dw1.ud,
> -                      inst->mlen, dispatch_width / 8);
> +                      surf_index, atomic_op.dw1.ud,
> +                      inst->mlen, true);
>
>     brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
>  }
> @@ -1279,8 +1279,7 @@ fs_generator::generate_untyped_surface_read(fs_inst
> *inst, struct brw_reg dst,
>           surf_index.type == BRW_REGISTER_TYPE_UD);
>
>     brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
> -                            surf_index.dw1.ud,
> -                            inst->mlen, dispatch_width / 8);
> +                            surf_index, inst->mlen, 1);
>
>     brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index 83a2a27..3ac45a9 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -859,8 +859,8 @@
> vec4_generator::generate_untyped_atomic(vec4_instruction *inst,
>           surf_index.type == BRW_REGISTER_TYPE_UD);
>
>     brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
> -                      atomic_op.dw1.ud, surf_index.dw1.ud,
> -                      inst->mlen, 1);
> +                      surf_index, atomic_op.dw1.ud,
> +                      inst->mlen, true);
>
>     brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
>  }
> @@ -874,8 +874,7 @@
> vec4_generator::generate_untyped_surface_read(vec4_instruction *inst,
>           surf_index.type == BRW_REGISTER_TYPE_UD);
>
>     brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
> -                            surf_index.dw1.ud,
> -                            inst->mlen, 1);
> +                            surf_index, inst->mlen, 1);
>
>     brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
>  }
> --
> 1.8.3.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20131231/c2e7be91/attachment-0001.html>