[Mesa-dev] [PATCH 15/53] intel/fs: Set up FB write message headers in the visitor

Sat May 26 21:12:00 UTC 2018

Francisco Jerez <currojerez at riseup.net> writes:

> Jason Ekstrand <jason at jlekstrand.net> writes:
>
>> On Sat, May 26, 2018 at 12:15 PM, Francisco Jerez <currojerez at riseup.net>
>> wrote:
>>
>>> Jason Ekstrand <jason at jlekstrand.net> writes:
>>>
>>> > On Sat, May 26, 2018 at 11:27 AM, Francisco Jerez <currojerez at riseup.net
>>> >
>>> > wrote:
>>> >
>>> >> Jason Ekstrand <jason at jlekstrand.net> writes:
>>> >>
>>> >> > Doing instruction header setup in the generator is aweful for a number
>>> >> > of reasons.  For one, we can't schedule the header setup at all.  For
>>> >> > another, it means lots of implied writes which the instruction
>>> scheduler
>>> >> > and other passes can't properly read about.  The second isn't a huge
>>> >> > problem for FB writes since they always happen at the end.  We made a
>>> >> > similar change to sampler handling in ff4726077d86.
>>> >> > ---
>>> >> >  src/intel/compiler/brw_fs.cpp           | 103
>>> >> ++++++++++++++++++++++++++------
>>> >> >  src/intel/compiler/brw_fs_generator.cpp |  66 --------------------
>>> >> >  2 files changed, 86 insertions(+), 83 deletions(-)
>>> >> >
>>> >> > diff --git a/src/intel/compiler/brw_fs.cpp
>>> b/src/intel/compiler/brw_fs.
>>> >> cpp
>>> >> > index 3e9ccc4..5276a66 100644
>>> >> > --- a/src/intel/compiler/brw_fs.cpp
>>> >> > +++ b/src/intel/compiler/brw_fs.cpp
>>> >> > @@ -3247,7 +3247,18 @@ fs_visitor::emit_repclear_shader()
>>> >> >        write->mlen = 1;
>>> >> >     } else {
>>> >> >        assume(key->nr_color_regions > 0);
>>> >> > +
>>> >> > +      struct brw_reg header =
>>> >> > +         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
>>> >> > +      bld.exec_all().group(16, 0)
>>> >> > +         .MOV(header, retype(brw_vec8_grf(0, 0),
>>> BRW_REGISTER_TYPE_UD));
>>> >> > +
>>> >> >        for (int i = 0; i < key->nr_color_regions; ++i) {
>>> >> > +         if (i > 0) {
>>> >> > +            bld.exec_all().group(1, 0)
>>> >> > +               .MOV(component(header, 2), brw_imm_ud(i));
>>> >> > +         }
>>> >> > +
>>> >> >           write = bld.emit(FS_OPCODE_REP_FB_WRITE);
>>> >> >           write->saturate = key->clamp_fragment_color;
>>> >> >           write->base_mrf = base_mrf;
>>> >> > @@ -3972,25 +3983,83 @@ lower_fb_write_logical_send(const fs_builder
>>> >> &bld, fs_inst *inst,
>>> >> >     int header_size = 2, payload_header_size;
>>> >> >     unsigned length = 0;
>>> >> >
>>> >> > -   /* From the Sandy Bridge PRM, volume 4, page 198:
>>> >> > -    *
>>> >> > -    *     "Dispatched Pixel Enables. One bit per pixel indicating
>>> >> > -    *      which pixels were originally enabled when the thread was
>>> >> > -    *      dispatched. This field is only required for the end-of-
>>> >> > -    *      thread message and on all dual-source messages."
>>> >> > -    */
>>> >> > -   if (devinfo->gen >= 6 &&
>>> >> > -       (devinfo->is_haswell || devinfo->gen >= 8 ||
>>> >> !prog_data->uses_kill) &&
>>> >> > -       color1.file == BAD_FILE &&
>>> >> > -       key->nr_color_regions == 1) {
>>> >> > -      header_size = 0;
>>> >> > -   }
>>> >> > +   if (devinfo->gen < 6) {
>>> >> > +      /* For gen4-5, we always have a header consisting of g0 and g1.
>>> >> We have
>>> >> > +       * an implied MOV from g0,g1 to the start of the message.  The
>>> >> MOV from
>>> >> > +       * g0 is handled by the hardware and the MOV from g1 is
>>> provided
>>> >> by the
>>> >> > +       * generator.  This is required because, on gen4-5, the
>>> generator
>>> >> may
>>> >> > +       * generate two write messages with different message lengths
>>> in
>>> >> order
>>> >> > +       * to handle AA data properly.
>>> >> > +       *
>>> >> > +       * Also, since the pixel mask goes in the g0 portion of the
>>> >> message and
>>> >> > +       * since render target writes are the last thing in the shader,
>>> >> we write
>>> >> > +       * the pixel mask directly into g0 and it will get copied as
>>> part
>>> >> of the
>>> >> > +       * implied write.
>>> >> > +       */
>>> >> > +      if (prog_data->uses_kill) {
>>> >> > +         bld.exec_all().group(1, 0)
>>> >> > +            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
>>> >> > +                 brw_flag_reg(0, 1));
>>> >> > +      }
>>> >> > +
>>> >> > +      assert(length == 0);
>>> >> > +      length = 2;
>>> >> > +   } else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
>>> >> > +               prog_data->uses_kill) ||
>>> >> > +              color1.file != BAD_FILE ||
>>> >> > +              key->nr_color_regions > 1) {
>>> >> > +      /* From the Sandy Bridge PRM, volume 4, page 198:
>>> >> > +       *
>>> >> > +       *     "Dispatched Pixel Enables. One bit per pixel indicating
>>> >> > +       *      which pixels were originally enabled when the thread
>>> was
>>> >> > +       *      dispatched. This field is only required for the end-of-
>>> >> > +       *      thread message and on all dual-source messages."
>>> >> > +       */
>>> >> > +      const fs_builder ubld = bld.exec_all().group(8, 0);
>>> >> > +
>>> >> > +      /* The header starts off as g0 and g1 */
>>> >> > +      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
>>> >> > +      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
>>> >> > +                                           BRW_REGISTER_TYPE_UD));
>>> >> > +
>>> >> > +      uint32_t g00_bits = 0;
>>> >> > +
>>> >> > +      /* Set "Source0 Alpha Present to RenderTarget" bit in message
>>> >> > +       * header.
>>> >> > +       */
>>> >> > +      if (inst->target > 0 && key->replicate_alpha)
>>> >> > +         g00_bits |= 1 << 11;
>>> >> > +
>>> >> > +      /* Set computes stencil to render target */
>>> >> > +      if (prog_data->computed_stencil)
>>> >> > +         g00_bits |= 1 << 14;
>>> >> > +
>>> >> > +      if (g00_bits) {
>>> >> > +         /* OR extra bits into g0.0 */
>>> >> > +         ubld.group(1, 0).OR(component(header, 0),
>>> >> > +                             retype(brw_vec1_grf(0, 0),
>>> >> > +                                    BRW_REGISTER_TYPE_UD),
>>> >> > +                             brw_imm_ud(g00_bits));
>>> >> > +      }
>>> >> > +
>>> >> > +      /* Set the render target index for choosing BLEND_STATE. */
>>> >> > +      if (inst->target > 0) {
>>> >> > +         ubld.group(1, 0).MOV(component(header, 2),
>>> >> brw_imm_ud(inst->target));
>>> >> > +      }
>>> >> > +
>>> >> > +      if (prog_data->uses_kill) {
>>> >> > +         ubld.group(1, 0).MOV(retype(component(header, 15),
>>> >> > +                                     BRW_REGISTER_TYPE_UW),
>>> >> > +                              brw_flag_reg(0, 1));
>>> >> > +      }
>>> >>
>>> >> The original patch from my branch implemented the header setup in a
>>> >> helper function called setup_fb_write_header() which allowed it to share
>>> >> the code among emit_repclear_shader() and lower_fb_write_logical_send().
>>> >> I don't think there is any benefit from open-coding the header setup in
>>> >> the already complex enough callers, seems like an obfuscation to me
>>> >> (particularly of the emit_repclear_shader() codepath with PATCH 52 in
>>> >> mind).
>>> >>
>>> >
>>> > Yeah, I think this one could go either way, TBH.  Having it in a helper
>>> > avoids some setup in emit_repclear_shader (though not that much) and is
>>> > nice because it's sort-of functional in nature.  The thing I like about
>>> > this approach is that it combines the decision to use the header with the
>>> > code to create said header.  We could do that in a helper I suppose and
>>> > have it return BAD_FILE if no header is needed.
>>> >
>>>
>>> Yes, a helper could combine the decision of whether to send a header or
>>> not pretty easily.
>>>
>>> > Also, the "allocate and return a register" model ended up resulting in
>>> > worse code on gen4-5 because it didn't play nicely with the implied MRF
>>> > write.  On those platforms, it's easier to set the header up by doing the
>>> > needed MOVs to g0 and letting the implied MRF write copy it into the
>>> > message.
>>> >
>>>
>>> I don't think there's much of a performance benefit from an implied MRF
>>> write relative to a favorably scheduled single-GRF MOV on Gen4-5.
>>> Actually under some conditions it can take more EU cycles because you
>>> eat the latency of the implied MOV without being able to schedule
>>> anything in between.
>>>
>>
>> The bigger problem is that we either have to keep the impled MRF write or
>> we have to handle the AA data workaround in the visitor as well.
>> Ultimately, that's probably what we want to do.  I'm trying to avoid
>> debugging gen4...
>>
>
> That's no big deal, the implied MRF write can probably stay for the
> moment,

In the "runtime_check_aads_emit == true" case that is.

> whether the header is set up in the visitor or in the generator.
> The question is just whether the source of the implied move will be r0
> or the first payload register of the FB write.
>
>>
>>> > As far as complexity goes, I'm not sure.  Setting up FB writes is an
>>> > inherently complex operation.  Probably the biggest thing I didn't like
>>> > about having it be in a helper was that the header code was around 7-800
>>> > lines away from the code to set up the rest of the FB write.  That could
>>> > have been solved by a pre-declaration, however.
>>> >
>>>
>>> It's a couple of keystrokes away if your editor supports some sort of
>>> search function.  And I appreciate the possibility not to see the header
>>> setup code while dealing with the FB write lowering code if it's not
>>> what I'm looking for.  And the possibility to share code which my
>>> original patch took advantage of.
>>>
>>> > You are free to continue to disagree with my approach.  Hopefully, I've
>>> at
>>> > least convinced you that I didn't do it just because I like straight
>>> inline
>>> > code (which I do like).
>>> >
>>>
>>> Ugh...  I was just thinking you had inlined it in order to
>>> micro-optimize it, but knowing you actually prefer a bunch of hairy code
>>> over a single call of a well-behaved abstraction (it was indeed
>>> referentially transparent and close to a pure function) to achieve the
>>> same effect makes me even more shocked ;P
>>>
>>
>> We have always had different definitions of code cleanliness. :-)
>>
>
> Didn't ever think we disagreed on structured programming being useful to
> write code which is easily maintainable and reusable... :P
>
>> We can switch it back to being a function that returns a register though
>> it'll take a little reworking of gen4 as mentioned above.  I don't think it
>> really saves us anything but whatever...
>>
>> --Jason
>>
>>
>>> > --Jason
>>> >
>>> >
>>> >> >
>>> >> > -   if (header_size != 0) {
>>> >> > -      assert(header_size == 2);
>>> >> > -      /* Allocate 2 registers for a header */
>>> >> > -      length += 2;
>>> >> > +      assert(length == 0);
>>> >> > +      sources[0] = header;
>>> >> > +      sources[1] = horiz_offset(header, 8);
>>> >> > +      length = 2;
>>> >> >     }
>>> >> > +   assert(length == 0 || length == 2);
>>> >> > +   header_size = length;
>>> >> >
>>> >> >     if (payload.aa_dest_stencil_reg) {
>>> >> >        sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
>>> >> > diff --git a/src/intel/compiler/brw_fs_generator.cpp
>>> >> b/src/intel/compiler/brw_fs_generator.cpp
>>> >> > index 8cc9d31..1a2b961 100644
>>> >> > --- a/src/intel/compiler/brw_fs_generator.cpp
>>> >> > +++ b/src/intel/compiler/brw_fs_generator.cpp
>>> >> > @@ -307,9 +307,6 @@ fs_generator::fire_fb_write(fs_inst *inst,
>>> >> >  void
>>> >> >  fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg
>>> payload)
>>> >> >  {
>>> >> > -   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_
>>> >> data);
>>> >> > -   const brw_wm_prog_key * const key = (brw_wm_prog_key * const)
>>> >> this->key;
>>> >> > -
>>> >> >     if (devinfo->gen < 8 && !devinfo->is_haswell) {
>>> >> >        brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
>>> >> >     }
>>> >> > @@ -320,69 +317,6 @@ fs_generator::generate_fb_write(fs_inst *inst,
>>> >> struct brw_reg payload)
>>> >> >     if (inst->base_mrf >= 0)
>>> >> >        payload = brw_message_reg(inst->base_mrf);
>>> >> >
>>> >> > -   /* Header is 2 regs, g0 and g1 are the contents. g0 will be
>>> implied
>>> >> > -    * move, here's g1.
>>> >> > -    */
>>> >> > -   if (inst->header_size != 0) {
>>> >> > -      brw_push_insn_state(p);
>>> >> > -      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
>>> >> > -      brw_set_default_exec_size(p, BRW_EXECUTE_1);
>>> >> > -      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
>>> >> > -      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
>>> >> > -      brw_set_default_flag_reg(p, 0, 0);
>>> >> > -
>>> >> > -      /* On HSW, the GPU will use the predicate on SENDC, unless the
>>> >> header is
>>> >> > -       * present.
>>> >> > -       */
>>> >> > -      if (prog_data->uses_kill) {
>>> >> > -         struct brw_reg pixel_mask;
>>> >> > -
>>> >> > -         if (devinfo->gen >= 6)
>>> >> > -            pixel_mask = retype(brw_vec1_grf(1, 7),
>>> >> BRW_REGISTER_TYPE_UW);
>>> >> > -         else
>>> >> > -            pixel_mask = retype(brw_vec1_grf(0, 0),
>>> >> BRW_REGISTER_TYPE_UW);
>>> >> > -
>>> >> > -         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
>>> >> > -      }
>>> >> > -
>>> >> > -      if (devinfo->gen >= 6) {
>>> >> > -         brw_push_insn_state(p);
>>> >> > -         brw_set_default_exec_size(p, BRW_EXECUTE_16);
>>> >> > -      brw_set_default_compression_control(p,
>>> >> BRW_COMPRESSION_COMPRESSED);
>>> >> > -      brw_MOV(p,
>>> >> > -              retype(payload, BRW_REGISTER_TYPE_UD),
>>> >> > -              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
>>> >> > -         brw_pop_insn_state(p);
>>> >> > -
>>> >> > -         if (inst->target > 0 && key->replicate_alpha) {
>>> >> > -            /* Set "Source0 Alpha Present to RenderTarget" bit in
>>> >> message
>>> >> > -             * header.
>>> >> > -             */
>>> >> > -            brw_OR(p,
>>> >> > -                vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
>>> >> > -                vec1(retype(brw_vec8_grf(0, 0),
>>> BRW_REGISTER_TYPE_UD)),
>>> >> > -                brw_imm_ud(0x1 << 11));
>>> >> > -         }
>>> >> > -
>>> >> > -      if (inst->target > 0) {
>>> >> > -         /* Set the render target index for choosing BLEND_STATE. */
>>> >> > -         brw_MOV(p, retype(vec1(suboffset(payload, 2)),
>>> >> > -                              BRW_REGISTER_TYPE_UD),
>>> >> > -                 brw_imm_ud(inst->target));
>>> >> > -      }
>>> >> > -
>>> >> > -         /* Set computes stencil to render target */
>>> >> > -         if (prog_data->computed_stencil) {
>>> >> > -            brw_OR(p,
>>> >> > -                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
>>> >> > -                   vec1(retype(brw_vec8_grf(0, 0),
>>> >> BRW_REGISTER_TYPE_UD)),
>>> >> > -                   brw_imm_ud(0x1 << 14));
>>> >> > -         }
>>> >> > -      }
>>> >> > -
>>> >> > -      brw_pop_insn_state(p);
>>> >> > -   }
>>> >> > -
>>> >> >     if (!runtime_check_aads_emit) {
>>> >> >        fire_fb_write(inst, payload, implied_header, inst->mlen);
>>> >> >     } else {
>>> >> > --
>>> >> > 2.5.0.400.gff86faf
>>> >> >
>>> >> > _______________________________________________
>>> >> > mesa-dev mailing list
>>> >> > mesa-dev at lists.freedesktop.org
>>> >> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>> >>
>>>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 227 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180526/65a4b83e/attachment-0001.sig>