[Mesa-dev] [PATCH 15/53] intel/fs: Set up FB write message headers in the visitor

Sat May 26 21:05:14 UTC 2018

Jason Ekstrand <jason at jlekstrand.net> writes:

> On Sat, May 26, 2018 at 12:15 PM, Francisco Jerez <currojerez at riseup.net>
> wrote:
>
>> Jason Ekstrand <jason at jlekstrand.net> writes:
>>
>> > On Sat, May 26, 2018 at 11:27 AM, Francisco Jerez <currojerez at riseup.net
>> >
>> > wrote:
>> >
>> >> Jason Ekstrand <jason at jlekstrand.net> writes:
>> >>
>> >> > Doing instruction header setup in the generator is aweful for a number
>> >> > of reasons.  For one, we can't schedule the header setup at all.  For
>> >> > another, it means lots of implied writes which the instruction
>> scheduler
>> >> > and other passes can't properly read about.  The second isn't a huge
>> >> > problem for FB writes since they always happen at the end.  We made a
>> >> > similar change to sampler handling in ff4726077d86.
>> >> > ---
>> >> >  src/intel/compiler/brw_fs.cpp           | 103
>> >> ++++++++++++++++++++++++++------
>> >> >  src/intel/compiler/brw_fs_generator.cpp |  66 --------------------
>> >> >  2 files changed, 86 insertions(+), 83 deletions(-)
>> >> >
>> >> > diff --git a/src/intel/compiler/brw_fs.cpp
>> b/src/intel/compiler/brw_fs.
>> >> cpp
>> >> > index 3e9ccc4..5276a66 100644
>> >> > --- a/src/intel/compiler/brw_fs.cpp
>> >> > +++ b/src/intel/compiler/brw_fs.cpp
>> >> > @@ -3247,7 +3247,18 @@ fs_visitor::emit_repclear_shader()
>> >> >        write->mlen = 1;
>> >> >     } else {
>> >> >        assume(key->nr_color_regions > 0);
>> >> > +
>> >> > +      struct brw_reg header =
>> >> > +         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
>> >> > +      bld.exec_all().group(16, 0)
>> >> > +         .MOV(header, retype(brw_vec8_grf(0, 0),
>> BRW_REGISTER_TYPE_UD));
>> >> > +
>> >> >        for (int i = 0; i < key->nr_color_regions; ++i) {
>> >> > +         if (i > 0) {
>> >> > +            bld.exec_all().group(1, 0)
>> >> > +               .MOV(component(header, 2), brw_imm_ud(i));
>> >> > +         }
>> >> > +
>> >> >           write = bld.emit(FS_OPCODE_REP_FB_WRITE);
>> >> >           write->saturate = key->clamp_fragment_color;
>> >> >           write->base_mrf = base_mrf;
>> >> > @@ -3972,25 +3983,83 @@ lower_fb_write_logical_send(const fs_builder
>> >> &bld, fs_inst *inst,
>> >> >     int header_size = 2, payload_header_size;
>> >> >     unsigned length = 0;
>> >> >
>> >> > -   /* From the Sandy Bridge PRM, volume 4, page 198:
>> >> > -    *
>> >> > -    *     "Dispatched Pixel Enables. One bit per pixel indicating
>> >> > -    *      which pixels were originally enabled when the thread was
>> >> > -    *      dispatched. This field is only required for the end-of-
>> >> > -    *      thread message and on all dual-source messages."
>> >> > -    */
>> >> > -   if (devinfo->gen >= 6 &&
>> >> > -       (devinfo->is_haswell || devinfo->gen >= 8 ||
>> >> !prog_data->uses_kill) &&
>> >> > -       color1.file == BAD_FILE &&
>> >> > -       key->nr_color_regions == 1) {
>> >> > -      header_size = 0;
>> >> > -   }
>> >> > +   if (devinfo->gen < 6) {
>> >> > +      /* For gen4-5, we always have a header consisting of g0 and g1.
>> >> We have
>> >> > +       * an implied MOV from g0,g1 to the start of the message.  The
>> >> MOV from
>> >> > +       * g0 is handled by the hardware and the MOV from g1 is
>> provided
>> >> by the
>> >> > +       * generator.  This is required because, on gen4-5, the
>> generator
>> >> may
>> >> > +       * generate two write messages with different message lengths
>> in
>> >> order
>> >> > +       * to handle AA data properly.
>> >> > +       *
>> >> > +       * Also, since the pixel mask goes in the g0 portion of the
>> >> message and
>> >> > +       * since render target writes are the last thing in the shader,
>> >> we write
>> >> > +       * the pixel mask directly into g0 and it will get copied as
>> part
>> >> of the
>> >> > +       * implied write.
>> >> > +       */
>> >> > +      if (prog_data->uses_kill) {
>> >> > +         bld.exec_all().group(1, 0)
>> >> > +            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
>> >> > +                 brw_flag_reg(0, 1));
>> >> > +      }
>> >> > +
>> >> > +      assert(length == 0);
>> >> > +      length = 2;
>> >> > +   } else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
>> >> > +               prog_data->uses_kill) ||
>> >> > +              color1.file != BAD_FILE ||
>> >> > +              key->nr_color_regions > 1) {
>> >> > +      /* From the Sandy Bridge PRM, volume 4, page 198:
>> >> > +       *
>> >> > +       *     "Dispatched Pixel Enables. One bit per pixel indicating
>> >> > +       *      which pixels were originally enabled when the thread
>> was
>> >> > +       *      dispatched. This field is only required for the end-of-
>> >> > +       *      thread message and on all dual-source messages."
>> >> > +       */
>> >> > +      const fs_builder ubld = bld.exec_all().group(8, 0);
>> >> > +
>> >> > +      /* The header starts off as g0 and g1 */
>> >> > +      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
>> >> > +      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
>> >> > +                                           BRW_REGISTER_TYPE_UD));
>> >> > +
>> >> > +      uint32_t g00_bits = 0;
>> >> > +
>> >> > +      /* Set "Source0 Alpha Present to RenderTarget" bit in message
>> >> > +       * header.
>> >> > +       */
>> >> > +      if (inst->target > 0 && key->replicate_alpha)
>> >> > +         g00_bits |= 1 << 11;
>> >> > +
>> >> > +      /* Set computes stencil to render target */
>> >> > +      if (prog_data->computed_stencil)
>> >> > +         g00_bits |= 1 << 14;
>> >> > +
>> >> > +      if (g00_bits) {
>> >> > +         /* OR extra bits into g0.0 */
>> >> > +         ubld.group(1, 0).OR(component(header, 0),
>> >> > +                             retype(brw_vec1_grf(0, 0),
>> >> > +                                    BRW_REGISTER_TYPE_UD),
>> >> > +                             brw_imm_ud(g00_bits));
>> >> > +      }
>> >> > +
>> >> > +      /* Set the render target index for choosing BLEND_STATE. */
>> >> > +      if (inst->target > 0) {
>> >> > +         ubld.group(1, 0).MOV(component(header, 2),
>> >> brw_imm_ud(inst->target));
>> >> > +      }
>> >> > +
>> >> > +      if (prog_data->uses_kill) {
>> >> > +         ubld.group(1, 0).MOV(retype(component(header, 15),
>> >> > +                                     BRW_REGISTER_TYPE_UW),
>> >> > +                              brw_flag_reg(0, 1));
>> >> > +      }
>> >>
>> >> The original patch from my branch implemented the header setup in a
>> >> helper function called setup_fb_write_header() which allowed it to share
>> >> the code among emit_repclear_shader() and lower_fb_write_logical_send().
>> >> I don't think there is any benefit from open-coding the header setup in
>> >> the already complex enough callers, seems like an obfuscation to me
>> >> (particularly of the emit_repclear_shader() codepath with PATCH 52 in
>> >> mind).
>> >>
>> >
>> > Yeah, I think this one could go either way, TBH.  Having it in a helper
>> > avoids some setup in emit_repclear_shader (though not that much) and is
>> > nice because it's sort-of functional in nature.  The thing I like about
>> > this approach is that it combines the decision to use the header with the
>> > code to create said header.  We could do that in a helper I suppose and
>> > have it return BAD_FILE if no header is needed.
>> >
>>
>> Yes, a helper could combine the decision of whether to send a header or
>> not pretty easily.
>>
>> > Also, the "allocate and return a register" model ended up resulting in
>> > worse code on gen4-5 because it didn't play nicely with the implied MRF
>> > write.  On those platforms, it's easier to set the header up by doing the
>> > needed MOVs to g0 and letting the implied MRF write copy it into the
>> > message.
>> >
>>
>> I don't think there's much of a performance benefit from an implied MRF
>> write relative to a favorably scheduled single-GRF MOV on Gen4-5.
>> Actually under some conditions it can take more EU cycles because you
>> eat the latency of the implied MOV without being able to schedule
>> anything in between.
>>
>
> The bigger problem is that we either have to keep the impled MRF write or
> we have to handle the AA data workaround in the visitor as well.
> Ultimately, that's probably what we want to do.  I'm trying to avoid
> debugging gen4...
>

That's no big deal, the implied MRF write can probably stay for the
moment, whether the header is set up in the visitor or in the generator.
The question is just whether the source of the implied move will be r0
or the first payload register of the FB write.

>
>> > As far as complexity goes, I'm not sure.  Setting up FB writes is an
>> > inherently complex operation.  Probably the biggest thing I didn't like
>> > about having it be in a helper was that the header code was around 7-800
>> > lines away from the code to set up the rest of the FB write.  That could
>> > have been solved by a pre-declaration, however.
>> >
>>
>> It's a couple of keystrokes away if your editor supports some sort of
>> search function.  And I appreciate the possibility not to see the header
>> setup code while dealing with the FB write lowering code if it's not
>> what I'm looking for.  And the possibility to share code which my
>> original patch took advantage of.
>>
>> > You are free to continue to disagree with my approach.  Hopefully, I've
>> at
>> > least convinced you that I didn't do it just because I like straight
>> inline
>> > code (which I do like).
>> >
>>
>> Ugh...  I was just thinking you had inlined it in order to
>> micro-optimize it, but knowing you actually prefer a bunch of hairy code
>> over a single call of a well-behaved abstraction (it was indeed
>> referentially transparent and close to a pure function) to achieve the
>> same effect makes me even more shocked ;P
>>
>
> We have always had different definitions of code cleanliness. :-)
>

Didn't ever think we disagreed on structured programming being useful to
write code which is easily maintainable and reusable... :P

> We can switch it back to being a function that returns a register though
> it'll take a little reworking of gen4 as mentioned above.  I don't think it
> really saves us anything but whatever...
>
> --Jason
>
>
>> > --Jason
>> >
>> >
>> >> >
>> >> > -   if (header_size != 0) {
>> >> > -      assert(header_size == 2);
>> >> > -      /* Allocate 2 registers for a header */
>> >> > -      length += 2;
>> >> > +      assert(length == 0);
>> >> > +      sources[0] = header;
>> >> > +      sources[1] = horiz_offset(header, 8);
>> >> > +      length = 2;
>> >> >     }
>> >> > +   assert(length == 0 || length == 2);
>> >> > +   header_size = length;
>> >> >
>> >> >     if (payload.aa_dest_stencil_reg) {
>> >> >        sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
>> >> > diff --git a/src/intel/compiler/brw_fs_generator.cpp
>> >> b/src/intel/compiler/brw_fs_generator.cpp
>> >> > index 8cc9d31..1a2b961 100644
>> >> > --- a/src/intel/compiler/brw_fs_generator.cpp
>> >> > +++ b/src/intel/compiler/brw_fs_generator.cpp
>> >> > @@ -307,9 +307,6 @@ fs_generator::fire_fb_write(fs_inst *inst,
>> >> >  void
>> >> >  fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg
>> payload)
>> >> >  {
>> >> > -   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_
>> >> data);
>> >> > -   const brw_wm_prog_key * const key = (brw_wm_prog_key * const)
>> >> this->key;
>> >> > -
>> >> >     if (devinfo->gen < 8 && !devinfo->is_haswell) {
>> >> >        brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
>> >> >     }
>> >> > @@ -320,69 +317,6 @@ fs_generator::generate_fb_write(fs_inst *inst,
>> >> struct brw_reg payload)
>> >> >     if (inst->base_mrf >= 0)
>> >> >        payload = brw_message_reg(inst->base_mrf);
>> >> >
>> >> > -   /* Header is 2 regs, g0 and g1 are the contents. g0 will be
>> implied
>> >> > -    * move, here's g1.
>> >> > -    */
>> >> > -   if (inst->header_size != 0) {
>> >> > -      brw_push_insn_state(p);
>> >> > -      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
>> >> > -      brw_set_default_exec_size(p, BRW_EXECUTE_1);
>> >> > -      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
>> >> > -      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
>> >> > -      brw_set_default_flag_reg(p, 0, 0);
>> >> > -
>> >> > -      /* On HSW, the GPU will use the predicate on SENDC, unless the
>> >> header is
>> >> > -       * present.
>> >> > -       */
>> >> > -      if (prog_data->uses_kill) {
>> >> > -         struct brw_reg pixel_mask;
>> >> > -
>> >> > -         if (devinfo->gen >= 6)
>> >> > -            pixel_mask = retype(brw_vec1_grf(1, 7),
>> >> BRW_REGISTER_TYPE_UW);
>> >> > -         else
>> >> > -            pixel_mask = retype(brw_vec1_grf(0, 0),
>> >> BRW_REGISTER_TYPE_UW);
>> >> > -
>> >> > -         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
>> >> > -      }
>> >> > -
>> >> > -      if (devinfo->gen >= 6) {
>> >> > -         brw_push_insn_state(p);
>> >> > -         brw_set_default_exec_size(p, BRW_EXECUTE_16);
>> >> > -      brw_set_default_compression_control(p,
>> >> BRW_COMPRESSION_COMPRESSED);
>> >> > -      brw_MOV(p,
>> >> > -              retype(payload, BRW_REGISTER_TYPE_UD),
>> >> > -              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
>> >> > -         brw_pop_insn_state(p);
>> >> > -
>> >> > -         if (inst->target > 0 && key->replicate_alpha) {
>> >> > -            /* Set "Source0 Alpha Present to RenderTarget" bit in
>> >> message
>> >> > -             * header.
>> >> > -             */
>> >> > -            brw_OR(p,
>> >> > -                vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
>> >> > -                vec1(retype(brw_vec8_grf(0, 0),
>> BRW_REGISTER_TYPE_UD)),
>> >> > -                brw_imm_ud(0x1 << 11));
>> >> > -         }
>> >> > -
>> >> > -      if (inst->target > 0) {
>> >> > -         /* Set the render target index for choosing BLEND_STATE. */
>> >> > -         brw_MOV(p, retype(vec1(suboffset(payload, 2)),
>> >> > -                              BRW_REGISTER_TYPE_UD),
>> >> > -                 brw_imm_ud(inst->target));
>> >> > -      }
>> >> > -
>> >> > -         /* Set computes stencil to render target */
>> >> > -         if (prog_data->computed_stencil) {
>> >> > -            brw_OR(p,
>> >> > -                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
>> >> > -                   vec1(retype(brw_vec8_grf(0, 0),
>> >> BRW_REGISTER_TYPE_UD)),
>> >> > -                   brw_imm_ud(0x1 << 14));
>> >> > -         }
>> >> > -      }
>> >> > -
>> >> > -      brw_pop_insn_state(p);
>> >> > -   }
>> >> > -
>> >> >     if (!runtime_check_aads_emit) {
>> >> >        fire_fb_write(inst, payload, implied_header, inst->mlen);
>> >> >     } else {
>> >> > --
>> >> > 2.5.0.400.gff86faf
>> >> >
>> >> > _______________________________________________
>> >> > mesa-dev mailing list
>> >> > mesa-dev at lists.freedesktop.org
>> >> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>> >>
>>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 227 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180526/78d17e3a/attachment.sig>