[Mesa-dev] [PATCH v3 01/48] intel/fs: Pass builders instead of blocks into emit_[un]zip

Thu Oct 26 08:06:50 UTC 2017

On Wed, 2017-10-25 at 16:25 -0700, Jason Ekstrand wrote:
> This makes it far more explicit where we're inserting the
> instructions
> rather than the magic "before and after" stuff that the emit_[un]zip
> helpers did based on block and inst.
> 
> Cc: mesa-stable at lists.freedesktop.org
> ---
>  src/intel/compiler/brw_fs.cpp | 50 ++++++++++++++++++++++++---------
> ----------
>  1 file changed, 28 insertions(+), 22 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> index 4616529..49ca58d 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -5025,8 +5025,7 @@ needs_src_copy(const fs_builder &lbld, const
> fs_inst *inst, unsigned i)
>   * will be emitted before the given \p inst in \p block.
>   */
>  static fs_reg
> -emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
> -           unsigned i)
> +emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)

The comment right above this function needs to be updated accordingly.

>  {
>     /* Specified channel group from the source region. */
>     const fs_reg src = horiz_offset(inst->src[i], lbld.group());
> @@ -5041,8 +5040,7 @@ emit_unzip(const fs_builder &lbld, bblock_t
> *block, fs_inst *inst,
>        const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst-
> >components_read(i));
>  
>        for (unsigned k = 0; k < inst->components_read(i); ++k)
> -         cbld.at(block, inst)
> -             .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, 
> k));
> +         cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, 
> k));
>  
>        return tmp;
>  
> @@ -5112,36 +5110,43 @@ needs_dst_copy(const fs_builder &lbld, const
> fs_inst *inst)
>   * be emitted around the given \p inst in \p block.
>   */
>  static fs_reg
> -emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
> +emit_zip(const fs_builder &lbld_before, const fs_builder
> &lbld_after,
> +         fs_inst *inst)
>  {

Same here.

> -   /* Builder of the right width to perform the copy avoiding
> uninitialized
> -    * data if the lowered execution size is greater than the
> original
> -    * execution size of the instruction.
> -    */
> -   const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
> -                                           inst->exec_size), 0);
> +   assert(lbld_before.dispatch_width() ==
> lbld_after.dispatch_width());
> +   assert(lbld_before.group() == lbld_after.group());
>  
>     /* Specified channel group from the destination region. */
> -   const fs_reg dst = horiz_offset(inst->dst, lbld.group());
> +   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group());
>     const unsigned dst_size = inst->size_written /
>        inst->dst.component_size(inst->exec_size);
>  
> -   if (needs_dst_copy(lbld, inst)) {
> -      const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
> +   if (needs_dst_copy(lbld_after, inst)) {
> +      const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
>  
>        if (inst->predicate) {
>           /* Handle predication by copying the original contents of
>            * the destination into the temporary before emitting the
>            * lowered instruction.
>            */
> -         for (unsigned k = 0; k < dst_size; ++k)
> -            cbld.at(block, inst)
> -                .MOV(offset(tmp, lbld, k), offset(dst, inst-
> >exec_size, k));
> +         for (unsigned k = 0; k < dst_size; ++k) {
> +            lbld_before.group(MIN2(lbld_before.dispatch_width(),
> +                                   inst->exec_size), 0)

It doesn't make sense to do:

lbld_before.group(MIN2(lbld_before.dispatch_width(),
                       inst->exec_size), 0)

for every iteration in the loop since it is constant for all
iterations.

> +                       .MOV(offset(tmp, lbld_before, k),
> +                            offset(dst, inst->exec_size, k));
> +         }
>        }
>  
> -      for (unsigned k = 0; k < dst_size; ++k)
> -         cbld.at(block, inst->next)
> -             .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld,
> k));
> +      for (unsigned k = 0; k < dst_size; ++k) {
> +         /* Use a builder of the right width to perform the copy
> avoiding
> +          * uninitialized data if the lowered execution size is
> greater than
> +          * the original execution size of the instruction.
> +          */
> +         lbld_after.group(MIN2(lbld_after.dispatch_width(),
> +                               inst->exec_size), 0)

Same here.

Iago

> +                   .MOV(offset(dst, inst->exec_size, k),
> +                        offset(tmp, lbld_after, k));
> +      }
>  
>        return tmp;
>  
> @@ -5197,9 +5202,10 @@ fs_visitor::lower_simd_width()
>              const fs_builder lbld = ibld.group(lower_width, i);
>  
>              for (unsigned j = 0; j < inst->sources; j++)
> -               split_inst.src[j] = emit_unzip(lbld, block, inst, j);
> +               split_inst.src[j] = emit_unzip(lbld.at(block, inst),
> inst, j);
>  
> -            split_inst.dst = emit_zip(lbld, block, inst);
> +            split_inst.dst = emit_zip(lbld.at(block, inst),
> +                                      lbld.at(block, inst->next),
> inst);
>              split_inst.size_written =
>                 split_inst.dst.component_size(lower_width) *
> dst_size;