[Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

Tue May 10 19:41:36 UTC 2016

On Tuesday, May 3, 2016 2:22:04 PM PDT Samuel Iglesias Gonsálvez wrote:
> From: Iago Toral Quiroga <itoral at igalia.com>
> 
> There are a few places where we need to shuffle the result of a 32-bit load
> into valid 64-bit data, so extract this logic into a separate helper that we
> can reuse.
> 
> Also, the shuffling needs to operate with WE_all set, which we were missing
> before, because we are changing the layout of the data across the various
> channels. Otherwise we will run into problems in non-uniform control-flow
> scenarios.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp     | 95 ++++++++++++++++++++
+-----------
>  src/mesa/drivers/dri/i965/brw_fs.h       |  5 ++
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--------------
>  3 files changed, 73 insertions(+), 73 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/
i965/brw_fs.cpp
> index dff13ea..709e4b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
&bld,
>  
>     vec4_result.type = dst.type;
>  
> -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
we
> -    * are reading doubles this means that we get this:
> -    *
> -    *  r0: x0 x0 x0 x0 x0 x0 x0 x0
> -    *  r1: x1 x1 x1 x1 x1 x1 x1 x1
> -    *  r2: y0 y0 y0 y0 y0 y0 y0 y0
> -    *  r3: y1 y1 y1 y1 y1 y1 y1 y1
> -    *
> -    * Fix this up so we return valid double elements:
> -    *
> -    *  r0: x0 x1 x0 x1 x0 x1 x0 x1
> -    *  r1: x0 x1 x0 x1 x0 x1 x0 x1
> -    *  r2: y0 y1 y0 y1 y0 y1 y0 y1
> -    *  r3: y0 y1 y0 y1 y0 y1 y0 y1
> -    */
> -   if (type_sz(dst.type) == 8) {
> -      int multiplier = bld.dispatch_width() / 8;
> -      fs_reg fixed_res =
> -         fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> -      /* We only have 2 doubles in a 32-bit vec4 */
> -      for (int i = 0; i < 2; i++) {
> -         fs_reg vec4_float =
> -            horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> -                         multiplier * 16 * i);
> -
> -         bld.MOV(stride(fixed_res, 2), vec4_float);
> -         bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> -                 horiz_offset(vec4_float, 8 * multiplier));
> -
> -         bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> -                 retype(fixed_res, BRW_REGISTER_TYPE_DF));
> -      }
> -   }
> +   if (type_sz(dst.type) == 8)
> +      SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
vec4_result, 2);
>  
>     int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>     bld.MOV(dst, offset(vec4_result, bld,
> @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
&bld,
>  }
>  
>  /**
> + * This helper takes the result of a load operation that reads 32-bit 
elements
> + * in this format:
> + *
> + * x x x x x x x x
> + * y y y y y y y y
> + * z z z z z z z z
> + * w w w w w w w w
> + *
> + * and shuffles the data to get this:
> + *
> + * x y x y x y x y
> + * x y x y x y x y
> + * z w z w z w z w
> + * z w z w z w z w
> + *
> + * Which is exactly what we want if the load is reading 64-bit components
> + * like doubles, where x represents the low 32-bit of the x double 
component
> + * and y represents the high 32-bit of the x double component (likewise 
with
> + * z and w for double component y). The parameter @components represents
> + * the number of 64-bit components present in @src. This would typically be
> + * 2 at most, since we can only fit 2 double elements in the result of a
> + * vec4 load.
> + *
> + * Notice that @dst and @src can be the same register.
> + */
> +void
> +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder &bld,
> +                                                    const fs_reg dst,
> +                                                    const fs_reg src,
> +                                                    uint32_t components)
> +{
> +   int multiplier = bld.dispatch_width() / 8;
> +
> +   /* A temporary that we will use to shuffle the 32-bit data of each
> +    * component in the vector into valid 64-bit data
> +    */
> +   fs_reg tmp =
> +      fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> +
> +   /* We are going to manipulate the data in elements of 32-bit */
> +   fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F);
> +
> +   /* We are going to manipulate the dst in elements of 64-bit */
> +   fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF);

How about:

const fs_builder nomask_bld = bld.exec_all();

...

    nomask_bld.MOV(...);

instead of bld.MOV(...)->force_writemask_all = true?

> +
> +   /* Shuffle the data */
> +   for (unsigned i = 0; i < components; i++) {
> +      fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i);
> +
> +      bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true;
> +      bld.MOV(stride(horiz_offset(tmp, 1), 2),
> +              horiz_offset(component_i, 8 * multiplier))
> +                 ->force_writemask_all = true;
> +
> +      bld.MOV(horiz_offset(dst_data, multiplier * 8 * i),
> +              retype(tmp, BRW_REGISTER_TYPE_DF))->force_writemask_all = 
true;
> +   }
> +}
> +
> +/**
>   * A helper for MOV generation for fixing up broken hardware SEND 
dependency
>   * handling.
>   */
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/
brw_fs.h
> index 1f18112..c95b30a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -106,6 +106,11 @@ public:
>                                     uint32_t const_offset);
>     void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
>  
> +   void SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const brw::fs_builder &bld,
> +                                                const fs_reg dst,
> +                                                const fs_reg src,
> +                                                uint32_t components);
> +
>     bool run_fs(bool do_rep_send);
>     bool run_vs(gl_clip_plane *clip_planes);
>     bool run_tes();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/
dri/i965/brw_fs_nir.cpp
> index 5a12d63..9cd751b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -3082,44 +3082,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder 
&bld, nir_intrinsic_instr *instr
>           for (int i = 0; i < instr->num_components; i++)
>              bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
>        } else {
> -         /* Reading a dvec, then the untyped read below gives us this:
> +         /* Reading a dvec, so we need to:
>            *
> -          * x0 x0 x0 x0 x0 x0 x0 x0
> -          * x1 x1 x1 x1 x1 x1 x1 x1
> -          * y0 y0 y0 y0 y0 y0 y0 y0
> -          * y1 y1 y1 y1 y1 y1 y1 y1
> -          *
> -          * But that is not valid 64-bit data, instead we want:
> -          *
> -          * x0 x1 x0 x1 x0 x1 x0 x1
> -          * x0 x1 x0 x1 x0 x1 x0 x1
> -          * y0 y1 y0 y1 y0 y1 y0 y1
> -          * y0 y1 y0 y1 y0 y1 y0 y1
> -          *
> -          * Also, that would only load half of a dvec4. So, we have to:
> -          *
> -          * 1. Multiply num_components by 2, to account for the fact that 
we need
> -          *    to read 64-bit components.
> +          * 1. Multiply num_components by 2, to account for the fact that 
we
> +          *    need to read 64-bit components.
>            * 2. Shuffle the result of the load to form valid 64-bit elements
>            * 3. Emit a second load (for components z/w) if needed.
> -          *
> -          * FIXME: extract the shuffling logic and share it with
> -          *        varying_pull_constant_load
>            */
> -         int multiplier = bld.dispatch_width() / 8;
> -
>           fs_reg read_offset = vgrf(glsl_type::uint_type);
>           bld.MOV(read_offset, offset_reg);
>  
>           int num_components = instr->num_components;
>           int iters = num_components <= 2 ? 1 : 2;
>  
> -         /* A temporary that we will use to shuffle the 32-bit data of each
> -          * component in the vector into valid 64-bit data
> -          */
> -         fs_reg tmp =
> -            fs_reg(VGRF, alloc.allocate(2 * multiplier), 
BRW_REGISTER_TYPE_F);
> -
>           /* Load the dvec, the first iteration loads components x/y, the 
second
>            * iteration, if needed, loads components z/w
>            */
> @@ -3138,18 +3113,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
>              read_result.type = BRW_REGISTER_TYPE_F;
>  
>              /* Shuffle the 32-bit load result into valid 64-bit data */
> -            int multiplier = bld.dispatch_width() / 8;
> -            for (int i = 0; i < iter_components; i++) {
> -               fs_reg component_i =
> -                  horiz_offset(read_result, multiplier * 16 * i);
> -
> -               bld.MOV(stride(tmp, 2), component_i);
> -               bld.MOV(stride(horiz_offset(tmp, 1), 2),
> -                       horiz_offset(component_i, 8 * multiplier));
> -
> -               bld.MOV(horiz_offset(dest, multiplier * 8 * (i + it * 2)),
> -                       retype(tmp, BRW_REGISTER_TYPE_DF));
> -            }
> +            SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld,
> +                                                    offset(dest, bld, it * 
2),
> +                                                    read_result, 
iter_components);
>  
>              bld.ADD(read_offset, read_offset, brw_imm_ud(16));
>           }
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160510/a576ec8d/attachment.sig>