[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler

Thu Nov 30 22:58:03 UTC 2017

On Wed, Nov 29, 2017 at 6:50 PM, Jose Maria Casanova Crespo <
jmcasanova at igalia.com> wrote:

> load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
> surface format defined. So when reading 16-bit components with the
> sampler we need to unshuffle two 16-bit components from each 32-bit
> component.
>
> Using the sampler avoids the use of the byte_scattered_read message
> that needs one message for each component and is supposed to be
> slower.
>
> In the case of SKL+ we take advantage of a hardware feature that
> automatically defines a channel mask based on the rlen value, so on
> SKL+ we only use half of the registers without using a header in the
> payload.
> ---
>  src/intel/compiler/brw_fs.cpp           | 31
> +++++++++++++++++++++++++++----
>  src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
>  2 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 1ca4d416b2..9c543496ba 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder &bld,
>      * a double this means we are only loading 2 elements worth of data.
>      * We also want to use a 32-bit data type for the dst of the load
> operation
>      * so other parts of the driver don't get confused about the size of
> the
> -    * result.
> +    * result. On the case of 16-bit data we only need half of the 32-bit
> +    * components on SKL+ as we take advance of using message return size
> to
> +    * define an xy channel mask.
>      */
> -   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   fs_reg vec4_result;
> +   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
> +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> +      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
> +   } else {
> +      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +   }
>
    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
>                              vec4_result, surf_index, vec4_offset);
>     inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
> @@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const
> fs_builder &bld,
>     }
>
>     vec4_result.type = dst.type;
> -   bld.MOV(dst, offset(vec4_result, bld,
> -                       (const_offset & 0xf) / type_sz(vec4_result.type)));
> +
> +   if (type_sz(dst.type) == 2) {
> +      /* 16-bit types need to be unshuffled as each pair of 16-bit
> components
> +       * is packed on a 32-bit compoment because we are using a 32-bit
> format
> +       * in the surface of uniform that is read by the sampler.
> +       * TODO: On BDW+ mark when an uniform has 16-bit type so we could
> setup a
> +       * surface format of 16-bit and use the 16-bit return format at the
> +       * sampler.
> +       */
> +      vec4_result.stride = 2;
> +      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
> +                                      (const_offset & 0x7) / 4),
> +                               (const_offset & 0x7) / 2 % 2 * 2));
> +   } else {
> +      bld.MOV(dst, offset(vec4_result, bld,
> +                          (const_offset & 0xf) /
> type_sz(vec4_result.type)));
> +   }
>

This seems overly complicated.  How about something like

fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
switch (type_sz(dst.type)) {
case 2:
   shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
   bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
   break;
case 4:
   bld.MOV(dst, dw);
   break;
case 8:
   shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
   break;
default:
   unreachable();
}

>  }
>
>  /**
> diff --git a/src/intel/compiler/brw_fs_generator.cpp
> b/src/intel/compiler/brw_fs_generator.cpp
> index a3861cd68e..00a4e29147 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -1381,12 +1381,18 @@ fs_generator::generate_varying
> _pull_constant_load_gen7(fs_inst *inst,
>     uint32_t simd_mode, rlen, mlen;
>     if (inst->exec_size == 16) {
>        mlen = 2;
> -      rlen = 8;
> +      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
> +         rlen = 4;
> +      else
> +         rlen = 8;
>

I'm not sure what I think of this.  We intentionally use a vec4 today
instead of a single float because it lets us potentially batch up a bunch
of loads in a single texture operation.  Are we sure that we never need
more than 2 of those result registers?

--Jason

       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
>     } else {
>        assert(inst->exec_size == 8);
>        mlen = 1;
> -      rlen = 4;
> +      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
> +         rlen = 2;
> +      else
> +         rlen = 4;
>        simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
>     }
>
> --
> 2.14.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20171130/1a19e232/attachment.html>