[Mesa-dev] [PATCH v3 30/43] i965/fs: Support 16-bit types at load_input and store_output

Thu Nov 2 00:50:12 UTC 2017

I haven't read it all in detail but I have a strong suspicion that this is
probably broken for TCS outputs.  In the TCS, we write values out
immediately and we have to be able to write single components.  This is
because TCS outputs also act as a sort of SLM for tessellation shaders
where each thread can write its own data and then you barrier and then any
thread can read the output of any other thread.  I think we can make this
work but it's going to be rather annoying and will take careful thought and
lots of testing.

--Jason

On Thu, Oct 12, 2017 at 11:38 AM, Jose Maria Casanova Crespo <
jmcasanova at igalia.com> wrote:

> Enables support for 16-bit types on the load_input and
> store_output intrinsics between stages.
>
> The approach is based on re-using the 32-bit URB reads
> and writes between stages, shuffling pairs of 16-bit values into
> 32-bit values at the store_output intrinsic and un-shuffling the values
> at load_input.
>
> shuffle_32bit_load_result_to_16bit_data and
> shuffle_16bit_data_for_32bit_write are implemented in a similar
> way to the analogous functions for handling 64-bit types.
> ---
>  src/intel/compiler/brw_fs.h       |  11 ++++
>  src/intel/compiler/brw_fs_nir.cpp | 119 ++++++++++++++++++++++++++++++
> +++++++-
>  2 files changed, 129 insertions(+), 1 deletion(-)
>
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index b9476e69ed..90ada3ef4b 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_write(const
> brw::fs_builder &bld,
>                                          const fs_reg &dst,
>                                          const fs_reg &src,
>                                          uint32_t components);
> +
> +void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder &bld,
> +                                             const fs_reg &dst,
> +                                             const fs_reg &src,
> +                                             uint32_t components);
> +
> +void shuffle_16bit_data_for_32bit_write(const brw::fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components);
> +
>  fs_reg setup_imm_df(const brw::fs_builder &bld,
>                      double v);
>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index 83ff0607a7..9c694a1c53 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>        first_component = first_component / 2;
>     }
>
> +   if (type_sz(dst.type) == 2) {
> +      num_components = DIV_ROUND_UP(num_components, 2);
> +      tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +   }
> +
>     for (unsigned iter = 0; iter < num_iterations; iter++) {
>        if (offset_const) {
>           /* Constant indexing - use global offset. */
>           if (first_component != 0) {
>              unsigned read_components = num_components + first_component;
> -            fs_reg tmp = bld.vgrf(dst.type, read_components);
> +            fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);
>              inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
> icp_handle);
>              inst->size_written = read_components *
>                                   tmp.component_size(inst->exec_size);
> @@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>              bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld,
> c));
>        }
>
> +      if (type_sz(dst.type) == 2) {
> +         shuffle_32bit_load_result_to_16bit_data(
> +            bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F),
> orig_num_components);
> +      }
> +
>        if (num_iterations > 1) {
>           num_components = orig_num_components - 2;
>           if(offset_const) {
> @@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const
> fs_builder &bld,
>           dst = tmp;
>        }
>
> +      if (type_sz(dst.type) == 2) {
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +      }
> +
>        for (unsigned iter = 0; iter < num_iterations; iter++) {
>           if (indirect_offset.file == BAD_FILE) {
>              /* Constant indexing - use global offset. */
> @@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const
> fs_builder &bld,
>              }
>           }
>
> +         if (type_sz(orig_dst.type) == 2) {
> +            shuffle_32bit_load_result_to_16bit_data(
> +               bld, orig_dst, dst, instr->num_components);
> +         }
> +
>           /* Copy the temporary to the destination to deal with
> writemasking.
>            *
>            * Also attempt to deal with gl_PointSize being in the .w
> component.
> @@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>        fs_reg value = get_nir_src(instr->src[0]);
>        bool is_64bit = (instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size)
> == 64;
> +      bool is_16bit = (instr->src[0].is_ssa ?
> +         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size)
> == 16;
>        fs_reg indirect_offset = get_indirect_offset(instr);
>        unsigned imm_offset = instr->const_index[0];
>        unsigned swiz = BRW_SWIZZLE_XYZW;
> @@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const
> fs_builder &bld,
>              num_iterations = 2;
>              iter_components = 2;
>           }
> +      } else {
> +         if (is_16bit) {
> +            iter_components = DIV_ROUND_UP(num_components, 2);
> +         }
>        }
>
>        /* 64-bit data needs to me shuffled before we can write it to the
> URB.
> @@ -2711,6 +2737,12 @@ fs_visitor::nir_emit_tcs_intrinsic(const
> fs_builder &bld,
>                 continue;
>
>              if (!is_64bit) {
> +               if (is_16bit) {
> +                  shuffle_16bit_data_for_32bit_write(bld,
> +                     retype(offset(value,bld,BRW_GET_SWZ(swiz, i)),
> BRW_REGISTER_TYPE_F),
> +                     retype(offset(value,bld,BRW_GET_SWZ(swiz, i)),
> BRW_REGISTER_TYPE_HF),
> +                     2);
> +               }
>                 srcs[header_regs + i + first_component] =
>                    offset(value, bld, BRW_GET_SWZ(swiz, i));
>              } else {
> @@ -2862,6 +2894,11 @@ fs_visitor::nir_emit_tes_intrinsic(const
> fs_builder &bld,
>              dest = tmp;
>           }
>
> +         if (type_sz(dest.type) == 2) {
> +            num_components = DIV_ROUND_UP(num_components, 2);
> +            dest = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +         }
> +
>           for (unsigned iter = 0; iter < num_iterations; iter++) {
>              const fs_reg srcs[] = {
>                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
> @@ -2904,6 +2941,11 @@ fs_visitor::nir_emit_tes_intrinsic(const
> fs_builder &bld,
>                 }
>              }
>
> +            if (type_sz(dest.type) == 2) {
> +               shuffle_32bit_load_result_to_16bit_data(
> +                  bld, orig_dest, retype(dest, BRW_REGISTER_TYPE_F),
> num_components);
> +            }
> +
>              /* If we are loading double data and we need a second read
> message
>               * adjust the offset
>               */
> @@ -3257,6 +3299,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
>           num_components *= 2;
>        }
>
> +      fs_reg orig_dest = dest;
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         type = BRW_REGISTER_TYPE_F;
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dest = bld.vgrf(type, num_components);
> +      }
> +
>        for (unsigned int i = 0; i < num_components; i++) {
>           struct brw_reg interp = interp_reg(base, component + i);
>           interp = suboffset(interp, 3);
> @@ -3270,6 +3319,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
>                                                   retype(dest, type),
>                                                   instr->num_components);
>        }
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +                                                 orig_dest,
> +                                                 retype(dest, type),
> +                                                 instr->num_components);
> +      }
>        break;
>     }
>
> @@ -4182,6 +4237,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> &bld, nir_intrinsic_instr *instr
>        unsigned first_component = nir_intrinsic_component(instr);
>        unsigned bit_size = instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
> +
>        if (bit_size == 64) {
>           fs_reg tmp =
>              fs_reg(VGRF, alloc.allocate(2 * num_components),
> @@ -4192,6 +4248,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> &bld, nir_intrinsic_instr *instr
>           num_components *= 2;
>        }
>
> +      if (bit_size == 16) {
> +         fs_reg tmp =
> +            fs_reg(VGRF, alloc.allocate(DIV_ROUND_UP(num_components, 2)),
> +                   BRW_REGISTER_TYPE_F);
> +         shuffle_16bit_data_for_32bit_write(
> +            bld, tmp, retype(src, BRW_REGISTER_TYPE_HF), num_components);
> +         src = retype(tmp, src.type);
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +      }
> +
>        for (unsigned j = 0; j < num_components; j++) {
>           bld.MOV(offset(new_dest, bld, j + first_component),
>                   offset(src, bld, j));
> @@ -4815,6 +4881,33 @@ shuffle_32bit_load_result_to_64bit_data(const
> fs_builder &bld,
>     }
>  }
>
> +void
> +shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components)
> +{
> +   assert(type_sz(src.type) == 4);
> +   assert(type_sz(dst.type) == 2);
> +
> +   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = subscript(offset(src, bld, i / 2),
> dst.type, i % 2);
> +
> +      bld.MOV(offset(tmp, bld, i % 2), component_i);
> +
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
> +         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
> +      }
> +   }
> +   if (components % 2) {
> +      bld.MOV(offset(dst, bld, components - 1), tmp);
> +   }
> +}
> +
> +
>  /**
>   * This helper does the inverse operation of
>   * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
> @@ -4849,6 +4942,30 @@ shuffle_64bit_data_for_32bit_write(const
> fs_builder &bld,
>     }
>  }
>
> +void
> +shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
> +                                   const fs_reg &dst,
> +                                   const fs_reg &src,
> +                                   uint32_t components)
> +{
> +   assert(type_sz(src.type) == 2);
> +   assert(type_sz(dst.type) == 4);
> +
> +   fs_reg tmp = bld.vgrf(dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = offset(src, bld, i);
> +      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i / 2), tmp);
> +      }
> +      if (components % 2) {
> +         bld.MOV(offset(dst, bld, components / 2), tmp);
> +      }
> +   }
> +}
> +
> +
>  fs_reg
>  setup_imm_df(const fs_builder &bld, double v)
>  {
> --
> 2.13.6
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20171101/7f372a11/attachment-0001.html>