[Mesa-dev] [PATCH v3 30/43] i965/fs: Support 16-bit types at load_input and store_output

Mon Oct 16 15:19:53 UTC 2017

On Thu, Oct 12, 2017 at 08:38:19PM +0200, Jose Maria Casanova Crespo wrote:
> Enables the support of 16-bit types on load_input and
> store_outputs intrinsics intra-stages.
> 
> The approach was based on re-using the 32-bit URB read
> and writes between stages, shuffling pairs of 16-bit values into
> 32-bit values at load_store intrinsic and un-shuffling the values
> at load_inputs.
> 
> shuffle_32bit_load_result_to_16bit_data and
> shuffle_32bit_load_result_to_16bit_data are implemented in a similar
> way than the analogous functions for handling 64-bit types.
> ---
>  src/intel/compiler/brw_fs.h       |  11 ++++
>  src/intel/compiler/brw_fs_nir.cpp | 119 +++++++++++++++++++++++++++++++++++++-
>  2 files changed, 129 insertions(+), 1 deletion(-)
> 
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index b9476e69ed..90ada3ef4b 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
>                                          const fs_reg &dst,
>                                          const fs_reg &src,
>                                          uint32_t components);
> +
> +void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder &bld,
> +                                             const fs_reg &dst,
> +                                             const fs_reg &src,
> +                                             uint32_t components);
> +
> +void shuffle_16bit_data_for_32bit_write(const brw::fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components);
> +
>  fs_reg setup_imm_df(const brw::fs_builder &bld,
>                      double v);
>  
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> index 83ff0607a7..9c694a1c53 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>        first_component = first_component / 2;
>     }
>  
> +   if (type_sz(dst.type) == 2) {
> +      num_components = DIV_ROUND_UP(num_components, 2);
> +      tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +   }
> +
>     for (unsigned iter = 0; iter < num_iterations; iter++) {
>        if (offset_const) {
>           /* Constant indexing - use global offset. */
>           if (first_component != 0) {
>              unsigned read_components = num_components + first_component;
> -            fs_reg tmp = bld.vgrf(dst.type, read_components);
> +            fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);
>              inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
>              inst->size_written = read_components *
>                                   tmp.component_size(inst->exec_size);
> @@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>              bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
>        }
>  
> +      if (type_sz(dst.type) == 2) {
> +         shuffle_32bit_load_result_to_16bit_data(
> +            bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), orig_num_components);

We don't need retype() here, do we? Looking the block added a little earlier
where "tmp_dst" gets created.

> +      }
> +
>        if (num_iterations > 1) {
>           num_components = orig_num_components - 2;
>           if(offset_const) {
> @@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
>           dst = tmp;
>        }
>  
> +      if (type_sz(dst.type) == 2) {
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +      }
> +
>        for (unsigned iter = 0; iter < num_iterations; iter++) {
>           if (indirect_offset.file == BAD_FILE) {
>              /* Constant indexing - use global offset. */
> @@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
>              }
>           }
>  
> +         if (type_sz(orig_dst.type) == 2) {
> +            shuffle_32bit_load_result_to_16bit_data(
> +               bld, orig_dst, dst, instr->num_components);
> +         }
> +
>           /* Copy the temporary to the destination to deal with writemasking.
>            *
>            * Also attempt to deal with gl_PointSize being in the .w component.
> @@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
>        fs_reg value = get_nir_src(instr->src[0]);
>        bool is_64bit = (instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
> +      bool is_16bit = (instr->src[0].is_ssa ?
> +         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 16;
>        fs_reg indirect_offset = get_indirect_offset(instr);
>        unsigned imm_offset = instr->const_index[0];
>        unsigned swiz = BRW_SWIZZLE_XYZW;
> @@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
>              num_iterations = 2;
>              iter_components = 2;
>           }
> +      } else {
> +         if (is_16bit) {
> +            iter_components = DIV_ROUND_UP(num_components, 2);
> +         }
>        }
>  
>        /* 64-bit data needs to me shuffled before we can write it to the URB.
> @@ -2711,6 +2737,12 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
>                 continue;
>  
>              if (!is_64bit) {
> +               if (is_16bit) {
> +                  shuffle_16bit_data_for_32bit_write(bld,
> +                     retype(offset(value,bld,BRW_GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_F),
> +                     retype(offset(value,bld,BRW_GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_HF),
> +                     2);
> +               }
>                 srcs[header_regs + i + first_component] =
>                    offset(value, bld, BRW_GET_SWZ(swiz, i));
>              } else {
> @@ -2862,6 +2894,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
>              dest = tmp;
>           }
>  
> +         if (type_sz(dest.type) == 2) {
> +            num_components = DIV_ROUND_UP(num_components, 2);
> +            dest = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +         }
> +
>           for (unsigned iter = 0; iter < num_iterations; iter++) {
>              const fs_reg srcs[] = {
>                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
> @@ -2904,6 +2941,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
>                 }
>              }
>  
> +            if (type_sz(dest.type) == 2) {
> +               shuffle_32bit_load_result_to_16bit_data(
> +                  bld, orig_dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);

Here also retype() looks unnecessary.

> +            }
> +
>              /* If we are loading double data and we need a second read message
>               * adjust the offset
>               */
> @@ -3257,6 +3299,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
>           num_components *= 2;
>        }
>  
> +      fs_reg orig_dest = dest;
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         type = BRW_REGISTER_TYPE_F;
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dest = bld.vgrf(type, num_components);
> +      }
> +
>        for (unsigned int i = 0; i < num_components; i++) {
>           struct brw_reg interp = interp_reg(base, component + i);
>           interp = suboffset(interp, 3);
> @@ -3270,6 +3319,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
>                                                   retype(dest, type),
>                                                   instr->num_components);
>        }
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +                                                 orig_dest,
> +                                                 retype(dest, type),

Same here, "dest" gets created against "type".

> +                                                 instr->num_components);
> +      }
>        break;
>     }
>  
> @@ -4182,6 +4237,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>        unsigned first_component = nir_intrinsic_component(instr);
>        unsigned bit_size = instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
> +

Leftover?

>        if (bit_size == 64) {
>           fs_reg tmp =
>              fs_reg(VGRF, alloc.allocate(2 * num_components),
> @@ -4192,6 +4248,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>           num_components *= 2;
>        }
>  
> +      if (bit_size == 16) {
> +         fs_reg tmp =
> +            fs_reg(VGRF, alloc.allocate(DIV_ROUND_UP(num_components, 2)),
> +                   BRW_REGISTER_TYPE_F);

Earlier you used builder, perhaps here also:

            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F,
                                  DIV_ROUND_UP(num_components, 2));

> +         shuffle_16bit_data_for_32bit_write(
> +            bld, tmp, retype(src, BRW_REGISTER_TYPE_HF), num_components);

Isn't "src" already a 16-bit type, it is gotten from "instr->src[0]" same as
"bit_size"?

> +         src = retype(tmp, src.type);
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +      }
> +
>        for (unsigned j = 0; j < num_components; j++) {
>           bld.MOV(offset(new_dest, bld, j + first_component),
>                   offset(src, bld, j));
> @@ -4815,6 +4881,33 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
>     }
>  }
>  
> +void
> +shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components)
> +{
> +   assert(type_sz(src.type) == 4);
> +   assert(type_sz(dst.type) == 2);
> +
> +   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = subscript(offset(src, bld, i / 2), dst.type, i % 2);
> +
> +      bld.MOV(offset(tmp, bld, i % 2), component_i);
> +
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
> +         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
> +      }
> +   }
> +   if (components % 2) {
> +      bld.MOV(offset(dst, bld, components - 1), tmp);
> +   }
> +}
> +
> +
>  /**
>   * This helper does the inverse operation of
>   * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
> @@ -4849,6 +4942,30 @@ shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
>     }
>  }
>  
> +void
> +shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
> +                                   const fs_reg &dst,
> +                                   const fs_reg &src,
> +                                   uint32_t components)
> +{
> +   assert(type_sz(src.type) == 2);
> +   assert(type_sz(dst.type) == 4);
> +
> +   fs_reg tmp = bld.vgrf(dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = offset(src, bld, i);
> +      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i / 2), tmp);
> +      }
> +      if (components % 2) {
> +         bld.MOV(offset(dst, bld, components / 2), tmp);
> +      }
> +   }
> +}
> +
> +
>  fs_reg
>  setup_imm_df(const fs_builder &bld, double v)
>  {
> -- 
> 2.13.6
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev