[Mesa-dev] [PATCH 2/2] gallivm: add PK2H/UP2H support

Tue Feb 2 12:39:04 UTC 2016

On 31/01/16 02:08, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> Add support for these opcodes, the conversion functions were already
> there albeit need some new packing stuff.
> Just like the tgsi version, piglit won't like it for all the same
> reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
> tests, albeit since PK2H won't due to those rounding differences I don't
> know if that one works or not as the piglit test is rather difficult to
> deal with).
> ---
>   src/gallium/auxiliary/gallivm/lp_bld_conv.c        | 16 ++++-
>   src/gallium/auxiliary/gallivm/lp_bld_pack.c        | 26 ++++++++
>   src/gallium/auxiliary/gallivm/lp_bld_pack.h        |  5 ++
>   src/gallium/auxiliary/gallivm/lp_bld_tgsi.c        |  1 -
>   src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 73 ++++++++++++++++++++++
>   5 files changed, 119 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> index 7854142..7cf0dee 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> @@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
>    *
>    * Convert float32 to half floats, preserving Infs and NaNs,
>    * with rounding towards zero (trunc).
> + * XXX: For GL, would prefer rounding towards nearest(-even).
>    */
>   LLVMValueRef
>   lp_build_float_to_half(struct gallivm_state *gallivm,
> @@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
>      struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
>      LLVMValueRef result;
>
> +   /*
> +    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
> +    * directly, without any (x86 or generic) intrinsics.
> +    * Albeit the rounding mode cannot be specified (and is undefined,
> +    * though in practice on x86 seems to do nearest-even but it may
> +    * be dependent on instruction set support), so is essentially
> +    * useless.
> +    */
> +
>      if (util_cpu_caps.has_f16c &&
>          (length == 4 || length == 8)) {
>         struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
> @@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
>           LLVMValueRef index = LLVMConstInt(i32t, i, 0);
>           LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
>   #if 0
> -        /* XXX: not really supported by backends */
> +        /*
> +         * XXX: not really supported by backends.
> +         * Even if they would now, rounding mode cannot be specified and
> +         * is undefined.
> +         */
>           LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
>   #else
>           LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> index 0b0f7f0..daa2043 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> @@ -257,6 +257,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
>
>
>   /**
> + * Un-interleave vector.
> + * This will return a vector consisting of every second element
> + * (depending on lo_hi, beginning at 0 or 1).
> + * The returned vector size (elems and width) will only be half
> + * that of the source vector.
> + */
> +LLVMValueRef
> +lp_build_uninterleave1(struct gallivm_state *gallivm,
> +                       unsigned num_elems,
> +                       LLVMValueRef a,
> +                       unsigned lo_hi)
> +{
> +   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
> +   unsigned i;
> +   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
> +
> +   for(i = 0; i < num_elems / 2; ++i)
> +      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
> +
> +   shuffle = LLVMConstVector(elems, num_elems / 2);
> +
> +   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");

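The second `a` isn't needed here, since the shuffle mask only ever indexes
elements of the first operand. One could pass `LLVMGetUndef(LLVMTypeOf(a))`
as the second operand instead to make that explicit.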

> +}
> +
> +
> +/**
>    * Interleave vector elements.
>    *
>    * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> index 7cede35..367fba1 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> @@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
>                        LLVMValueRef b,
>                        unsigned lo_hi);
>
> +LLVMValueRef
> +lp_build_uninterleave1(struct gallivm_state *gallivm,
> +                       unsigned num_elems,
> +                       LLVMValueRef a,
> +                       unsigned lo_hi);
>
>   void
>   lp_build_unpack2(struct gallivm_state *gallivm,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> index c88dfbf..1cbe47c 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
>      /* Ignore deprecated instructions */
>      switch (inst->Instruction.Opcode) {
>
> -   case TGSI_OPCODE_UP2H:
>      case TGSI_OPCODE_UP2US:
>      case TGSI_OPCODE_UP4B:
>      case TGSI_OPCODE_UP4UB:
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 6f75bec..f6b42ee 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -45,8 +45,10 @@
>   #include "lp_bld_arit.h"
>   #include "lp_bld_bitarit.h"
>   #include "lp_bld_const.h"
> +#include "lp_bld_conv.h"
>   #include "lp_bld_gather.h"
>   #include "lp_bld_logic.h"
> +#include "lp_bld_pack.h"
>
>   #include "tgsi/tgsi_exec.h"
>
> @@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
>      log_emit	 /* emit */
>   };
>
> +/* TGSI_OPCODE_PK2H */
> +
> +static void
> +pk2h_fetch_args(
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   /* src0.x */
> +   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
> +                                            0, TGSI_CHAN_X);
> +   /* src0.y */
> +   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
> +                                            0, TGSI_CHAN_Y);
> +}
> +
> +static void
> +pk2h_emit(const struct lp_build_tgsi_action *action,
> +          struct lp_build_tgsi_context *bld_base,
> +          struct lp_build_emit_data *emit_data)
> +{
> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
> +   struct lp_type f16i_t;
> +   LLVMValueRef lo, hi, res;
> +
> +   f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
> +   lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
> +   hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
> +   /* maybe some interleave doubling vector width would be useful... */
> +   lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
> +   hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
> +   res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
> +
> +   emit_data->output[emit_data->chan] = res;
> +}
> +
> +static struct lp_build_tgsi_action pk2h_action = {
> +   pk2h_fetch_args, /* fetch_args */
> +   pk2h_emit        /* emit */
> +};
> +
> +/* TGSI_OPCODE_UP2H */
> +
> +static void
> +up2h_emit(const struct lp_build_tgsi_action *action,
> +          struct lp_build_tgsi_context *bld_base,
> +          struct lp_build_emit_data *emit_data)
> +{
> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   LLVMContextRef context = gallivm->context;
> +   LLVMValueRef lo, hi, res[2], arg;
> +   unsigned nr = bld_base->base.type.length;
> +   LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
> +
> +   arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
> +   lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
> +   hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
> +   res[0] = lp_build_half_to_float(gallivm, lo);
> +   res[1] = lp_build_half_to_float(gallivm, hi);
> +
> +   emit_data->output[0] = emit_data->output[2] = res[0];
> +   emit_data->output[1] = emit_data->output[3] = res[1];
> +}
> +
> +static struct lp_build_tgsi_action up2h_action = {
> +   scalar_unary_fetch_args, /* fetch_args */
> +   up2h_emit                /* emit */
> +};
> +
>   /* TGSI_OPCODE_LRP */
>
>   static void
> @@ -1032,10 +1103,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>      bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
>      bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
>      bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
> +   bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
>      bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
>      bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
>      bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
>      bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
> +   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
>      bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
>
>      bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;
>

Otherwise the series looks good.

Reviewed-by: Jose Fonseca <jfonseca at vmware.com>