[Mesa-dev] [PATCH 2/2] gallivm: add PK2H/UP2H support

Tue Feb 2 02:48:52 UTC 2016

Am 01.02.2016 um 16:36 schrieb Brian Paul:
> Just a few more nitpicks below...
> 
> Reviewed-by: Brian Paul <brianp at vmware.com>
> 
> On 01/30/2016 07:08 PM, sroland at vmware.com wrote:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> Add support for these opcodes, the conversion functions were already
>> there albeit need some new packing stuff.
>> Just like the tgsi version, piglit won't like it for all the same
>> reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
>> tests, albeit since PK2H won't due those rounding differences I don't
>> know if that one works or not as the piglit test is rather difficult to
>> deal with).
>> ---
>>   src/gallium/auxiliary/gallivm/lp_bld_conv.c        | 16 ++++-
>>   src/gallium/auxiliary/gallivm/lp_bld_pack.c        | 26 ++++++++
>>   src/gallium/auxiliary/gallivm/lp_bld_pack.h        |  5 ++
>>   src/gallium/auxiliary/gallivm/lp_bld_tgsi.c        |  1 -
>>   src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 73
>> ++++++++++++++++++++++
>>   5 files changed, 119 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> index 7854142..7cf0dee 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> @@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
>>    *
>>    * Convert float32 to half floats, preserving Infs and NaNs,
>>    * with rounding towards zero (trunc).
>> + * XXX: For GL, would prefer rounding towards nearest(-even).
>>    */
>>   LLVMValueRef
>>   lp_build_float_to_half(struct gallivm_state *gallivm,
>> @@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state
>> *gallivm,
>>      struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
>>      LLVMValueRef result;
>>
>> +   /*
>> +    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
>> +    * directly, without any (x86 or generic) intrinsics.
>> +    * Albeit the rounding mode cannot be specified (and is undefined,
>> +    * though in practice on x86 seems to do nearest-even but it may
>> +    * be dependent on instruction set support), so is essentially
>> +    * useless.
>> +    */
>> +
>>      if (util_cpu_caps.has_f16c &&
>>          (length == 4 || length == 8)) {
>>         struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
>> @@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state
>> *gallivm,
>>           LLVMValueRef index = LLVMConstInt(i32t, i, 0);
>>           LLVMValueRef f32 = LLVMBuildExtractElement(builder, src,
>> index, "");
>>   #if 0
>> -        /* XXX: not really supported by backends */
>> +        /*
>> +         * XXX: not really supported by backends.
>> +         * Even if they would now, rounding mode cannot be specified and
>> +         * is undefined.
>> +         */
>>           LLVMValueRef f16 = lp_build_intrinsic_unary(builder,
>> "llvm.convert.to.fp16", i16t, f32);
>>   #else
>>           LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> index 0b0f7f0..daa2043 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> @@ -257,6 +257,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
>>
>>
>>   /**
>> + * Un-interleave vector.
>> + * This will return a vector consisting of every second element
>> + * (depending on lo_hi, beginning at 0 or 1).
>> + * The returned vector size (elems and width) will only be half
>> + * that of the source vector.
>> + */
>> +LLVMValueRef
>> +lp_build_uninterleave1(struct gallivm_state *gallivm,
>> +                       unsigned num_elems,
>> +                       LLVMValueRef a,
>> +                       unsigned lo_hi)
>> +{
>> +   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
>> +   unsigned i;
>> +   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
>> +
>> +   for(i = 0; i < num_elems / 2; ++i)
> 
> space after for.
Fixed.

> 
>> +      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
>> +
>> +   shuffle = LLVMConstVector(elems, num_elems / 2);
>> +
>> +   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
>> +}
>> +
>> +
>> +/**
>>    * Interleave vector elements.
>>    *
>>    * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> index 7cede35..367fba1 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> @@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
>>                        LLVMValueRef b,
>>                        unsigned lo_hi);
>>
>> +LLVMValueRef
>> +lp_build_uninterleave1(struct gallivm_state *gallivm,
>> +                       unsigned num_elems,
>> +                       LLVMValueRef a,
>> +                       unsigned lo_hi);
>>
>>   void
>>   lp_build_unpack2(struct gallivm_state *gallivm,
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> index c88dfbf..1cbe47c 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
>>      /* Ignore deprecated instructions */
>>      switch (inst->Instruction.Opcode) {
>>
>> -   case TGSI_OPCODE_UP2H:
>>      case TGSI_OPCODE_UP2US:
>>      case TGSI_OPCODE_UP4B:
>>      case TGSI_OPCODE_UP4UB:
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> index 6f75bec..f6b42ee 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> @@ -45,8 +45,10 @@
>>   #include "lp_bld_arit.h"
>>   #include "lp_bld_bitarit.h"
>>   #include "lp_bld_const.h"
>> +#include "lp_bld_conv.h"
>>   #include "lp_bld_gather.h"
>>   #include "lp_bld_logic.h"
>> +#include "lp_bld_pack.h"
>>
>>   #include "tgsi/tgsi_exec.h"
>>
>> @@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
>>      log_emit     /* emit */
>>   };
>>
>> +/* TGSI_OPCODE_PK2H */
>> +
>> +static void
>> +pk2h_fetch_args(
>> +   struct lp_build_tgsi_context * bld_base,
>> +   struct lp_build_emit_data * emit_data)
> 
> pk2h_fetch_args(struct lp_build_tgsi_context *bld_base,
>                 struct lp_build_emit_data *emit_data)
I didn't notice before, but there's over 100 functions in that file and
every single one does it like this.
The only functions not honoring that style are the new pk2h_emit and
up2h_emit... So I suppose it would be better to make them wrong too for
some consistency, not sure it's worth doing mass-whitespace fix...

Roland


>> +{
>> +   /* src0.x */
>> +   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
>> +                                            0, TGSI_CHAN_X);
>> +   /* src0.y */
>> +   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
>> +                                            0, TGSI_CHAN_Y);
>> +}
>> +
>> +static void
>> +pk2h_emit(const struct lp_build_tgsi_action *action,
>> +          struct lp_build_tgsi_context *bld_base,
>> +          struct lp_build_emit_data *emit_data)
>> +{
>> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
>> +   struct lp_type f16i_t;
>> +   LLVMValueRef lo, hi, res;
>> +
>> +   f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
>> +   lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
>> +   hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
>> +   /* maybe some interleave doubling vector width would be useful... */
>> +   lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length *
>> 2);
>> +   hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length *
>> 2);
>> +   res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
>> +
>> +   emit_data->output[emit_data->chan] = res;
>> +}
>> +
>> +static struct lp_build_tgsi_action pk2h_action = {
>> +   pk2h_fetch_args, /* fetch_args */
>> +   pk2h_emit        /* emit */
>> +};
>> +
>> +/* TGSI_OPCODE_UP2H */
>> +
>> +static void
>> +up2h_emit(const struct lp_build_tgsi_action *action,
>> +          struct lp_build_tgsi_context *bld_base,
>> +          struct lp_build_emit_data *emit_data)
>> +{
>> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
>> +   LLVMBuilderRef builder = gallivm->builder;
>> +   LLVMContextRef context = gallivm->context;
>> +   LLVMValueRef lo, hi, res[2], arg;
>> +   unsigned nr = bld_base->base.type.length;
>> +   LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context),
>> nr * 2);
>> +
>> +   arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
>> +   lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
>> +   hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
>> +   res[0] = lp_build_half_to_float(gallivm, lo);
>> +   res[1] = lp_build_half_to_float(gallivm, hi);
>> +
>> +   emit_data->output[0] = emit_data->output[2] = res[0];
>> +   emit_data->output[1] = emit_data->output[3] = res[1];
>> +}
>> +
>> +static struct lp_build_tgsi_action up2h_action = {
>> +   scalar_unary_fetch_args, /* fetch_args */
>> +   up2h_emit                /* emit */
>> +};
>> +
>>   /* TGSI_OPCODE_LRP */
>>
>>   static void
>> @@ -1032,10 +1103,12 @@ lp_set_default_actions(struct
>> lp_build_tgsi_context * bld_base)
>>      bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
>>      bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
>>      bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
>> +   bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
>>      bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
>>      bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
>>      bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
>>      bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
>> +   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
>>      bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
>>
>>      bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args =
>> scalar_unary_fetch_args;
>>
>