[Mesa-dev] [PATCH 2/2] gallivm: add PK2H/UP2H support
Roland Scheidegger
sroland at vmware.com
Tue Feb 2 16:43:36 UTC 2016
Am 02.02.2016 um 13:39 schrieb Jose Fonseca:
> On 31/01/16 02:08, sroland at vmware.com wrote:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> Add support for these opcodes, the conversion functions were already
>> there albeit need some new packing stuff.
>> Just like the tgsi version, piglit won't like it for all the same
>> reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
>> tests, albeit since PK2H won't due to those rounding differences I don't
>> know if that one works or not as the piglit test is rather difficult to
>> deal with).
>> ---
>> src/gallium/auxiliary/gallivm/lp_bld_conv.c | 16 ++++-
>> src/gallium/auxiliary/gallivm/lp_bld_pack.c | 26 ++++++++
>> src/gallium/auxiliary/gallivm/lp_bld_pack.h | 5 ++
>> src/gallium/auxiliary/gallivm/lp_bld_tgsi.c | 1 -
>> src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 73
>> ++++++++++++++++++++++
>> 5 files changed, 119 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> index 7854142..7cf0dee 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>> @@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
>> *
>> * Convert float32 to half floats, preserving Infs and NaNs,
>> * with rounding towards zero (trunc).
>> + * XXX: For GL, would prefer rounding towards nearest(-even).
>> */
>> LLVMValueRef
>> lp_build_float_to_half(struct gallivm_state *gallivm,
>> @@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state
>> *gallivm,
>> struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
>> LLVMValueRef result;
>>
>> + /*
>> + * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
>> + * directly, without any (x86 or generic) intrinsics.
>> + * Albeit the rounding mode cannot be specified (and is undefined,
>> + * though in practice on x86 seems to do nearest-even but it may
>> + * be dependent on instruction set support), so is essentially
>> + * useless.
>> + */
>> +
>> if (util_cpu_caps.has_f16c &&
>> (length == 4 || length == 8)) {
>> struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
>> @@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state
>> *gallivm,
>> LLVMValueRef index = LLVMConstInt(i32t, i, 0);
>> LLVMValueRef f32 = LLVMBuildExtractElement(builder, src,
>> index, "");
>> #if 0
>> - /* XXX: not really supported by backends */
>> + /*
>> + * XXX: not really supported by backends.
>> + * Even if they would now, rounding mode cannot be specified and
>> + * is undefined.
>> + */
>> LLVMValueRef f16 = lp_build_intrinsic_unary(builder,
>> "llvm.convert.to.fp16", i16t, f32);
>> #else
>> LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> index 0b0f7f0..daa2043 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
>> @@ -257,6 +257,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
>>
>>
>> /**
>> + * Un-interleave vector.
>> + * This will return a vector consisting of every second element
>> + * (depending on lo_hi, beginning at 0 or 1).
>> + * The returned vector size (elems and width) will only be half
>> + * that of the source vector.
>> + */
>> +LLVMValueRef
>> +lp_build_uninterleave1(struct gallivm_state *gallivm,
>> + unsigned num_elems,
>> + LLVMValueRef a,
>> + unsigned lo_hi)
>> +{
>> + LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
>> + unsigned i;
>> + assert(num_elems <= LP_MAX_VECTOR_LENGTH);
>> +
>> + for(i = 0; i < num_elems / 2; ++i)
>> + elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
>> +
>> + shuffle = LLVMConstVector(elems, num_elems / 2);
>> +
>> + return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
>
> One doesn't need the second `a` here. One could use
> `LLVMGetUndef(LLVMTypeOf(a))` to make it explicit.
Right. I was lazy; it was much less typing :-). LLVM will of course
recognize that too, but you're right that when reading the code it would
be more obvious that the second vector isn't needed.
(There are other instances of such shuffles that just use the same source
again, although some do indeed use undef.)
>
>> +}
>> +
>> +
>> +/**
>> * Interleave vector elements.
>> *
>> * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> index 7cede35..367fba1 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
>> @@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
>> LLVMValueRef b,
>> unsigned lo_hi);
>>
>> +LLVMValueRef
>> +lp_build_uninterleave1(struct gallivm_state *gallivm,
>> + unsigned num_elems,
>> + LLVMValueRef a,
>> + unsigned lo_hi);
>>
>> void
>> lp_build_unpack2(struct gallivm_state *gallivm,
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> index c88dfbf..1cbe47c 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
>> @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
>> /* Ignore deprecated instructions */
>> switch (inst->Instruction.Opcode) {
>>
>> - case TGSI_OPCODE_UP2H:
>> case TGSI_OPCODE_UP2US:
>> case TGSI_OPCODE_UP4B:
>> case TGSI_OPCODE_UP4UB:
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> index 6f75bec..f6b42ee 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> @@ -45,8 +45,10 @@
>> #include "lp_bld_arit.h"
>> #include "lp_bld_bitarit.h"
>> #include "lp_bld_const.h"
>> +#include "lp_bld_conv.h"
>> #include "lp_bld_gather.h"
>> #include "lp_bld_logic.h"
>> +#include "lp_bld_pack.h"
>>
>> #include "tgsi/tgsi_exec.h"
>>
>> @@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
>> log_emit /* emit */
>> };
>>
>> +/* TGSI_OPCODE_PK2H */
>> +
>> +static void
>> +pk2h_fetch_args(
>> + struct lp_build_tgsi_context * bld_base,
>> + struct lp_build_emit_data * emit_data)
>> +{
>> + /* src0.x */
>> + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
>> + 0, TGSI_CHAN_X);
>> + /* src0.y */
>> + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
>> + 0, TGSI_CHAN_Y);
>> +}
>> +
>> +static void
>> +pk2h_emit(const struct lp_build_tgsi_action *action,
>> + struct lp_build_tgsi_context *bld_base,
>> + struct lp_build_emit_data *emit_data)
>> +{
>> + struct gallivm_state *gallivm = bld_base->base.gallivm;
>> + struct lp_type f16i_t;
>> + LLVMValueRef lo, hi, res;
>> +
>> + f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
>> + lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
>> + hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
>> + /* maybe some interleave doubling vector width would be useful... */
>> + lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length *
>> 2);
>> + hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length *
>> 2);
>> + res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
>> +
>> + emit_data->output[emit_data->chan] = res;
>> +}
>> +
>> +static struct lp_build_tgsi_action pk2h_action = {
>> + pk2h_fetch_args, /* fetch_args */
>> + pk2h_emit /* emit */
>> +};
>> +
>> +/* TGSI_OPCODE_UP2H */
>> +
>> +static void
>> +up2h_emit(const struct lp_build_tgsi_action *action,
>> + struct lp_build_tgsi_context *bld_base,
>> + struct lp_build_emit_data *emit_data)
>> +{
>> + struct gallivm_state *gallivm = bld_base->base.gallivm;
>> + LLVMBuilderRef builder = gallivm->builder;
>> + LLVMContextRef context = gallivm->context;
>> + LLVMValueRef lo, hi, res[2], arg;
>> + unsigned nr = bld_base->base.type.length;
>> + LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context),
>> nr * 2);
>> +
>> + arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
>> + lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
>> + hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
>> + res[0] = lp_build_half_to_float(gallivm, lo);
>> + res[1] = lp_build_half_to_float(gallivm, hi);
>> +
>> + emit_data->output[0] = emit_data->output[2] = res[0];
>> + emit_data->output[1] = emit_data->output[3] = res[1];
>> +}
>> +
>> +static struct lp_build_tgsi_action up2h_action = {
>> + scalar_unary_fetch_args, /* fetch_args */
>> + up2h_emit /* emit */
>> +};
>> +
>> /* TGSI_OPCODE_LRP */
>>
>> static void
>> @@ -1032,10 +1103,12 @@ lp_set_default_actions(struct
>> lp_build_tgsi_context * bld_base)
>> bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
>> bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
>> bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
>> + bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
>> bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
>> bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
>> bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
>> bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
>> + bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
>> bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
>>
>> bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args =
>> scalar_unary_fetch_args;
>>
>
>
> Otherwise series looks good.
>
> Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list