[Mesa-dev] [PATCH] gallivm: fix [IU]MUL_HI regression

Tue Nov 8 15:23:21 UTC 2016

On 08.11.2016 14:44, Roland Scheidegger wrote:
> Sorry for breaking radeonsi, I somehow thought this way only used for
> cpu only already, without actually checking...
> And thanks for fixing that typo, apparently you can pass piglits
> umul_hi/imul_hi tests (at least those from the shader_integer_mix group)
> even with the square of argument a...

Yeah, it sucks that test runs take so long with llvmpipe. Is there 
anybody doing systematic full regression runs on it?

I do full runs on radeonsi fairly frequently, and I noticed this bug 
with 
tests/spec/arb_gpu_shader5/execution/built-in-functions/fs-imulExtended.shader_test 
and friends.

> btw as I didn't consider this, I don't know if you want to change the
> shift/trunc to shuffle in the end - feel free to change it back if it
> doesn't generate good code on radeonsi...

It seems instcombine has no difficulties seeing through the IR, so I 
think we're good :)

> Reviewed-by: Roland Scheidegger <sroland at vmware.com>

Thanks!

Nicolai

> Am 08.11.2016 um 10:15 schrieb Nicolai Hähnle:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> This patch does two things:
>>
>> 1. It separates the host-CPU code generation from the generic code
>>    generation. This guards against accidently breaking things for
>>    radeonsi in the future.
>>
>> 2. It makes sure we actually use both arguments and don't just compute
>>    a square :-p
>>
>> Fixes a regression introduced by commit 29279f44b3172ef3b84d470e70fc7684695ced4b
>>
>> Cc: Roland Scheidegger <sroland at vmware.com>
>> ---
>>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        | 72 ++++++++++++++--------
>>  src/gallium/auxiliary/gallivm/lp_bld_arit.h        |  6 ++
>>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 40 +++++++++++-
>>  3 files changed, 90 insertions(+), 28 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> index 3de4628..43ad238 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> @@ -1087,26 +1087,28 @@ lp_build_mul(struct lp_build_context *bld,
>>              res = LLVMBuildLShr(builder, res, shift, "");
>>        }
>>     }
>>
>>     return res;
>>  }
>>
>>  /*
>>   * Widening mul, valid for 32x32 bit -> 64bit only.
>>   * Result is low 32bits, high bits returned in res_hi.
>> + *
>> + * Emits code that is meant to be compiled for the host CPU.
>>   */
>>  LLVMValueRef
>> -lp_build_mul_32_lohi(struct lp_build_context *bld,
>> -                     LLVMValueRef a,
>> -                     LLVMValueRef b,
>> -                     LLVMValueRef *res_hi)
>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>> +                         LLVMValueRef a,
>> +                         LLVMValueRef b,
>> +                         LLVMValueRef *res_hi)
>>  {
>>     struct gallivm_state *gallivm = bld->gallivm;
>>     LLVMBuilderRef builder = gallivm->builder;
>>
>>     assert(bld->type.width == 32);
>>     assert(bld->type.floating == 0);
>>     assert(bld->type.fixed == 0);
>>     assert(bld->type.norm == 0);
>>
>>     /*
>> @@ -1209,43 +1211,61 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
>>        *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>>
>>        for (i = 0; i < bld->type.length; i += 2) {
>>           shuf[i] = lp_build_const_int32(gallivm, i);
>>           shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
>>        }
>>        shuf_vec = LLVMConstVector(shuf, bld->type.length);
>>        return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>>     }
>>     else {
>> -      LLVMValueRef tmp;
>> -      struct lp_type type_tmp;
>> -      LLVMTypeRef wide_type, cast_type;
>> -
>> -      type_tmp = bld->type;
>> -      type_tmp.width *= 2;
>> -      wide_type = lp_build_vec_type(gallivm, type_tmp);
>> -      type_tmp = bld->type;
>> -      type_tmp.length *= 2;
>> -      cast_type = lp_build_vec_type(gallivm, type_tmp);
>> -
>> -      if (bld->type.sign) {
>> -         a = LLVMBuildSExt(builder, a, wide_type, "");
>> -         b = LLVMBuildSExt(builder, b, wide_type, "");
>> -      } else {
>> -         a = LLVMBuildZExt(builder, a, wide_type, "");
>> -         b = LLVMBuildZExt(builder, b, wide_type, "");
>> -      }
>> -      tmp = LLVMBuildMul(builder, a, b, "");
>> -      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>> -      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>> -      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>> +      return lp_build_mul_32_lohi(bld, a, b, res_hi);
>> +   }
>> +}
>> +
>> +
>> +/*
>> + * Widening mul, valid for 32x32 bit -> 64bit only.
>> + * Result is low 32bits, high bits returned in res_hi.
>> + *
>> + * Emits generic code.
>> + */
>> +LLVMValueRef
>> +lp_build_mul_32_lohi(struct lp_build_context *bld,
>> +                     LLVMValueRef a,
>> +                     LLVMValueRef b,
>> +                     LLVMValueRef *res_hi)
>> +{
>> +   struct gallivm_state *gallivm = bld->gallivm;
>> +   LLVMBuilderRef builder = gallivm->builder;
>> +   LLVMValueRef tmp;
>> +   struct lp_type type_tmp;
>> +   LLVMTypeRef wide_type, cast_type;
>> +
>> +   type_tmp = bld->type;
>> +   type_tmp.width *= 2;
>> +   wide_type = lp_build_vec_type(gallivm, type_tmp);
>> +   type_tmp = bld->type;
>> +   type_tmp.length *= 2;
>> +   cast_type = lp_build_vec_type(gallivm, type_tmp);
>> +
>> +   if (bld->type.sign) {
>> +      a = LLVMBuildSExt(builder, a, wide_type, "");
>> +      b = LLVMBuildSExt(builder, b, wide_type, "");
>> +   } else {
>> +      a = LLVMBuildZExt(builder, a, wide_type, "");
>> +      b = LLVMBuildZExt(builder, b, wide_type, "");
>>     }
>> +   tmp = LLVMBuildMul(builder, a, b, "");
>> +   tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>> +   *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>> +   return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>>  }
>>
>>
>>  /* a * b + c */
>>  LLVMValueRef
>>  lp_build_mad(struct lp_build_context *bld,
>>               LLVMValueRef a,
>>               LLVMValueRef b,
>>               LLVMValueRef c)
>>  {
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> index 5d48b1c..2a4137a 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> @@ -70,20 +70,26 @@ LLVMValueRef
>>  lp_build_sub(struct lp_build_context *bld,
>>               LLVMValueRef a,
>>               LLVMValueRef b);
>>
>>  LLVMValueRef
>>  lp_build_mul(struct lp_build_context *bld,
>>               LLVMValueRef a,
>>               LLVMValueRef b);
>>
>>  LLVMValueRef
>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>> +                         LLVMValueRef a,
>> +                         LLVMValueRef b,
>> +                         LLVMValueRef *res_hi);
>> +
>> +LLVMValueRef
>>  lp_build_mul_32_lohi(struct lp_build_context *bld,
>>                       LLVMValueRef a,
>>                       LLVMValueRef b,
>>                       LLVMValueRef *res_hi);
>>
>>  LLVMValueRef
>>  lp_build_mul_imm(struct lp_build_context *bld,
>>                   LLVMValueRef a,
>>                   int b);
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> index 72d4579..9c6fc4b 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> @@ -842,39 +842,73 @@ imul_hi_emit(
>>     struct lp_build_tgsi_context * bld_base,
>>     struct lp_build_emit_data * emit_data)
>>  {
>>     struct lp_build_context *int_bld = &bld_base->int_bld;
>>     LLVMValueRef hi_bits;
>>
>>     assert(int_bld->type.width == 32);
>>
>>     /* low result bits are tossed away */
>>     lp_build_mul_32_lohi(int_bld, emit_data->args[0],
>> -                        emit_data->args[0], &hi_bits);
>> +                        emit_data->args[1], &hi_bits);
>> +   emit_data->output[emit_data->chan] = hi_bits;
>> +}
>> +
>> +static void
>> +imul_hi_emit_cpu(
>> +   const struct lp_build_tgsi_action * action,
>> +   struct lp_build_tgsi_context * bld_base,
>> +   struct lp_build_emit_data * emit_data)
>> +{
>> +   struct lp_build_context *int_bld = &bld_base->int_bld;
>> +   LLVMValueRef hi_bits;
>> +
>> +   assert(int_bld->type.width == 32);
>> +
>> +   /* low result bits are tossed away */
>> +   lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0],
>> +                            emit_data->args[1], &hi_bits);
>>     emit_data->output[emit_data->chan] = hi_bits;
>>  }
>>
>>  /* TGSI_OPCODE_UMUL_HI */
>>  static void
>>  umul_hi_emit(
>>     const struct lp_build_tgsi_action * action,
>>     struct lp_build_tgsi_context * bld_base,
>>     struct lp_build_emit_data * emit_data)
>>  {
>>     struct lp_build_context *uint_bld = &bld_base->uint_bld;
>>     LLVMValueRef hi_bits;
>>
>>     assert(uint_bld->type.width == 32);
>>
>>     /* low result bits are tossed away */
>>     lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
>> -                        emit_data->args[0], &hi_bits);
>> +                        emit_data->args[1], &hi_bits);
>> +   emit_data->output[emit_data->chan] = hi_bits;
>> +}
>> +
>> +static void
>> +umul_hi_emit_cpu(
>> +   const struct lp_build_tgsi_action * action,
>> +   struct lp_build_tgsi_context * bld_base,
>> +   struct lp_build_emit_data * emit_data)
>> +{
>> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
>> +   LLVMValueRef hi_bits;
>> +
>> +   assert(uint_bld->type.width == 32);
>> +
>> +   /* low result bits are tossed away */
>> +   lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0],
>> +                            emit_data->args[1], &hi_bits);
>>     emit_data->output[emit_data->chan] = hi_bits;
>>  }
>>
>>  /* TGSI_OPCODE_MAX */
>>  static void fmax_emit(
>>     const struct lp_build_tgsi_action * action,
>>     struct lp_build_tgsi_context * bld_base,
>>     struct lp_build_emit_data * emit_data)
>>  {
>>     LLVMBuilderRef builder = bld_base->base.gallivm->builder;
>> @@ -2574,20 +2608,22 @@ lp_set_default_actions_cpu(
>>     bld_base->op_actions[TGSI_OPCODE_I2F].emit = i2f_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_IABS].emit = iabs_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_IDIV].emit = idiv_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_INEG].emit = ineg_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_IMAX].emit = imax_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_IMIN].emit = imin_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_ISGE].emit = isge_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
>> +   bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu;
>> +   bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu;
>>
>>     bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_NOT].emit = not_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_OR].emit = or_emit_cpu;
>>     bld_base->op_actions[TGSI_OPCODE_POW].emit = pow_emit_cpu;
>>
>