[Mesa-dev] [PATCH] gallivm: fix [IU]MUL_HI regression

Roland Scheidegger sroland at vmware.com
Tue Nov 8 16:24:11 UTC 2016


Am 08.11.2016 um 16:23 schrieb Nicolai Hähnle:
> On 08.11.2016 14:44, Roland Scheidegger wrote:
>> Sorry for breaking radeonsi, I somehow thought this path was already
>> only used for the cpu, without actually checking...
>> And thanks for fixing that typo - apparently you can pass the piglit
>> umul_hi/imul_hi tests (at least those from the shader_integer_mix group)
>> even with the square of argument a...
> 
> Yeah, it sucks that test runs take so long with llvmpipe. Is there
> anybody doing systematic full regression runs on it?
> 
> I do full runs on radeonsi fairly frequently, and I noticed this bug
> with
> tests/spec/arb_gpu_shader5/execution/built-in-functions/fs-imulExtended.shader_test
> and friends.

I do, but not that frequently. I usually do full runs with changes such
as this one, though, so I thought I had screwed up the testing.
However, looking at the assembly, that wasn't the case - glsl lowered
the imul_hi and umul_hi to a sequence of muls/adds/shifts...
The reason for that is probably the IMUL_HIGH_TO_MUL lowering - it is
done when ARB_gpu_shader5 isn't supported (which is the case for
llvmpipe).
With llvmpipe we definitely don't want the mul_hi lowering, but there's
even a comment there noting that there are no individual caps for these
lowerings (and some of the other lowered operations might not be
implemented).
So there was no way to catch this with llvmpipe - effectively only the
umul_hi was tested, called directly from the draw fetch code, not from a
shader (fwiw I did see a regression with some automated internal testing
using another api...).
(We also have automated piglit runs, however since the results change
frequently it's difficult to catch "real" regressions.)
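
For reference, a scalar sketch of roughly what such a lowered sequence
ends up computing for the unsigned case (umul_hi_lowered is just an
illustrative helper for this mail, not the actual lowering pass, and the
signed variant needs additional fixups):

#include <stdint.h>

/* Hypothetical scalar equivalent of lowering UMUL_HI to muls/adds/shifts:
 * split both operands into 16-bit halves, form the four partial products
 * and propagate the carry from the low 32 bits into the high 32 bits. */
static uint32_t
umul_hi_lowered(uint32_t a, uint32_t b)
{
   uint32_t a_lo = a & 0xffff, a_hi = a >> 16;
   uint32_t b_lo = b & 0xffff, b_hi = b >> 16;

   uint32_t lo_lo = a_lo * b_lo;   /* contributes at bit 0  */
   uint32_t lo_hi = a_lo * b_hi;   /* contributes at bit 16 */
   uint32_t hi_lo = a_hi * b_lo;   /* contributes at bit 16 */
   uint32_t hi_hi = a_hi * b_hi;   /* contributes at bit 32 */

   /* carry out of the low 32 bits; this sum stays well below 2^32 */
   uint32_t carry = ((lo_lo >> 16) + (lo_hi & 0xffff) + (hi_lo & 0xffff)) >> 16;

   return hi_hi + (lo_hi >> 16) + (hi_lo >> 16) + carry;
}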

Roland



> 
> 
>> btw as I didn't consider this, I don't know if you want to change the
>> shift/trunc to shuffle in the end - feel free to change it back if it
>> doesn't generate good code on radeonsi...
> 
> It seems instcombine has no difficulties seeing through the IR, so I
> think we're good :)
Ok. With x86 sse2 it definitely generates somewhat different assembly,
but which variant is better depends on whether sse2/sse41/avx/avx2 are
available and on the llvm version...
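
Per element the generic path simply computes the 64-bit product and
splits it - the shift/trunc variant maps directly onto this scalar view,
while the shuffle variant extracts the same halves from the bitcast
<2*length x i32> vector instead (umul_32_lohi_ref is only an
illustrative name for this mail, not gallivm code):

#include <stdint.h>

/* Reference semantics of the unsigned 32x32 -> 64 bit widening multiply:
 * returns the low 32 bits, the high 32 bits go to res_hi. */
static uint32_t
umul_32_lohi_ref(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t wide = (uint64_t)a * (uint64_t)b;
   *res_hi = (uint32_t)(wide >> 32);   /* the "shift/trunc" of the high half */
   return (uint32_t)wide;              /* low 32 bits */
}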

Roland


> 
> 
>> Reviewed-by: Roland Scheidegger <sroland at vmware.com>
> 
> Thanks!
> 
> Nicolai
> 
> 
>> Am 08.11.2016 um 10:15 schrieb Nicolai Hähnle:
>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>
>>> This patch does two things:
>>>
>>> 1. It separates the host-CPU code generation from the generic code
>>>    generation. This guards against accidentally breaking things for
>>>    radeonsi in the future.
>>>
>>> 2. It makes sure we actually use both arguments and don't just compute
>>>    a square :-p
>>>
>>> Fixes a regression introduced by commit
>>> 29279f44b3172ef3b84d470e70fc7684695ced4b
>>>
>>> Cc: Roland Scheidegger <sroland at vmware.com>
>>> ---
>>>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        | 72 ++++++++++++++--------
>>>  src/gallium/auxiliary/gallivm/lp_bld_arit.h        |  6 ++
>>>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 40 +++++++++++-
>>>  3 files changed, 90 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> index 3de4628..43ad238 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> @@ -1087,26 +1087,28 @@ lp_build_mul(struct lp_build_context *bld,
>>>              res = LLVMBuildLShr(builder, res, shift, "");
>>>        }
>>>     }
>>>
>>>     return res;
>>>  }
>>>
>>>  /*
>>>   * Widening mul, valid for 32x32 bit -> 64bit only.
>>>   * Result is low 32bits, high bits returned in res_hi.
>>> + *
>>> + * Emits code that is meant to be compiled for the host CPU.
>>>   */
>>>  LLVMValueRef
>>> -lp_build_mul_32_lohi(struct lp_build_context *bld,
>>> -                     LLVMValueRef a,
>>> -                     LLVMValueRef b,
>>> -                     LLVMValueRef *res_hi)
>>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>>> +                         LLVMValueRef a,
>>> +                         LLVMValueRef b,
>>> +                         LLVMValueRef *res_hi)
>>>  {
>>>     struct gallivm_state *gallivm = bld->gallivm;
>>>     LLVMBuilderRef builder = gallivm->builder;
>>>
>>>     assert(bld->type.width == 32);
>>>     assert(bld->type.floating == 0);
>>>     assert(bld->type.fixed == 0);
>>>     assert(bld->type.norm == 0);
>>>
>>>     /*
>>> @@ -1209,43 +1211,61 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
>>>       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>>>
>>>        for (i = 0; i < bld->type.length; i += 2) {
>>>           shuf[i] = lp_build_const_int32(gallivm, i);
>>>          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
>>>        }
>>>        shuf_vec = LLVMConstVector(shuf, bld->type.length);
>>>        return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>>>     }
>>>     else {
>>> -      LLVMValueRef tmp;
>>> -      struct lp_type type_tmp;
>>> -      LLVMTypeRef wide_type, cast_type;
>>> -
>>> -      type_tmp = bld->type;
>>> -      type_tmp.width *= 2;
>>> -      wide_type = lp_build_vec_type(gallivm, type_tmp);
>>> -      type_tmp = bld->type;
>>> -      type_tmp.length *= 2;
>>> -      cast_type = lp_build_vec_type(gallivm, type_tmp);
>>> -
>>> -      if (bld->type.sign) {
>>> -         a = LLVMBuildSExt(builder, a, wide_type, "");
>>> -         b = LLVMBuildSExt(builder, b, wide_type, "");
>>> -      } else {
>>> -         a = LLVMBuildZExt(builder, a, wide_type, "");
>>> -         b = LLVMBuildZExt(builder, b, wide_type, "");
>>> -      }
>>> -      tmp = LLVMBuildMul(builder, a, b, "");
>>> -      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>>> -      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>>> -      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>>> +      return lp_build_mul_32_lohi(bld, a, b, res_hi);
>>> +   }
>>> +}
>>> +
>>> +
>>> +/*
>>> + * Widening mul, valid for 32x32 bit -> 64bit only.
>>> + * Result is low 32bits, high bits returned in res_hi.
>>> + *
>>> + * Emits generic code.
>>> + */
>>> +LLVMValueRef
>>> +lp_build_mul_32_lohi(struct lp_build_context *bld,
>>> +                     LLVMValueRef a,
>>> +                     LLVMValueRef b,
>>> +                     LLVMValueRef *res_hi)
>>> +{
>>> +   struct gallivm_state *gallivm = bld->gallivm;
>>> +   LLVMBuilderRef builder = gallivm->builder;
>>> +   LLVMValueRef tmp;
>>> +   struct lp_type type_tmp;
>>> +   LLVMTypeRef wide_type, cast_type;
>>> +
>>> +   type_tmp = bld->type;
>>> +   type_tmp.width *= 2;
>>> +   wide_type = lp_build_vec_type(gallivm, type_tmp);
>>> +   type_tmp = bld->type;
>>> +   type_tmp.length *= 2;
>>> +   cast_type = lp_build_vec_type(gallivm, type_tmp);
>>> +
>>> +   if (bld->type.sign) {
>>> +      a = LLVMBuildSExt(builder, a, wide_type, "");
>>> +      b = LLVMBuildSExt(builder, b, wide_type, "");
>>> +   } else {
>>> +      a = LLVMBuildZExt(builder, a, wide_type, "");
>>> +      b = LLVMBuildZExt(builder, b, wide_type, "");
>>>     }
>>> +   tmp = LLVMBuildMul(builder, a, b, "");
>>> +   tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>>> +   *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>>> +   return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>>>  }
>>>
>>>
>>>  /* a * b + c */
>>>  LLVMValueRef
>>>  lp_build_mad(struct lp_build_context *bld,
>>>               LLVMValueRef a,
>>>               LLVMValueRef b,
>>>               LLVMValueRef c)
>>>  {
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> index 5d48b1c..2a4137a 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> @@ -70,20 +70,26 @@ LLVMValueRef
>>>  lp_build_sub(struct lp_build_context *bld,
>>>               LLVMValueRef a,
>>>               LLVMValueRef b);
>>>
>>>  LLVMValueRef
>>>  lp_build_mul(struct lp_build_context *bld,
>>>               LLVMValueRef a,
>>>               LLVMValueRef b);
>>>
>>>  LLVMValueRef
>>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>>> +                         LLVMValueRef a,
>>> +                         LLVMValueRef b,
>>> +                         LLVMValueRef *res_hi);
>>> +
>>> +LLVMValueRef
>>>  lp_build_mul_32_lohi(struct lp_build_context *bld,
>>>                       LLVMValueRef a,
>>>                       LLVMValueRef b,
>>>                       LLVMValueRef *res_hi);
>>>
>>>  LLVMValueRef
>>>  lp_build_mul_imm(struct lp_build_context *bld,
>>>                   LLVMValueRef a,
>>>                   int b);
>>>
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> index 72d4579..9c6fc4b 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> @@ -842,39 +842,73 @@ imul_hi_emit(
>>>     struct lp_build_tgsi_context * bld_base,
>>>     struct lp_build_emit_data * emit_data)
>>>  {
>>>     struct lp_build_context *int_bld = &bld_base->int_bld;
>>>     LLVMValueRef hi_bits;
>>>
>>>     assert(int_bld->type.width == 32);
>>>
>>>     /* low result bits are tossed away */
>>>     lp_build_mul_32_lohi(int_bld, emit_data->args[0],
>>> -                        emit_data->args[0], &hi_bits);
>>> +                        emit_data->args[1], &hi_bits);
>>> +   emit_data->output[emit_data->chan] = hi_bits;
>>> +}
>>> +
>>> +static void
>>> +imul_hi_emit_cpu(
>>> +   const struct lp_build_tgsi_action * action,
>>> +   struct lp_build_tgsi_context * bld_base,
>>> +   struct lp_build_emit_data * emit_data)
>>> +{
>>> +   struct lp_build_context *int_bld = &bld_base->int_bld;
>>> +   LLVMValueRef hi_bits;
>>> +
>>> +   assert(int_bld->type.width == 32);
>>> +
>>> +   /* low result bits are tossed away */
>>> +   lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0],
>>> +                            emit_data->args[1], &hi_bits);
>>>     emit_data->output[emit_data->chan] = hi_bits;
>>>  }
>>>
>>>  /* TGSI_OPCODE_UMUL_HI */
>>>  static void
>>>  umul_hi_emit(
>>>     const struct lp_build_tgsi_action * action,
>>>     struct lp_build_tgsi_context * bld_base,
>>>     struct lp_build_emit_data * emit_data)
>>>  {
>>>     struct lp_build_context *uint_bld = &bld_base->uint_bld;
>>>     LLVMValueRef hi_bits;
>>>
>>>     assert(uint_bld->type.width == 32);
>>>
>>>     /* low result bits are tossed away */
>>>     lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
>>> -                        emit_data->args[0], &hi_bits);
>>> +                        emit_data->args[1], &hi_bits);
>>> +   emit_data->output[emit_data->chan] = hi_bits;
>>> +}
>>> +
>>> +static void
>>> +umul_hi_emit_cpu(
>>> +   const struct lp_build_tgsi_action * action,
>>> +   struct lp_build_tgsi_context * bld_base,
>>> +   struct lp_build_emit_data * emit_data)
>>> +{
>>> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
>>> +   LLVMValueRef hi_bits;
>>> +
>>> +   assert(uint_bld->type.width == 32);
>>> +
>>> +   /* low result bits are tossed away */
>>> +   lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0],
>>> +                            emit_data->args[1], &hi_bits);
>>>     emit_data->output[emit_data->chan] = hi_bits;
>>>  }
>>>
>>>  /* TGSI_OPCODE_MAX */
>>>  static void fmax_emit(
>>>     const struct lp_build_tgsi_action * action,
>>>     struct lp_build_tgsi_context * bld_base,
>>>     struct lp_build_emit_data * emit_data)
>>>  {
>>>     LLVMBuilderRef builder = bld_base->base.gallivm->builder;
>>> @@ -2574,20 +2608,22 @@ lp_set_default_actions_cpu(
>>>     bld_base->op_actions[TGSI_OPCODE_I2F].emit = i2f_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_IABS].emit = iabs_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_IDIV].emit = idiv_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_INEG].emit = ineg_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_IMAX].emit = imax_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_IMIN].emit = imin_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_ISGE].emit = isge_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
>>> +   bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu;
>>> +   bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu;
>>>
>>>     bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_NOT].emit = not_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_OR].emit = or_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_POW].emit = pow_emit_cpu;
>>>
>>


