[Mesa-dev] [PATCH] gallivm: fix [IU]MUL_HI regression

Tue Nov 8 13:44:49 UTC 2016

Sorry for breaking radeonsi, I somehow thought this way only used for
cpu only already, without actually checking...
And thanks for fixing that typo, apparently you can pass piglits
umul_hi/imul_hi tests (at least those from the shader_integer_mix group)
even with the square of argument a...
btw as I didn't consider this, I don't know if you want to change the
shift/trunc to shuffle in the end - feel free to change it back if it
doesn't generate good code on radeonsi...

Reviewed-by: Roland Scheidegger <sroland at vmware.com>

Am 08.11.2016 um 10:15 schrieb Nicolai Hähnle:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
> 
> This patch does two things:
> 
> 1. It separates the host-CPU code generation from the generic code
>    generation. This guards against accidently breaking things for
>    radeonsi in the future.
> 
> 2. It makes sure we actually use both arguments and don't just compute
>    a square :-p
> 
> Fixes a regression introduced by commit 29279f44b3172ef3b84d470e70fc7684695ced4b
> 
> Cc: Roland Scheidegger <sroland at vmware.com>
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        | 72 ++++++++++++++--------
>  src/gallium/auxiliary/gallivm/lp_bld_arit.h        |  6 ++
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 40 +++++++++++-
>  3 files changed, 90 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 3de4628..43ad238 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -1087,26 +1087,28 @@ lp_build_mul(struct lp_build_context *bld,
>              res = LLVMBuildLShr(builder, res, shift, "");
>        }
>     }
>  
>     return res;
>  }
>  
>  /*
>   * Widening mul, valid for 32x32 bit -> 64bit only.
>   * Result is low 32bits, high bits returned in res_hi.
> + *
> + * Emits code that is meant to be compiled for the host CPU.
>   */
>  LLVMValueRef
> -lp_build_mul_32_lohi(struct lp_build_context *bld,
> -                     LLVMValueRef a,
> -                     LLVMValueRef b,
> -                     LLVMValueRef *res_hi)
> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
> +                         LLVMValueRef a,
> +                         LLVMValueRef b,
> +                         LLVMValueRef *res_hi)
>  {
>     struct gallivm_state *gallivm = bld->gallivm;
>     LLVMBuilderRef builder = gallivm->builder;
>  
>     assert(bld->type.width == 32);
>     assert(bld->type.floating == 0);
>     assert(bld->type.fixed == 0);
>     assert(bld->type.norm == 0);
>  
>     /*
> @@ -1209,43 +1211,61 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
>        *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>  
>        for (i = 0; i < bld->type.length; i += 2) {
>           shuf[i] = lp_build_const_int32(gallivm, i);
>           shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
>        }
>        shuf_vec = LLVMConstVector(shuf, bld->type.length);
>        return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>     }
>     else {
> -      LLVMValueRef tmp;
> -      struct lp_type type_tmp;
> -      LLVMTypeRef wide_type, cast_type;
> -
> -      type_tmp = bld->type;
> -      type_tmp.width *= 2;
> -      wide_type = lp_build_vec_type(gallivm, type_tmp);
> -      type_tmp = bld->type;
> -      type_tmp.length *= 2;
> -      cast_type = lp_build_vec_type(gallivm, type_tmp);
> -
> -      if (bld->type.sign) {
> -         a = LLVMBuildSExt(builder, a, wide_type, "");
> -         b = LLVMBuildSExt(builder, b, wide_type, "");
> -      } else {
> -         a = LLVMBuildZExt(builder, a, wide_type, "");
> -         b = LLVMBuildZExt(builder, b, wide_type, "");
> -      }
> -      tmp = LLVMBuildMul(builder, a, b, "");
> -      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
> -      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
> -      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
> +      return lp_build_mul_32_lohi(bld, a, b, res_hi);
> +   }
> +}
> +
> +
> +/*
> + * Widening mul, valid for 32x32 bit -> 64bit only.
> + * Result is low 32bits, high bits returned in res_hi.
> + *
> + * Emits generic code.
> + */
> +LLVMValueRef
> +lp_build_mul_32_lohi(struct lp_build_context *bld,
> +                     LLVMValueRef a,
> +                     LLVMValueRef b,
> +                     LLVMValueRef *res_hi)
> +{
> +   struct gallivm_state *gallivm = bld->gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   LLVMValueRef tmp;
> +   struct lp_type type_tmp;
> +   LLVMTypeRef wide_type, cast_type;
> +
> +   type_tmp = bld->type;
> +   type_tmp.width *= 2;
> +   wide_type = lp_build_vec_type(gallivm, type_tmp);
> +   type_tmp = bld->type;
> +   type_tmp.length *= 2;
> +   cast_type = lp_build_vec_type(gallivm, type_tmp);
> +
> +   if (bld->type.sign) {
> +      a = LLVMBuildSExt(builder, a, wide_type, "");
> +      b = LLVMBuildSExt(builder, b, wide_type, "");
> +   } else {
> +      a = LLVMBuildZExt(builder, a, wide_type, "");
> +      b = LLVMBuildZExt(builder, b, wide_type, "");
>     }
> +   tmp = LLVMBuildMul(builder, a, b, "");
> +   tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
> +   *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
> +   return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>  }
>  
>  
>  /* a * b + c */
>  LLVMValueRef
>  lp_build_mad(struct lp_build_context *bld,
>               LLVMValueRef a,
>               LLVMValueRef b,
>               LLVMValueRef c)
>  {
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> index 5d48b1c..2a4137a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> @@ -70,20 +70,26 @@ LLVMValueRef
>  lp_build_sub(struct lp_build_context *bld,
>               LLVMValueRef a,
>               LLVMValueRef b);
>  
>  LLVMValueRef
>  lp_build_mul(struct lp_build_context *bld,
>               LLVMValueRef a,
>               LLVMValueRef b);
>  
>  LLVMValueRef
> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
> +                         LLVMValueRef a,
> +                         LLVMValueRef b,
> +                         LLVMValueRef *res_hi);
> +
> +LLVMValueRef
>  lp_build_mul_32_lohi(struct lp_build_context *bld,
>                       LLVMValueRef a,
>                       LLVMValueRef b,
>                       LLVMValueRef *res_hi);
>  
>  LLVMValueRef
>  lp_build_mul_imm(struct lp_build_context *bld,
>                   LLVMValueRef a,
>                   int b);
>  
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 72d4579..9c6fc4b 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -842,39 +842,73 @@ imul_hi_emit(
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     struct lp_build_context *int_bld = &bld_base->int_bld;
>     LLVMValueRef hi_bits;
>  
>     assert(int_bld->type.width == 32);
>  
>     /* low result bits are tossed away */
>     lp_build_mul_32_lohi(int_bld, emit_data->args[0],
> -                        emit_data->args[0], &hi_bits);
> +                        emit_data->args[1], &hi_bits);
> +   emit_data->output[emit_data->chan] = hi_bits;
> +}
> +
> +static void
> +imul_hi_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   struct lp_build_context *int_bld = &bld_base->int_bld;
> +   LLVMValueRef hi_bits;
> +
> +   assert(int_bld->type.width == 32);
> +
> +   /* low result bits are tossed away */
> +   lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0],
> +                            emit_data->args[1], &hi_bits);
>     emit_data->output[emit_data->chan] = hi_bits;
>  }
>  
>  /* TGSI_OPCODE_UMUL_HI */
>  static void
>  umul_hi_emit(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     struct lp_build_context *uint_bld = &bld_base->uint_bld;
>     LLVMValueRef hi_bits;
>  
>     assert(uint_bld->type.width == 32);
>  
>     /* low result bits are tossed away */
>     lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
> -                        emit_data->args[0], &hi_bits);
> +                        emit_data->args[1], &hi_bits);
> +   emit_data->output[emit_data->chan] = hi_bits;
> +}
> +
> +static void
> +umul_hi_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
> +   LLVMValueRef hi_bits;
> +
> +   assert(uint_bld->type.width == 32);
> +
> +   /* low result bits are tossed away */
> +   lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0],
> +                            emit_data->args[1], &hi_bits);
>     emit_data->output[emit_data->chan] = hi_bits;
>  }
>  
>  /* TGSI_OPCODE_MAX */
>  static void fmax_emit(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> @@ -2574,20 +2608,22 @@ lp_set_default_actions_cpu(
>     bld_base->op_actions[TGSI_OPCODE_I2F].emit = i2f_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_IABS].emit = iabs_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_IDIV].emit = idiv_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_INEG].emit = ineg_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_IMAX].emit = imax_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_IMIN].emit = imin_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_ISGE].emit = isge_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu;
>  
>     bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_NOT].emit = not_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_OR].emit = or_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_POW].emit = pow_emit_cpu;
>