[Mesa-dev] [PATCH] gallivm: fix [IU]MUL_HI regression
Nicolai Hähnle
nhaehnle at gmail.com
Tue Nov 8 15:23:21 UTC 2016
On 08.11.2016 14:44, Roland Scheidegger wrote:
> Sorry for breaking radeonsi, I somehow thought this way only used for
> cpu only already, without actually checking...
> And thanks for fixing that typo, apparently you can pass piglits
> umul_hi/imul_hi tests (at least those from the shader_integer_mix group)
> even with the square of argument a...
Yeah, it sucks that test runs take so long with llvmpipe. Is there
anybody doing systematic full regression runs on it?
I do full runs on radeonsi fairly frequently, and I noticed this bug
with
tests/spec/arb_gpu_shader5/execution/built-in-functions/fs-imulExtended.shader_test
and friends.
> btw as I didn't consider this, I don't know if you want to change the
> shift/trunc to shuffle in the end - feel free to change it back if it
> doesn't generate good code on radeonsi...
It seems instcombine has no difficulties seeing through the IR, so I
think we're good :)
> Reviewed-by: Roland Scheidegger <sroland at vmware.com>
Thanks!
Nicolai
> Am 08.11.2016 um 10:15 schrieb Nicolai Hähnle:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> This patch does two things:
>>
>> 1. It separates the host-CPU code generation from the generic code
>> generation. This guards against accidently breaking things for
>> radeonsi in the future.
>>
>> 2. It makes sure we actually use both arguments and don't just compute
>> a square :-p
>>
>> Fixes a regression introduced by commit 29279f44b3172ef3b84d470e70fc7684695ced4b
>>
>> Cc: Roland Scheidegger <sroland at vmware.com>
>> ---
>> src/gallium/auxiliary/gallivm/lp_bld_arit.c | 72 ++++++++++++++--------
>> src/gallium/auxiliary/gallivm/lp_bld_arit.h | 6 ++
>> src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 40 +++++++++++-
>> 3 files changed, 90 insertions(+), 28 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> index 3de4628..43ad238 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>> @@ -1087,26 +1087,28 @@ lp_build_mul(struct lp_build_context *bld,
>> res = LLVMBuildLShr(builder, res, shift, "");
>> }
>> }
>>
>> return res;
>> }
>>
>> /*
>> * Widening mul, valid for 32x32 bit -> 64bit only.
>> * Result is low 32bits, high bits returned in res_hi.
>> + *
>> + * Emits code that is meant to be compiled for the host CPU.
>> */
>> LLVMValueRef
>> -lp_build_mul_32_lohi(struct lp_build_context *bld,
>> - LLVMValueRef a,
>> - LLVMValueRef b,
>> - LLVMValueRef *res_hi)
>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>> + LLVMValueRef a,
>> + LLVMValueRef b,
>> + LLVMValueRef *res_hi)
>> {
>> struct gallivm_state *gallivm = bld->gallivm;
>> LLVMBuilderRef builder = gallivm->builder;
>>
>> assert(bld->type.width == 32);
>> assert(bld->type.floating == 0);
>> assert(bld->type.fixed == 0);
>> assert(bld->type.norm == 0);
>>
>> /*
>> @@ -1209,43 +1211,61 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
>> *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>>
>> for (i = 0; i < bld->type.length; i += 2) {
>> shuf[i] = lp_build_const_int32(gallivm, i);
>> shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
>> }
>> shuf_vec = LLVMConstVector(shuf, bld->type.length);
>> return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
>> }
>> else {
>> - LLVMValueRef tmp;
>> - struct lp_type type_tmp;
>> - LLVMTypeRef wide_type, cast_type;
>> -
>> - type_tmp = bld->type;
>> - type_tmp.width *= 2;
>> - wide_type = lp_build_vec_type(gallivm, type_tmp);
>> - type_tmp = bld->type;
>> - type_tmp.length *= 2;
>> - cast_type = lp_build_vec_type(gallivm, type_tmp);
>> -
>> - if (bld->type.sign) {
>> - a = LLVMBuildSExt(builder, a, wide_type, "");
>> - b = LLVMBuildSExt(builder, b, wide_type, "");
>> - } else {
>> - a = LLVMBuildZExt(builder, a, wide_type, "");
>> - b = LLVMBuildZExt(builder, b, wide_type, "");
>> - }
>> - tmp = LLVMBuildMul(builder, a, b, "");
>> - tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>> - *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>> - return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>> + return lp_build_mul_32_lohi(bld, a, b, res_hi);
>> + }
>> +}
>> +
>> +
>> +/*
>> + * Widening mul, valid for 32x32 bit -> 64bit only.
>> + * Result is low 32bits, high bits returned in res_hi.
>> + *
>> + * Emits generic code.
>> + */
>> +LLVMValueRef
>> +lp_build_mul_32_lohi(struct lp_build_context *bld,
>> + LLVMValueRef a,
>> + LLVMValueRef b,
>> + LLVMValueRef *res_hi)
>> +{
>> + struct gallivm_state *gallivm = bld->gallivm;
>> + LLVMBuilderRef builder = gallivm->builder;
>> + LLVMValueRef tmp;
>> + struct lp_type type_tmp;
>> + LLVMTypeRef wide_type, cast_type;
>> +
>> + type_tmp = bld->type;
>> + type_tmp.width *= 2;
>> + wide_type = lp_build_vec_type(gallivm, type_tmp);
>> + type_tmp = bld->type;
>> + type_tmp.length *= 2;
>> + cast_type = lp_build_vec_type(gallivm, type_tmp);
>> +
>> + if (bld->type.sign) {
>> + a = LLVMBuildSExt(builder, a, wide_type, "");
>> + b = LLVMBuildSExt(builder, b, wide_type, "");
>> + } else {
>> + a = LLVMBuildZExt(builder, a, wide_type, "");
>> + b = LLVMBuildZExt(builder, b, wide_type, "");
>> }
>> + tmp = LLVMBuildMul(builder, a, b, "");
>> + tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
>> + *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
>> + return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
>> }
>>
>>
>> /* a * b + c */
>> LLVMValueRef
>> lp_build_mad(struct lp_build_context *bld,
>> LLVMValueRef a,
>> LLVMValueRef b,
>> LLVMValueRef c)
>> {
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> index 5d48b1c..2a4137a 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>> @@ -70,20 +70,26 @@ LLVMValueRef
>> lp_build_sub(struct lp_build_context *bld,
>> LLVMValueRef a,
>> LLVMValueRef b);
>>
>> LLVMValueRef
>> lp_build_mul(struct lp_build_context *bld,
>> LLVMValueRef a,
>> LLVMValueRef b);
>>
>> LLVMValueRef
>> +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
>> + LLVMValueRef a,
>> + LLVMValueRef b,
>> + LLVMValueRef *res_hi);
>> +
>> +LLVMValueRef
>> lp_build_mul_32_lohi(struct lp_build_context *bld,
>> LLVMValueRef a,
>> LLVMValueRef b,
>> LLVMValueRef *res_hi);
>>
>> LLVMValueRef
>> lp_build_mul_imm(struct lp_build_context *bld,
>> LLVMValueRef a,
>> int b);
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> index 72d4579..9c6fc4b 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>> @@ -842,39 +842,73 @@ imul_hi_emit(
>> struct lp_build_tgsi_context * bld_base,
>> struct lp_build_emit_data * emit_data)
>> {
>> struct lp_build_context *int_bld = &bld_base->int_bld;
>> LLVMValueRef hi_bits;
>>
>> assert(int_bld->type.width == 32);
>>
>> /* low result bits are tossed away */
>> lp_build_mul_32_lohi(int_bld, emit_data->args[0],
>> - emit_data->args[0], &hi_bits);
>> + emit_data->args[1], &hi_bits);
>> + emit_data->output[emit_data->chan] = hi_bits;
>> +}
>> +
>> +static void
>> +imul_hi_emit_cpu(
>> + const struct lp_build_tgsi_action * action,
>> + struct lp_build_tgsi_context * bld_base,
>> + struct lp_build_emit_data * emit_data)
>> +{
>> + struct lp_build_context *int_bld = &bld_base->int_bld;
>> + LLVMValueRef hi_bits;
>> +
>> + assert(int_bld->type.width == 32);
>> +
>> + /* low result bits are tossed away */
>> + lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0],
>> + emit_data->args[1], &hi_bits);
>> emit_data->output[emit_data->chan] = hi_bits;
>> }
>>
>> /* TGSI_OPCODE_UMUL_HI */
>> static void
>> umul_hi_emit(
>> const struct lp_build_tgsi_action * action,
>> struct lp_build_tgsi_context * bld_base,
>> struct lp_build_emit_data * emit_data)
>> {
>> struct lp_build_context *uint_bld = &bld_base->uint_bld;
>> LLVMValueRef hi_bits;
>>
>> assert(uint_bld->type.width == 32);
>>
>> /* low result bits are tossed away */
>> lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
>> - emit_data->args[0], &hi_bits);
>> + emit_data->args[1], &hi_bits);
>> + emit_data->output[emit_data->chan] = hi_bits;
>> +}
>> +
>> +static void
>> +umul_hi_emit_cpu(
>> + const struct lp_build_tgsi_action * action,
>> + struct lp_build_tgsi_context * bld_base,
>> + struct lp_build_emit_data * emit_data)
>> +{
>> + struct lp_build_context *uint_bld = &bld_base->uint_bld;
>> + LLVMValueRef hi_bits;
>> +
>> + assert(uint_bld->type.width == 32);
>> +
>> + /* low result bits are tossed away */
>> + lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0],
>> + emit_data->args[1], &hi_bits);
>> emit_data->output[emit_data->chan] = hi_bits;
>> }
>>
>> /* TGSI_OPCODE_MAX */
>> static void fmax_emit(
>> const struct lp_build_tgsi_action * action,
>> struct lp_build_tgsi_context * bld_base,
>> struct lp_build_emit_data * emit_data)
>> {
>> LLVMBuilderRef builder = bld_base->base.gallivm->builder;
>> @@ -2574,20 +2608,22 @@ lp_set_default_actions_cpu(
>> bld_base->op_actions[TGSI_OPCODE_I2F].emit = i2f_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_IABS].emit = iabs_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_IDIV].emit = idiv_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_INEG].emit = ineg_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_IMAX].emit = imax_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_IMIN].emit = imin_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_ISGE].emit = isge_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
>> + bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu;
>> + bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu;
>>
>> bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_NOT].emit = not_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_OR].emit = or_emit_cpu;
>> bld_base->op_actions[TGSI_OPCODE_POW].emit = pow_emit_cpu;
>>
>
More information about the mesa-dev
mailing list