[Mesa-dev] [PATCH] r600/fp64: fix integer->double conversion

Fri Feb 2 16:38:04 UTC 2018

Am 02.02.2018 um 05:56 schrieb Dave Airlie:
> From: Dave Airlie <airlied at redhat.com>
> 
> Doing a straight uint/int->fp32->fp64 conversion causes
> some precision issues, because fp32 cannot represent every
> 32-bit integer exactly. Roland suggested splitting the
> integer into two portions, doing two separate
> int->fp32->fp64 conversions, and then adding the results.
> 
> This passes the tests in CTS and piglit.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/gallium/drivers/r600/r600_shader.c | 118 +++++++++++++++++++++++++--------
>  1 file changed, 90 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 13aa681049..22f2736b03 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -4490,44 +4490,106 @@ static int egcm_int_to_double(struct r600_shader_ctx *ctx)
>  {
>  	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
>  	struct r600_bytecode_alu alu;
> -	int i, r;
> -	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +	int i, c, r;
> +	int write_mask = inst->Dst[0].Register.WriteMask;
> +	int temp_reg = r600_get_temp(ctx);
>  
>  	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
>  		inst->Instruction.Opcode == TGSI_OPCODE_U2D);
>  
> -	for (i = 0; i <= (lasti+1)/2; i++) {
> -		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> -		alu.op = ctx->inst_info->op;
> -
> -		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
> -		alu.dst.sel = ctx->temp_reg;
> -		alu.dst.chan = i;
> -		alu.dst.write = 1;
> -		alu.last = 1;
> +	for (c = 0; c < 2; c++) {
> +		int dchan = c * 2;
> +		if (write_mask & (0x3 << dchan)) {
> +	/* split into 24-bit int and 8-bit int */
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP2_AND_INT;
> +			alu.dst.sel = temp_reg;
> +			alu.dst.chan = dchan;
> +			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
> +			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +			alu.src[1].value = 0xffffff00;
> +			alu.dst.write = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
>  
> -		r = r600_bytecode_add_alu(ctx->bc, &alu);
> -		if (r)
> -			return r;
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP2_AND_INT;
> +			alu.dst.sel = temp_reg;
> +			alu.dst.chan = dchan + 1;
> +			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
> +			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +			alu.src[1].value = 0xff;
> +			alu.dst.write = 1;
> +			alu.last = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +		}
>  	}
>  
> -	for (i = 0; i <= lasti; i++) {
> -		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> -		alu.op = ALU_OP1_FLT32_TO_FLT64;
> +	for (c = 0; c < 2; c++) {
> +		int dchan = c * 2;
> +		if (write_mask & (0x3 << dchan)) {
> +			for (i = dchan; i <= dchan + 1; i++) {
> +				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;
>  
> -		alu.src[0].chan = i/2;
> -		if (i%2 == 0)
> -			alu.src[0].sel = ctx->temp_reg;
> -		else {
> -			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> -			alu.src[0].value = 0x0;
> +				alu.src[0].sel = temp_reg;
> +				alu.src[0].chan = i;
> +				alu.dst.sel = temp_reg;
> +				alu.dst.chan = i;
> +				alu.dst.write = 1;
> +				alu.last = i == dchan + 1;
> +
> +				r = r600_bytecode_add_alu(ctx->bc, &alu);
> +				if (r)
> +					return r;
> +			}
>  		}
That'll still work on Evergreen (Cypress), where UINT_TO_FLT is scalar, right?
I just realized that for the low 8 bits you could actually skip the
masking and use UBYTE0_FLT instead, if that instruction does what the
docs say :-). I guess that won't be much of an improvement on Cayman,
but it might shave off another instruction or two on Cypress
(as UBYTE0_FLT is a vector instruction)...
In any case,

Reviewed-by: Roland Scheidegger <sroland at vmware.com>

> -		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> -		alu.last = i == lasti;
> +	}
>  
> -		r = r600_bytecode_add_alu(ctx->bc, &alu);
> -		if (r)
> -			return r;
> +	for (c = 0; c < 2; c++) {
> +		int dchan = c * 2;
> +		if (write_mask & (0x3 << dchan)) {
> +			for (i = 0; i < 4; i++) {
> +				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +				alu.op = ALU_OP1_FLT32_TO_FLT64;
> +
> +				alu.src[0].chan = dchan + (i / 2);
> +				if (i == 0 || i == 2)
> +					alu.src[0].sel = temp_reg;
> +				else {
> +					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> +					alu.src[0].value = 0x0;
> +				}
> +				alu.dst.sel = ctx->temp_reg;
> +				alu.dst.chan = i;
> +				alu.last = i == 3;
> +				alu.dst.write = 1;
> +
> +				r = r600_bytecode_add_alu(ctx->bc, &alu);
> +				if (r)
> +					return r;
> +			}
> +
> +			for (i = 0; i <= 1; i++) {
> +				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +				alu.op = ALU_OP2_ADD_64;
> +
> +				alu.src[0].chan = fp64_switch(i);
> +				alu.src[0].sel = ctx->temp_reg;
> +
> +				alu.src[1].chan = fp64_switch(i + 2);
> +				alu.src[1].sel = ctx->temp_reg;
> +				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
> +				alu.last = i == 1;
> +
> +				r = r600_bytecode_add_alu(ctx->bc, &alu);
> +				if (r)
> +					return r;
> +			}
> +		}
>  	}
>  
>  	return 0;
>