[Mesa-dev] [PATCH v2 3/5] r600: implement DDIV
Nicolai Hähnle
nhaehnle at gmail.com
Thu Jan 19 17:35:27 UTC 2017
On 19.01.2017 17:39, Roland Scheidegger wrote:
> Double-capable Evergreen/NI can't do ddiv? Interesting. I wonder how
> it's made d3d11 double conformant...
Even GCN has no native DDIV. It's implemented as a bunch of bespoke
instructions for scale-shifted DRCP followed by what is effectively a
kind of Newton-Raphson iteration. We'd have to do that for r600 as well,
but since I can't even test it, I couldn't be bothered...
Nicolai
>
> Roland
>
> Am 19.01.2017 um 14:59 schrieb Nicolai Hähnle:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> ---
>> src/gallium/drivers/r600/r600_shader.c | 59 ++++++++++++++++++++++++++++++++++
>> 1 file changed, 59 insertions(+)
>>
>> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
>> index 5c4bc91..eaabb04 100644
>> --- a/src/gallium/drivers/r600/r600_shader.c
>> +++ b/src/gallium/drivers/r600/r600_shader.c
>> @@ -4384,20 +4384,77 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
>> alu.last = 1;
>> r = r600_bytecode_add_alu(ctx->bc, &alu);
>> if (r)
>> return r;
>> }
>>
>> return 0;
>> }
>>
>> /*
>> + * Emit RECIP_64 + MUL_64 to implement division.
>> + */
>> +static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
>> +{
>> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
>> + int r;
>> + struct r600_bytecode_alu alu;
>> + int t1 = ctx->temp_reg;
>> + int k;
>> +
>> + /* Only support one double at a time. This is the same constraint as
>> + * in DMUL lowering. */
>> + assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
>> + inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
>> +
>> + k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
>> +
>> + r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
>> + if (r)
>> + return r;
>> +
>> + for (int i = 0; i < 4; i++) {
>> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>> + alu.op = ALU_OP2_MUL_64;
>> +
>> + r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
>> +
>> + alu.src[1].sel = t1;
>> + alu.src[1].chan = (i == 3) ? 0 : 1;
>> +
>> + alu.dst.sel = t1;
>> + alu.dst.chan = i;
>> + alu.dst.write = 1;
>> + if (i == 3)
>> + alu.last = 1;
>> + r = r600_bytecode_add_alu(ctx->bc, &alu);
>> + if (r)
>> + return r;
>> + }
>> +
>> + for (int i = 0; i < 2; i++) {
>> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>> + alu.op = ALU_OP1_MOV;
>> + alu.src[0].sel = t1;
>> + alu.src[0].chan = i;
>> + tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
>> + alu.dst.write = 1;
>> + if (i == 1)
>> + alu.last = 1;
>> + r = r600_bytecode_add_alu(ctx->bc, &alu);
>> + if (r)
>> + return r;
>> + }
>> + return 0;
>> +}
>> +
>> +/*
>> * r600 - trunc to -PI..PI range
>> * r700 - normalize by dividing by 2PI
>> * see fdo bug 27901
>> */
>> static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
>> {
>> int r;
>> struct r600_bytecode_alu alu;
>>
>> memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>> @@ -9393,20 +9450,21 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
>> [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
>> [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
>> [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
>> [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
>> [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
>> [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
>> + [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
>> [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
>> [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
>> [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
>> [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
>> [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
>> [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
>> [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
>> @@ -9615,20 +9673,21 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
>> [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
>> [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
>> [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
>> [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
>> [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
>> [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
>> [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
>> + [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
>> [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
>> [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
>> [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
>> [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
>> [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
>> [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
>> [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
>> [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
>>
>
More information about the mesa-dev mailing list