[Mesa-dev] [PATCH 2/2] r600g: add doubles support for CAYMAN

Thu Feb 19 18:59:13 PST 2015

On Fri, 20 Feb 2015 01:54:03 +0100, Dave Airlie <airlied at gmail.com> wrote:

> From: Dave Airlie <airlied at redhat.com>
>
> Only a subset of AMD GPUs supported by r600g support doubles,
> CAYMAN and CYPRESS are probably all we'll try and support, however
> I don't have a CYPRESS so ignore that for now.
>
> This disables SB support for doubles, as we think we need to
> make the scheduler smarter to introduce delay slots.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/gallium/drivers/r600/r600_asm.c    |  14 ++
>  src/gallium/drivers/r600/r600_asm.h    |  15 ++
>  src/gallium/drivers/r600/r600_isa.h    |   8 +-
>  src/gallium/drivers/r600/r600_pipe.c   |   2 +
>  src/gallium/drivers/r600/r600_shader.c | 389  
> ++++++++++++++++++++++++++++++++-
>  src/gallium/drivers/r600/r600_shader.h |   2 +
>  6 files changed, 424 insertions(+), 6 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_asm.c  
> b/src/gallium/drivers/r600/r600_asm.c
> index 79e7f74..dc26b63 100644
> --- a/src/gallium/drivers/r600/r600_asm.c
> +++ b/src/gallium/drivers/r600/r600_asm.c
> @@ -252,6 +252,12 @@ static int alu_uses_rel(struct r600_bytecode *bc,  
> struct r600_bytecode_alu *alu)
>  	return 0;
>  }
> +static int is_alu_64bit_inst(struct r600_bytecode *bc, struct  
> r600_bytecode_alu *alu)
> +{
> +	const struct alu_op_info *op = r600_isa_alu(alu->op);
> +	return (op->flags & AF_64);
> +}
> +
>  static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct  
> r600_bytecode_alu *alu)
>  {
>  	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
> @@ -576,6 +582,12 @@ static int replace_gpr_with_pv_ps(struct  
> r600_bytecode *bc,
> 	for (i = 0; i < max_slots; ++i) {
>  		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) &&  
> !prev[i]->dst.rel) {
> +
> +			if (is_alu_64bit_inst(bc, prev[i])) {
> +				gpr[i] = -1;
> +				continue;
> +			}
> +
>  			gpr[i] = prev[i]->dst.sel;
>  			/* cube writes more than PV.X */
>  			if (is_alu_reduction_inst(bc, prev[i]))
> @@ -591,6 +603,8 @@ static int replace_gpr_with_pv_ps(struct  
> r600_bytecode *bc,
>  		if(!alu)
>  			continue;
> +		if (is_alu_64bit_inst(bc, alu))
> +			continue;
>  		num_src = r600_bytecode_get_num_operands(bc, alu);
>  		for (src = 0; src < num_src; ++src) {
>  			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
> diff --git a/src/gallium/drivers/r600/r600_asm.h  
> b/src/gallium/drivers/r600/r600_asm.h
> index e37d926..7b2734c 100644
> --- a/src/gallium/drivers/r600/r600_asm.h
> +++ b/src/gallium/drivers/r600/r600_asm.h
> @@ -279,4 +279,19 @@ void eg_bytecode_export_read(struct r600_bytecode  
> *bc,
> void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
>  			   unsigned *num_format, unsigned *format_comp, unsigned *endian);
> +
> +static INLINE int fp64_switch(int i)
> +{

Rather hard to decipher what this function does. How about  
fp64_gpr_channel_swizzle?

> +	switch (i) {
> +	case 0:
> +		return 1;
> +	case 1:
> +		return 0;
> +	case 2:
> +		return 3;
> +	case 3:
> +		return 2;
> +	}
> +	return 0;
> +}
>  #endif
> diff --git a/src/gallium/drivers/r600/r600_isa.h  
> b/src/gallium/drivers/r600/r600_isa.h
> index ec3f702..3cc135e 100644
> --- a/src/gallium/drivers/r600/r600_isa.h
> +++ b/src/gallium/drivers/r600/r600_isa.h
> @@ -339,11 +339,11 @@ static const struct alu_op_info alu_op_table[] = {

Might be an idea to fix up the table entries for MULADD for R6xx/R7xx;  
they are 4-slot too.

FREXP_64 is a 4 slot instruction, not 2.

>  		{"PRED_SETGT_64",             2, { 0x7C, 0xC7 },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_PRED | AF_CC_GT | AF_64 },
>  		{"PRED_SETE_64",              2, { 0x7D, 0xC8 },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_PRED | AF_CC_E | AF_64 },
>  		{"PRED_SETGE_64",             2, { 0x7E, 0xC9 },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_PRED | AF_CC_GE | AF_64 },
> -		{"MUL_64",                    2, { 0x1B, 0xCA },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_64 },
> +		{"MUL_64",                    2, { 0x1B, 0xCA },{   AF_V,  AF_V,

This is a 4-slot instruction on r600/r700/evergreen as well; might as well  
fix the table entries while touching this.

> AF_V,  AF_4V}, AF_64 },
>  		{"ADD_64",                    2, { 0x17, 0xCB },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_64 },
>  		{"MOVA_INT",                  1, { 0x18, 0xCC },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_MOVA },
> -		{"FLT64_TO_FLT32",            1, { 0x1C, 0xCD },{   AF_V,  AF_V,   
> AF_V,  AF_V},  0 },
> -		{"FLT32_TO_FLT64",            1, { 0x1D, 0xCE },{   AF_V,  AF_V,   
> AF_V,  AF_V},  0 },
> +		{"FLT64_TO_FLT32",            1, { 0x1C, 0xCD },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_64 },
> +		{"FLT32_TO_FLT64",            1, { 0x1D, 0xCE },{   AF_V,  AF_V,   
> AF_V,  AF_V},  AF_64 },
>  		{"SAD_ACCUM_PREV_UINT",       2, {   -1, 0xCF },{      0,     0,   
> AF_V,  AF_V},  AF_UINT_DST | AF_PREV_NEXT },
>  		{"DOT",                       2, {   -1, 0xD0 },{      0,     0,   
> AF_V,  AF_V},  AF_PREV_NEXT },
>  		{"MUL_PREV",                  1, {   -1, 0xD1 },{      0,     0,   
> AF_V,  AF_V},  AF_PREV_INTERLEAVE },
> @@ -369,7 +369,7 @@ static const struct alu_op_info alu_op_table[] = {
>  		{"FMA",                       3, {   -1, 0x07 },{      0,     0,   
> AF_V,  AF_V},  0 },
>  		{"MULADD_INT24",              3, {   -1, 0x08 },{      0,     0,      
> 0,  AF_V},  AF_INT_DST | AF_24 },
>  		{"CNDNE_64",                  3, {   -1, 0x09 },{      0,     0,   
> AF_V,  AF_V},  AF_CMOV | AF_64 },
> -		{"FMA_64",                    3, {   -1, 0x0A },{      0,     0,   
> AF_V,  AF_V},  AF_64 },
> +		{"FMA_64",                    3, {   -1, 0x0A },{      0,     0,   
> AF_V,  AF_4V}, AF_64 },

4 slot also on evergreen

>  		{"LERP_UINT",                 3, {   -1, 0x0B },{      0,     0,   
> AF_V,  AF_V},  AF_UINT_DST },
>  		{"BIT_ALIGN_INT",             3, {   -1, 0x0C },{      0,     0,   
> AF_V,  AF_V},  AF_INT_DST },
>  		{"BYTE_ALIGN_INT",            3, {   -1, 0x0D },{      0,     0,   
> AF_V,  AF_V},  AF_INT_DST },
> diff --git a/src/gallium/drivers/r600/r600_pipe.c  
> b/src/gallium/drivers/r600/r600_pipe.c
> index a4b7b66..9d9f1d4 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -488,6 +488,8 @@ static int r600_get_shader_param(struct pipe_screen*  
> pscreen, unsigned shader, e
>  			return PIPE_SHADER_IR_TGSI;
>  		}
>  	case PIPE_SHADER_CAP_DOUBLES:
> +		if (rscreen->b.family == CHIP_CAYMAN)
> +			return 1;
>  		return 0;
>  	}
>  	return 0;
> diff --git a/src/gallium/drivers/r600/r600_shader.c  
> b/src/gallium/drivers/r600/r600_shader.c
> index 77c9909..34c4e14 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -47,7 +47,7 @@ MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
>  These 8xx t-slot only opcodes become vector ops, with all four
>  slots expecting the arguments on sources a and b. Result is
>  broadcast to all channels.
> -MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
> +MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64

Drop this hunk; it's 4-slot on prior chips too, though the documentation is  
a bit contradictory in claiming it's a t-slot instruction, which it cannot  
possibly be given the 4 inputs it takes...

>  These 8xx t-slot only opcodes become vector ops in the z, y, and
>  x slots.
>  EXP_IEEE, LOG_IEEE/CLAMPED,  
> RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
> @@ -163,6 +163,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
>  	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
>  	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array  
> indexing) as it doesn't handle those currently */
>  	use_sb &= !shader->shader.uses_index_registers;
> +	/* disable SB for shaders using doubles */

Note in the comment that it's a scheduling issue.

> +	use_sb &= !shader->shader.uses_doubles;
> 	/* Check if the bytecode has already been built.  When using the llvm
>  	 * backend, r600_shader_from_tgsi() will take care of building the
> @@ -339,7 +341,7 @@ static int tgsi_is_supported(struct r600_shader_ctx  
> *ctx)
>  	struct tgsi_full_instruction *i =  
> &ctx->parse.FullToken.FullInstruction;
>  	int j;
> -	if (i->Instruction.NumDstRegs > 1) {
> +	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode !=  
> TGSI_OPCODE_DFRACEXP) {
>  		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
>  		return -EINVAL;
>  	}
> @@ -1827,6 +1829,9 @@ static int r600_shader_from_tgsi(struct  
> r600_context *rctx,
>  	ctx.tokens = tokens;
>  	tgsi_scan_shader(tokens, &ctx.info);
>  	shader->indirect_files = ctx.info.indirect_files;
> +
> +	shader->uses_doubles = ctx.info.uses_doubles;
> +
>  	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
>  	tgsi_parse_init(&ctx.parse, tokens);
>  	ctx.type = ctx.parse.FullHeader.Processor.Processor;
> @@ -2608,6 +2613,168 @@ static int tgsi_last_instruction(unsigned  
> writemask)
>  	return lasti;
>  }
> +
> +
> +static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool  
> singledest, bool swap)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	unsigned write_mask = inst->Dst[0].Register.WriteMask;
> +	struct r600_bytecode_alu alu;
> +	int i, j, r, lasti = tgsi_last_instruction(write_mask);

dead assignment

> +	int use_tmp = 0;
> +
> +	if (singledest) {
> +		switch (write_mask) {
> +		case 0x1:
> +			write_mask = 0x3;
> +			break;
> +		case 0x2:
> +			use_tmp = 1;
> +			write_mask = 0x3;
> +			break;
> +		case 0x4:
> +			write_mask = 0xc;
> +			break;
> +		case 0x8:
> +			write_mask = 0xc;
> +			use_tmp = 3;
> +			break;
> +		}
> +	}
> +
> +	lasti = tgsi_last_instruction(write_mask);
> +	for (i = 0; i <= lasti; i++) {
> +
> +		if (!(write_mask & (1 << i)))
> +			continue;
> +
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +
> +		if (singledest) {
> +			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +			if (use_tmp) {
> +				alu.dst.sel = ctx->temp_reg;
> +				alu.dst.chan = i;
> +				alu.dst.write = 1;
> +			}
> +			if (i == 1 || i == 3)
> +				alu.dst.write = 0;
> +		} else
> +			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +
> +		alu.op = ctx->inst_info->op;
> +
> +		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_DABS) {
> +			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
> +		} else if (!swap) {
> +			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
> +				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
> +			}
> +		} else {
> +			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
> +			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
> +		}
> +
> +		/* handle some special cases */
> +		if (i == 1 || i == 3) {
> +			switch (ctx->inst_info->tgsi_opcode) {
> +			case TGSI_OPCODE_SUB:
> +				r600_bytecode_src_toggle_neg(&alu.src[1]);
> +				break;
> +			case TGSI_OPCODE_DABS:
> +				r600_bytecode_src_set_abs(&alu.src[0]);
> +				break;
> +			default:
> +				break;
> +			}
> +		}
> +		if (i == lasti) {
> +			alu.last = 1;
> +		}
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +
> +	if (use_tmp) {
> +		write_mask = inst->Dst[0].Register.WriteMask;
> +
> +		/* move result from temp to dst */
> +		for (i = 0; i <= lasti; i++) {
> +			if (!(write_mask & (1 << i)))
> +				continue;
> +
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP1_MOV;
> +			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +			alu.src[0].sel = ctx->temp_reg;
> +			alu.src[0].chan = use_tmp - 1;
> +			alu.last = (i == lasti);
> +
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int tgsi_op2_64(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	unsigned write_mask = inst->Dst[0].Register.WriteMask;
> +	/* confirm writemaskiing */

assiiert?

> +	if ((write_mask & 0x3) != 0x3 &&
> +	    (write_mask & 0xc) != 0xc) {
> +		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
> +		return -1;
> +	}
> +	return tgsi_op2_64_params(ctx, false, false);
> +}
> +
> +static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
> +{
> +	return tgsi_op2_64_params(ctx, true, false);
> +}
> +
> +static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)

shortage of string? tgsi_op2_64_single_dest_swap?

> +{
> +	return tgsi_op2_64_params(ctx, true, true);
> +}
> +
> +static int tgsi_op3_64(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	struct r600_bytecode_alu alu;
> +	int i, j, r;
> +	int lasti = 3;
> +	int tmp = r600_get_temp(ctx);
> +
> +	for (i = 0; i < lasti + 1; i++) {
> +
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ctx->inst_info->op;
> +		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
> +			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);

i == 3 might deserve a comment

> +		}
> +
> +		if (inst->Dst[0].Register.WriteMask & (1 << i))
> +			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +		else
> +			alu.dst.sel = tmp;
> +
> +		alu.dst.chan = i;
> +		alu.is_op3 = 1;
> +		if (i == lasti) {
> +			alu.last = 1;
> +		}
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +	return 0;
> +}
> +
>  static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int  
> trans_only)
>  {
>  	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> @@ -2724,6 +2891,152 @@ static int tgsi_ineg(struct r600_shader_ctx *ctx)
> }
> +static int tgsi_dneg(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	struct r600_bytecode_alu alu;
> +	int i, r;
> +	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +
> +	for (i = 0; i < lasti + 1; i++) {
> +
> +		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
> +			continue;
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ALU_OP1_MOV;
> +
> +		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
> +
> +		if (i == 1 || i == 3)
> +			r600_bytecode_src_toggle_neg(&alu.src[0]);
> +		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +
> +		if (i == lasti) {
> +			alu.last = 1;
> +		}
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +	return 0;
> +
> +}
> +
> +static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	struct r600_bytecode_alu alu;
> +	unsigned write_mask = inst->Dst[0].Register.WriteMask;
> +	int i, j, r;
> +	int firsti = write_mask == 0xc ? 2 : 0;
> +
> +	for (i = 0; i <= 3; i++) {
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ctx->inst_info->op;
> +
> +		alu.dst.sel = ctx->temp_reg;
> +		alu.dst.chan = i;
> +		alu.dst.write = 1;
> +		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
> +			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
> +		}
> +
> +		if (i == 3)
> +			alu.last = 1;
> +
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +
> +	/* MOV first two channels to writemask dst0 */
> +	for (i = 0; i <= 1; i++) {
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ALU_OP1_MOV;
> +		alu.src[0].chan = i + 2;
> +		alu.src[0].sel = ctx->temp_reg;
> +
> +		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
> +		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
> +		alu.last = 1;
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +
> +	for (i = 0; i <= 3; i++) {
> +		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
> +			/* MOV third channels to writemask dst1 */
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP1_MOV;
> +			alu.src[0].chan = 1;
> +			alu.src[0].sel = ctx->temp_reg;
> +
> +			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
> +			alu.last = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +			break;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	int i, r;
> +	struct r600_bytecode_alu alu;
> +	int last_slot = 3;
> +	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +	int t1 = ctx->temp_reg;
> +
> +	/* these have to write the result to X/Y by the looks of it */
> +	for (i = 0 ; i < last_slot; i++) {
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ctx->inst_info->op;
> +
> +		/* should only be one src regs */
> +		assert (inst->Instruction.NumSrcRegs == 1);
> +
> +		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
> +		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
> +
> +		/* RSQ should take the absolute value of src */
> +		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_DRSQ ||
> +		    ctx->inst_info->tgsi_opcode == TGSI_OPCODE_DSQRT) {
> +			r600_bytecode_src_set_abs(&alu.src[1]);
> +		}
> +		alu.dst.sel = t1;
> +		alu.dst.chan = i;
> +		alu.dst.write = (i == 0 || i == 1);
> +
> +		if (i == last_slot - 1)
> +			alu.last = 1;
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +
> +	for (i = 0 ; i <= lasti; i++) {
> +		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
> +			continue;
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ALU_OP1_MOV;
> +		alu.src[0].sel = t1;
> +		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
> +		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +		alu.dst.write = 1;
> +		if (i == lasti)
> +			alu.last = 1;
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +	return 0;
> +}
> +
>  static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
>  {
>  	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> @@ -2802,6 +3115,55 @@ static int cayman_mul_int_instr(struct  
> r600_shader_ctx *ctx)
>  	return 0;
>  }
> +
> +static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
> +{
> +	struct tgsi_full_instruction *inst =  
> &ctx->parse.FullToken.FullInstruction;
> +	int i, j, k, r;
> +	struct r600_bytecode_alu alu;
> +	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +	int t1 = ctx->temp_reg;
> +
> +	for (k = 0; k <= 2; k++) {
> +		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
> +			continue;
> +
> +		for (i = 0; i < 4; i++) {
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ctx->inst_info->op;
> +			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
> +				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0  
> : 1));;
> +			}
> +			alu.dst.sel = t1;
> +			alu.dst.chan = i;
> +			alu.dst.write = 1;
> +			if (i == 3)
> +				alu.last = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +		}
> +	}
> +
> +	for (i = 0; i <= lasti; i++) {
> +		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
> +			continue;
> +		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +		alu.op = ALU_OP1_MOV;
> +		alu.src[0].sel = t1;
> +		alu.src[0].chan = i;
> +		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +		alu.dst.write = 1;
> +		if (i == lasti)
> +			alu.last = 1;
> +		r = r600_bytecode_add_alu(ctx->bc, &alu);
> +		if (r)
> +			return r;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * r600 - trunc to -PI..PI range
>   * r700 - normalize by dividing by 2PI
> @@ -7869,5 +8231,28 @@ static struct r600_shader_tgsi_instruction  
> cm_shader_tgsi_instruction[] = {
>  	{TGSI_OPCODE_INTERP_CENTROID,	0, ALU_OP0_NOP, tgsi_interp_egcm},
>  	{TGSI_OPCODE_INTERP_SAMPLE,		0, ALU_OP0_NOP, tgsi_interp_egcm},
>  	{TGSI_OPCODE_INTF2DERP_OFFSET,		0, ALU_OP0_NOP, tgsi_interp_egcm},
> +	{TGSI_OPCODE_F2D,	0, ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
> +	{TGSI_OPCODE_D2F,	0, ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
> +	{TGSI_OPCODE_DABS,	0, ALU_OP1_MOV, tgsi_op2_64},
> +	{TGSI_OPCODE_DNEG,	0, ALU_OP2_ADD_64, tgsi_dneg},
> +	{TGSI_OPCODE_DADD,	0, ALU_OP2_ADD_64, tgsi_op2_64},
> +	{TGSI_OPCODE_DMUL,	0, ALU_OP2_MUL_64, cayman_mul_double_instr},
> +	{TGSI_OPCODE_DMAX,	0, ALU_OP2_MAX_64, tgsi_op2_64},
> +	{TGSI_OPCODE_DMIN,	0, ALU_OP2_MIN_64, tgsi_op2_64},
> +	{TGSI_OPCODE_DSLT,	0, ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
> +	{TGSI_OPCODE_DSGE,	0, ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
> +	{TGSI_OPCODE_DSEQ,	0, ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
> +	{TGSI_OPCODE_DSNE,	0, ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
> +	{TGSI_OPCODE_DRCP,	0, ALU_OP2_RECIP_64, cayman_emit_double_instr},
> +	{TGSI_OPCODE_DSQRT,	0, ALU_OP2_SQRT_64, cayman_emit_double_instr},
> +	{TGSI_OPCODE_DMAD,	0, ALU_OP3_FMA_64, tgsi_op3_64},
> +	{TGSI_OPCODE_DFRAC,	0, ALU_OP1_FRACT_64, tgsi_op2_64},
> +	{TGSI_OPCODE_DLDEXP,	0, ALU_OP2_LDEXP_64, tgsi_op2_64},
> +	{TGSI_OPCODE_DFRACEXP,	0, ALU_OP1_FREXP_64, tgsi_dfracexp},
> +	{TGSI_OPCODE_D2I,	0, ALU_OP0_NOP, tgsi_unsupported},
> +	{TGSI_OPCODE_I2D,	0, ALU_OP0_NOP, tgsi_unsupported},
> +	{TGSI_OPCODE_D2U,	0, ALU_OP0_NOP, tgsi_unsupported},
> +	{TGSI_OPCODE_U2D,	0, ALU_OP0_NOP, tgsi_unsupported},
> +	{TGSI_OPCODE_DRSQ,	0, ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
>  	{TGSI_OPCODE_LAST,	0, ALU_OP0_NOP, tgsi_unsupported},
>  };
> diff --git a/src/gallium/drivers/r600/r600_shader.h  
> b/src/gallium/drivers/r600/r600_shader.h
> index b2559e9..a10004c 100644
> --- a/src/gallium/drivers/r600/r600_shader.h
> +++ b/src/gallium/drivers/r600/r600_shader.h
> @@ -87,6 +87,8 @@ struct r600_shader {
>  	unsigned		vs_as_gs_a;
>  	unsigned                ps_prim_id_input;
>  	struct r600_shader_array * arrays;
> +
> +	boolean			uses_doubles;
>  };
> struct r600_shader_key {

With above nits fixed,
Reviewed-by: Glenn Kennard <glenn.kennard at gmail.com>