[Mesa-dev] [PATCH] r600/atomic: add cayman version of atomic save/restore from GDS (v2)

Nicolai Hähnle nhaehnle at gmail.com
Tue Dec 5 10:01:23 UTC 2017


On 05.12.2017 09:16, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> On Cayman we don't use the append/consume counters (fglrx doesn't)
> and they don't seem to work well with compute shaders.
> 
> This just uses GDS instead to do the atomic operations.
> 
> v1.1: remove unused line.
> v2: use EOS on cayman, it appears to work.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>

I can't say much about the assembly, but the CP packets look reasonable 
to me.

Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com>


> ---
>   src/gallium/drivers/r600/evergreen_state.c | 57 +++++++++++++++++-
>   src/gallium/drivers/r600/r600_shader.c     | 93 +++++++++++++++++++++++-------
>   2 files changed, 126 insertions(+), 24 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
> index 6bca35e850f..a1d2e0cd14b 100644
> --- a/src/gallium/drivers/r600/evergreen_state.c
> +++ b/src/gallium/drivers/r600/evergreen_state.c
> @@ -2672,6 +2672,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
>   	r600_store_value(cb, 0x76543210); /* CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0 */
>   	r600_store_value(cb, 0xfedcba98); /* CM_R_028BD8_PA_SC_CENTROID_PRIORITY_1 */
>   
> +	r600_store_context_reg(cb, R_028724_GDS_ADDR_SIZE, 0x3fff);
>   	r600_store_context_reg_seq(cb, R_0288E8_SQ_LDS_ALLOC, 2);
>   	r600_store_value(cb, 0); /* R_0288E8_SQ_LDS_ALLOC */
>   	r600_store_value(cb, 0); /* R_0288EC_SQ_LDS_ALLOC_PS */
> @@ -4627,6 +4628,51 @@ static void evergreen_emit_event_write_eos(struct r600_context *rctx,
>   	radeon_emit(cs, reloc);
>   }
>   
> +/*
> + * Cayman variant of the EVENT_WRITE_EOS emit, used by
> + * evergreen_emit_atomic_buffer_save() when chip_class is CAYMAN.
> + *
> + * Emits one PKT3_EVENT_WRITE_EOS packet (PS_DONE event) whose address is
> + * the counter's dword slot inside the backing buffer
> + * (gpu_address + atomic->start * 4) and whose last payload dword carries
> + * atomic->hw_idx, followed by the NOP/reloc pair for the buffer.
> + *
> + * NOTE(review): the (1 << 29) and (1 << 16) fields are raw packet bits;
> + * presumably "store from GDS" and "one dword" - confirm against the PM4 spec.
> + */
> +static void cayman_emit_event_write_eos(struct r600_context *rctx,
> +					struct r600_shader_atomic *atomic,
> +					struct r600_resource *resource,
> +					uint32_t pkt_flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> +	uint32_t reloc;
> +	uint64_t va;
> +	uint32_t va_lo, va_hi;
> +
> +	/* Register the buffer as a write target before emitting anything. */
> +	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
> +					  RADEON_USAGE_WRITE,
> +					  RADEON_PRIO_SHADER_RW_BUFFER);
> +
> +	va = resource->gpu_address + (atomic->start * 4);
> +	va_lo = va & 0xffffffff;
> +	va_hi = (va >> 32) & 0xff;
> +
> +	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
> +	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PS_DONE) | EVENT_INDEX(6));
> +	radeon_emit(cs, va_lo);
> +	radeon_emit(cs, (1 << 29) | va_hi);
> +	radeon_emit(cs, (atomic->hw_idx) | (1 << 16));
> +
> +	/* Relocation for the destination buffer. */
> +	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> +	radeon_emit(cs, reloc);
> +}
> +
> +/*
> + * Writes a counter value from a buffer into GDS.
> + *
> + * Used by evergreen_emit_atomic_buffer_setup() when chip_class is CAYMAN.
> + * Emits one PKT3_CP_DMA packet that reads from the counter's dword slot
> + * in the buffer (gpu_address + atomic->start * 4) and targets GDS at
> + * byte offset atomic->hw_idx * 4, followed by the NOP/reloc pair.
> + */
> +static void cayman_write_count_to_gds(struct r600_context *rctx,
> +				      struct r600_shader_atomic *atomic,
> +				      struct r600_resource *resource,
> +				      uint32_t pkt_flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> +	unsigned reloc;
> +	uint64_t src_va;
> +
> +	/* The buffer is only read here; register it before emitting. */
> +	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
> +					  RADEON_USAGE_READ,
> +					  RADEON_PRIO_SHADER_RW_BUFFER);
> +
> +	/* Address of the counter inside the buffer (source of the copy). */
> +	src_va = resource->gpu_address + (atomic->start * 4);
> +
> +	radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
> +	radeon_emit(cs, src_va & 0xffffffff);
> +	/* DST_SEL(1) selects GDS as the destination. */
> +	radeon_emit(cs, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_DST_SEL(1) |
> +			((src_va >> 32) & 0xff));
> +	radeon_emit(cs, atomic->hw_idx * 4);
> +	radeon_emit(cs, 0);
> +	/* NOTE(review): the low bits (4) are presumably the byte count - confirm. */
> +	radeon_emit(cs, PKT3_CP_DMA_CMD_DAS | 4);
> +
> +	/* Relocation for the source buffer. */
> +	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> +	radeon_emit(cs, reloc);
> +}
> +
>   bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
>   					struct r600_shader_atomic *combined_atomics,
>   					uint8_t *atomic_used_mask_p)
> @@ -4674,7 +4720,10 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
>   		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
>   		assert(resource);
>   
> -		evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
> +		if (rctx->b.chip_class == CAYMAN)
> +			cayman_write_count_to_gds(rctx, atomic, resource, pkt_flags);
> +		else
> +			evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
>   	}
>   	*atomic_used_mask_p = atomic_used_mask;
>   	return true;
> @@ -4702,8 +4751,12 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
>   		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
>   		assert(resource);
>   
> -		evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
> +		if (rctx->b.chip_class == CAYMAN)
> +			cayman_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
> +		else
> +			evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
>   	}
> +
>   	++rctx->append_fence_id;
>   	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
>   					  r600_resource(rctx->append_fence),
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 5d78e4f8ade..da74de04de3 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -7809,6 +7809,53 @@ static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
>   	return -1;
>   }
>   
> +/*
> + * Resolve the HW atomic counter referenced by Src[0] of the current
> + * instruction and prepare the GDS addressing for it.
> + *
> + * On Cayman the byte address is materialised in ctx->temp_reg.x:
> + *   - direct access:   temp.x = uav_id * 4
> + *   - indirect access: temp.x = (index_reg << 2) + uav_id * 4
> + * On other chips no ALU code is emitted; indirect access is signalled
> + * by returning uav_index_mode = 2.
> + *
> + * @param ctx               shader translation context
> + * @param uav_id_p          out: result of find_hw_atomic_counter()
> + * @param uav_index_mode_p  out: 2 for indirect access on non-Cayman,
> + *                          0 otherwise
> + * @return 0 on success, or the non-zero error from the bytecode helpers
> + *         (the out parameters are left untouched in that case).
> + *
> + * NOTE(review): find_hw_atomic_counter() can return -1; that value is
> + * passed through unchecked, as it was before this helper existed.
> + */
> +static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
> +			     int *uav_id_p, int *uav_index_mode_p)
> +{
> +	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> +	int uav_id;
> +	/* Must start at 0: only the indirect non-Cayman path assigns it,
> +	 * but it is stored to *uav_index_mode_p on every successful return.
> +	 */
> +	int uav_index_mode = 0;
> +	int r;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
> +
> +	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> +	if (inst->Src[0].Register.Indirect) {
> +		if (is_cm) {
> +			/* temp.x = index_reg << 2 (dword index -> byte offset) */
> +			struct r600_bytecode_alu alu;
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP2_LSHL_INT;
> +			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
> +			alu.src[0].chan = 0;
> +			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +			alu.src[1].value = 2;
> +			alu.dst.sel = ctx->temp_reg;
> +			alu.dst.chan = 0;
> +			alu.dst.write = 1;
> +			alu.last = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +
> +			/* temp.x += uav_id * 4 (base byte offset of the counter) */
> +			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
> +					   ctx->temp_reg, 0,
> +					   ctx->temp_reg, 0,
> +					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
> +			if (r)
> +				return r;
> +		} else {
> +			uav_index_mode = 2;
> +		}
> +	} else if (is_cm) {
> +		/* temp.x = uav_id * 4 */
> +		r = single_alu_op2(ctx, ALU_OP1_MOV,
> +				   ctx->temp_reg, 0,
> +				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
> +				   0, 0);
> +		if (r)
> +			return r;
> +	}
> +	*uav_id_p = uav_id;
> +	*uav_index_mode_p = uav_index_mode;
> +	return 0;
> +}
>   
>   static int tgsi_load_gds(struct r600_shader_ctx *ctx)
>   {
> @@ -7817,27 +7864,27 @@ static int tgsi_load_gds(struct r600_shader_ctx *ctx)
>   	struct r600_bytecode_gds gds;
>   	int uav_id = 0;
>   	int uav_index_mode = 0;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
>   
> -	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> -
> -	if (inst->Src[0].Register.Indirect)
> -		uav_index_mode = 2;
> +	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
> +	if (r)
> +		return r;
>   
>   	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
>   	gds.op = FETCH_OP_GDS_READ_RET;
>   	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> -	gds.uav_id = uav_id;
> -	gds.uav_index_mode = uav_index_mode;
> +	gds.uav_id = is_cm ? 0 : uav_id;
> +	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
>   	gds.src_gpr = ctx->temp_reg;
> -	gds.src_sel_x = 4;
> +	gds.src_sel_x = (is_cm) ? 0 : 4;
>   	gds.src_sel_y = 4;
>   	gds.src_sel_z = 4;
>   	gds.dst_sel_x = 0;
>   	gds.dst_sel_y = 7;
>   	gds.dst_sel_z = 7;
>   	gds.dst_sel_w = 7;
> -	gds.src_gpr2 = ctx->temp_reg;
> -	gds.alloc_consume = 1;
> +	gds.src_gpr2 = 0;
> +	gds.alloc_consume = !is_cm;
>   	r = r600_bytecode_add_gds(ctx->bc, &gds);
>   	if (r)
>   		return r;
> @@ -8369,16 +8416,16 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   	int r;
>   	int uav_id = 0;
>   	int uav_index_mode = 0;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
>   
>   	if (gds_op == -1) {
>   		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
>   		return -1;
>   	}
>   
> -	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> -
> -	if (inst->Src[0].Register.Indirect)
> -		uav_index_mode = 2;
> +	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
> +	if (r)
> +		return r;
>   
>   	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
>   		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
> @@ -8388,7 +8435,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>   		alu.op = ALU_OP1_MOV;
>   		alu.dst.sel = ctx->temp_reg;
> -		alu.dst.chan = 0;
> +		alu.dst.chan = is_cm ? 1 : 0;
>   		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
>   		alu.src[0].value = abs_value;
>   		alu.last = 1;
> @@ -8400,7 +8447,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>   		alu.op = ALU_OP1_MOV;
>   		alu.dst.sel = ctx->temp_reg;
> -		alu.dst.chan = 0;
> +		alu.dst.chan = is_cm ? 1 : 0;
>   		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
>   		alu.last = 1;
>   		alu.dst.write = 1;
> @@ -8409,21 +8456,23 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   			return r;
>   	}
>   
> +
>   	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
>   	gds.op = gds_op;
>   	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> -	gds.uav_id = uav_id;
> -	gds.uav_index_mode = uav_index_mode;
> +	gds.uav_id = is_cm ? 0 : uav_id;
> +	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
>   	gds.src_gpr = ctx->temp_reg;
> -	gds.src_gpr2 = ctx->temp_reg;
> -	gds.src_sel_x = 4;
> -	gds.src_sel_y = 0;
> -	gds.src_sel_z = 4;
> +	gds.src_gpr2 = 0;
> +	gds.src_sel_x = is_cm ? 0 : 4;
> +	gds.src_sel_y = is_cm ? 1 : 0;
> +	gds.src_sel_z = 7;
>   	gds.dst_sel_x = 0;
>   	gds.dst_sel_y = 7;
>   	gds.dst_sel_z = 7;
>   	gds.dst_sel_w = 7;
> -	gds.alloc_consume = 1;
> +	gds.alloc_consume = !is_cm;
> +
>   	r = r600_bytecode_add_gds(ctx->bc, &gds);
>   	if (r)
>   		return r;
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list