[Mesa-dev] [PATCH 2/2] r600/atomic: add cayman version of atomic save/restore from GDS

Nicolai Hähnle nhaehnle at gmail.com
Fri Dec 1 10:49:50 UTC 2017


On 01.12.2017 06:06, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> On Cayman we don't use the append/consume counters (fglrx doesn't)
> and they don't seem to work well with compute shaders.
> 
> This just uses GDS instead to do the atomic operations.

Interesting. This is kind of what I'd have expected to be used from the
beginning, at least for GCN.

Don't you still need to use an EOS (end-of-shader) event for proper
synchronization? I mean, I guess you looked at fglrx traces, but still...
CP_DMA definitely doesn't wait for shaders on newer hardware, and I don't
know why it would do that on older hardware.

FWIW, I don't have the packet specification for pre-GCN hardware here, 
but on GCN it should be:

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
	radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
	radeon_emit(cs, (dst_offset) & 0xffffffff);
	radeon_emit(cs, (1 << 29) | ((dst_offset >> 32) & 0xffff));
	radeon_emit(cs, (gds_index & 0xffff) | (num_dwords << 16));

to copy GDS data to memory at EOS.

Cheers,
Nicolai

> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>   src/gallium/drivers/r600/evergreen_state.c | 60 +++++++++++++++++++-
>   src/gallium/drivers/r600/r600_shader.c     | 91 +++++++++++++++++++++++-------
>   2 files changed, 129 insertions(+), 22 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
> index 850165b30b..c44ed27b2c 100644
> --- a/src/gallium/drivers/r600/evergreen_state.c
> +++ b/src/gallium/drivers/r600/evergreen_state.c
> @@ -2659,6 +2659,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
>   	r600_store_value(cb, 0x76543210); /* CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0 */
>   	r600_store_value(cb, 0xfedcba98); /* CM_R_028BD8_PA_SC_CENTROID_PRIORITY_1 */
>   
> +	r600_store_context_reg(cb, R_028724_GDS_ADDR_SIZE, 0x3fff);
>   	r600_store_context_reg_seq(cb, R_0288E8_SQ_LDS_ALLOC, 2);
>   	r600_store_value(cb, 0); /* R_0288E8_SQ_LDS_ALLOC */
>   	r600_store_value(cb, 0); /* R_0288EC_SQ_LDS_ALLOC_PS */
> @@ -4502,6 +4503,51 @@ static void evergreen_emit_event_write_eos(struct r600_context *rctx,
>   	radeon_emit(cs, reloc);
>   }
>   
> +/* writes count from a buffer into GDS */
> +static void cayman_write_count_to_gds(struct r600_context *rctx,
> +				      struct r600_shader_atomic *atomic,
> +				      struct r600_resource *resource,
> +				      uint32_t pkt_flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> +	unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> +						   resource,
> +						   RADEON_USAGE_READ,
> +						   RADEON_PRIO_SHADER_RW_BUFFER);
> +	uint64_t dst_offset = resource->gpu_address + (atomic->start * 4);
> +
> +	radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
> +	radeon_emit(cs, dst_offset & 0xffffffff);
> +	radeon_emit(cs, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_DST_SEL(1) | ((dst_offset >> 32) & 0xff));// GDS
> +	radeon_emit(cs, atomic->hw_idx * 4);
> +	radeon_emit(cs, 0);
> +	radeon_emit(cs, PKT3_CP_DMA_CMD_DAS | 4);
> +	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> +	radeon_emit(cs, reloc);
> +}
> +
> +static void cayman_read_count_from_gds(struct r600_context *rctx,
> +				struct r600_shader_atomic *atomic,
> +				struct r600_resource *resource,
> +				uint32_t pkt_flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> +	unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> +						   resource,
> +						   RADEON_USAGE_WRITE,
> +						   RADEON_PRIO_SHADER_RW_BUFFER);
> +	uint64_t dst_offset = resource->gpu_address + (atomic->start * 4);
> +
> +	radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
> +	radeon_emit(cs, atomic->hw_idx * 4);
> +	radeon_emit(cs, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_SRC_SEL(1));// GDS
> +	radeon_emit(cs, dst_offset & 0xffffffff);
> +	radeon_emit(cs, (dst_offset >> 32) & 0xff);
> +	radeon_emit(cs, PKT3_CP_DMA_CMD_SAS | 4);
> +	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> +	radeon_emit(cs, reloc);
> +}
> +
>   bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
>   					struct r600_shader_atomic *combined_atomics,
>   					uint8_t *atomic_used_mask_p)
> @@ -4549,7 +4595,10 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
>   		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
>   		assert(resource);
>   
> -		evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
> +		if (rctx->b.chip_class == CAYMAN)
> +			cayman_write_count_to_gds(rctx, atomic, resource, pkt_flags);
> +		else
> +			evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
>   	}
>   	*atomic_used_mask_p = atomic_used_mask;
>   	return true;
> @@ -4577,8 +4626,15 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
>   		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
>   		assert(resource);
>   
> -		evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
> +		if (rctx->b.chip_class == CAYMAN)
> +			cayman_read_count_from_gds(rctx, atomic, resource, pkt_flags);
> +		else
> +			evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
>   	}
> +
> +	if (rctx->b.chip_class == CAYMAN)
> +		return;
> +
>   	++rctx->append_fence_id;
>   	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
>   					  r600_resource(rctx->append_fence),
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index ae8326fdd1..cb14389582 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -7807,6 +7807,53 @@ static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
>   	return -1;
>   }
>   
> +static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
> +			     int *uav_id_p, int *uav_index_mode_p)
> +{
> +	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> +	int uav_id, uav_index_mode;
> +	int r;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
> +
> +	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> +	if (inst->Src[0].Register.Indirect) {
> +		if (is_cm) {
> +			struct r600_bytecode_alu alu;
> +			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +			alu.op = ALU_OP2_LSHL_INT;
> +			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
> +			alu.src[0].chan = 0;
> +			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +			alu.src[1].value = 2;
> +			alu.dst.sel = ctx->temp_reg;
> +			alu.dst.chan = 0;
> +			alu.dst.write = 1;
> +			alu.last = 1;
> +			r = r600_bytecode_add_alu(ctx->bc, &alu);
> +			if (r)
> +				return r;
> +
> +			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
> +					   ctx->temp_reg, 0,
> +					   ctx->temp_reg, 0,
> +					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
> +			if (r)
> +				return r;
> +		} else
> +			uav_index_mode = 2;
> +	} else if (is_cm) {
> +		r = single_alu_op2(ctx, ALU_OP1_MOV,
> +				   ctx->temp_reg, 0,
> +				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
> +				   0, 0);
> +		if (r)
> +			return r;
> +	}
> +	*uav_id_p = uav_id;
> +	*uav_index_mode_p = uav_index_mode;
> +	return 0;
> +}
>   
>   static int tgsi_load_gds(struct r600_shader_ctx *ctx)
>   {
> @@ -7815,27 +7862,29 @@ static int tgsi_load_gds(struct r600_shader_ctx *ctx)
>   	struct r600_bytecode_gds gds;
>   	int uav_id = 0;
>   	int uav_index_mode = 0;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
>   
>   	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
>   
> -	if (inst->Src[0].Register.Indirect)
> -		uav_index_mode = 2;
> +	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
> +	if (r)
> +		return r;
>   
>   	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
>   	gds.op = FETCH_OP_GDS_READ_RET;
>   	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> -	gds.uav_id = uav_id;
> -	gds.uav_index_mode = uav_index_mode;
> +	gds.uav_id = is_cm ? 0 : uav_id;
> +	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
>   	gds.src_gpr = ctx->temp_reg;
> -	gds.src_sel_x = 4;
> +	gds.src_sel_x = (is_cm) ? 0 : 4;
>   	gds.src_sel_y = 4;
>   	gds.src_sel_z = 4;
>   	gds.dst_sel_x = 0;
>   	gds.dst_sel_y = 7;
>   	gds.dst_sel_z = 7;
>   	gds.dst_sel_w = 7;
> -	gds.src_gpr2 = ctx->temp_reg;
> -	gds.alloc_consume = 1;
> +	gds.src_gpr2 = 0;
> +	gds.alloc_consume = !is_cm;
>   	r = r600_bytecode_add_gds(ctx->bc, &gds);
>   	if (r)
>   		return r;
> @@ -8195,16 +8244,16 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   	int r;
>   	int uav_id = 0;
>   	int uav_index_mode = 0;
> +	bool is_cm = (ctx->bc->chip_class == CAYMAN);
>   
>   	if (gds_op == -1) {
>   		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
>   		return -1;
>   	}
>   
> -	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> -
> -	if (inst->Src[0].Register.Indirect)
> -		uav_index_mode = 2;
> +	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
> +	if (r)
> +		return r;
>   
>   	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
>   		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
> @@ -8214,7 +8263,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>   		alu.op = ALU_OP1_MOV;
>   		alu.dst.sel = ctx->temp_reg;
> -		alu.dst.chan = 0;
> +		alu.dst.chan = is_cm ? 1 : 0;
>   		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
>   		alu.src[0].value = abs_value;
>   		alu.last = 1;
> @@ -8226,7 +8275,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>   		alu.op = ALU_OP1_MOV;
>   		alu.dst.sel = ctx->temp_reg;
> -		alu.dst.chan = 0;
> +		alu.dst.chan = is_cm ? 1 : 0;
>   		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
>   		alu.last = 1;
>   		alu.dst.write = 1;
> @@ -8235,21 +8284,23 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
>   			return r;
>   	}
>   
> +
>   	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
>   	gds.op = gds_op;
>   	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> -	gds.uav_id = uav_id;
> -	gds.uav_index_mode = uav_index_mode;
> +	gds.uav_id = is_cm ? 0 : uav_id;
> +	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
>   	gds.src_gpr = ctx->temp_reg;
> -	gds.src_gpr2 = ctx->temp_reg;
> -	gds.src_sel_x = 4;
> -	gds.src_sel_y = 0;
> -	gds.src_sel_z = 4;
> +	gds.src_gpr2 = 0;
> +	gds.src_sel_x = is_cm ? 0 : 4;
> +	gds.src_sel_y = is_cm ? 1 : 0;
> +	gds.src_sel_z = 7;
>   	gds.dst_sel_x = 0;
>   	gds.dst_sel_y = 7;
>   	gds.dst_sel_z = 7;
>   	gds.dst_sel_w = 7;
> -	gds.alloc_consume = 1;
> +	gds.alloc_consume = !is_cm;
> +
>   	r = r600_bytecode_add_gds(ctx->bc, &gds);
>   	if (r)
>   		return r;
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list