[Mesa-dev] [PATCH 2/2] r600/atomic: add cayman version of atomic save/restore from GDS

Dave Airlie airlied at gmail.com
Fri Dec 1 05:06:45 UTC 2017


From: Dave Airlie <airlied at redhat.com>

On Cayman we don't use the append/consume counters (fglrx doesn't)
and they don't seem to work well with compute shaders.

This just uses GDS instead to do the atomic operations.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/gallium/drivers/r600/evergreen_state.c | 60 +++++++++++++++++++-
 src/gallium/drivers/r600/r600_shader.c     | 91 +++++++++++++++++++++++-------
 2 files changed, 129 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 850165b30b..c44ed27b2c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2659,6 +2659,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, 0x76543210); /* CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0 */
 	r600_store_value(cb, 0xfedcba98); /* CM_R_028BD8_PA_SC_CENTROID_PRIORITY_1 */
 
+	r600_store_context_reg(cb, R_028724_GDS_ADDR_SIZE, 0x3fff);
 	r600_store_context_reg_seq(cb, R_0288E8_SQ_LDS_ALLOC, 2);
 	r600_store_value(cb, 0); /* R_0288E8_SQ_LDS_ALLOC */
 	r600_store_value(cb, 0); /* R_0288EC_SQ_LDS_ALLOC_PS */
@@ -4502,6 +4503,51 @@ static void evergreen_emit_event_write_eos(struct r600_context *rctx,
 	radeon_emit(cs, reloc);
 }
 
+/* Emit a CP_DMA packet that copies a counter value from the buffer into GDS. */
+static void cayman_write_count_to_gds(struct r600_context *rctx,
+				      struct r600_shader_atomic *atomic,
+				      struct r600_resource *resource,
+				      uint32_t pkt_flags)
+{
+	struct radeon_winsys_cs *cmds = rctx->b.gfx.cs;
+	unsigned reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						       resource,
+						       RADEON_USAGE_READ,
+						       RADEON_PRIO_SHADER_RW_BUFFER);
+	uint64_t src_va = resource->gpu_address + (atomic->start * 4);
+
+	radeon_emit(cmds, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
+	radeon_emit(cmds, src_va & 0xffffffff); /* buffer address, low 32 bits */
+	radeon_emit(cmds, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_DST_SEL(1) | ((src_va >> 32) & 0xff)); /* DST_SEL(1): GDS */
+	radeon_emit(cmds, atomic->hw_idx * 4); /* 4 bytes per hw counter index */
+	radeon_emit(cmds, 0);
+	radeon_emit(cmds, PKT3_CP_DMA_CMD_DAS | 4); /* NOTE(review): 4 is presumably the byte count - confirm */
+	radeon_emit(cmds, PKT3(PKT3_NOP, 0, 0)); /* NOP packet carrying the buffer relocation */
+	radeon_emit(cmds, reloc_idx);
+}
+
+/* Emit a CP_DMA packet that copies a counter value from GDS back into the buffer. */
+static void cayman_read_count_from_gds(struct r600_context *rctx,
+				struct r600_shader_atomic *atomic,
+				struct r600_resource *resource, uint32_t pkt_flags)
+{
+	struct radeon_winsys_cs *cmds = rctx->b.gfx.cs;
+	unsigned reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						       resource,
+						       RADEON_USAGE_WRITE,
+						       RADEON_PRIO_SHADER_RW_BUFFER);
+	uint64_t dst_va = resource->gpu_address + (atomic->start * 4);
+
+	radeon_emit(cmds, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
+	radeon_emit(cmds, atomic->hw_idx * 4); /* 4 bytes per hw counter index */
+	radeon_emit(cmds, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_SRC_SEL(1)); /* SRC_SEL(1): GDS */
+	radeon_emit(cmds, dst_va & 0xffffffff); /* buffer address, low 32 bits */
+	radeon_emit(cmds, (dst_va >> 32) & 0xff); /* buffer address, bits 32..39 */
+	radeon_emit(cmds, PKT3_CP_DMA_CMD_SAS | 4); /* NOTE(review): 4 is presumably the byte count - confirm */
+	radeon_emit(cmds, PKT3(PKT3_NOP, 0, 0)); /* NOP packet carrying the buffer relocation */
+	radeon_emit(cmds, reloc_idx);
+}
+
 bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
 					struct r600_shader_atomic *combined_atomics,
 					uint8_t *atomic_used_mask_p)
@@ -4549,7 +4595,10 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
 		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
 		assert(resource);
 
-		evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
+		if (rctx->b.chip_class == CAYMAN)
+			cayman_write_count_to_gds(rctx, atomic, resource, pkt_flags);
+		else
+			evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
 	}
 	*atomic_used_mask_p = atomic_used_mask;
 	return true;
@@ -4577,8 +4626,15 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
 		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
 		assert(resource);
 
-		evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
+		if (rctx->b.chip_class == CAYMAN)
+			cayman_read_count_from_gds(rctx, atomic, resource, pkt_flags);
+		else
+			evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags);
 	}
+
+	if (rctx->b.chip_class == CAYMAN)
+		return;
+
 	++rctx->append_fence_id;
 	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 					  r600_resource(rctx->append_fence),
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index ae8326fdd1..cb14389582 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -7807,6 +7807,53 @@ static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
 	return -1;
 }
 
+/* Look up the hw atomic counter for Src[0]; on Cayman also build its address in temp_reg.x. */
+static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
+			     int *uav_id_p, int *uav_index_mode_p)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int uav_id, uav_index_mode = 0; /* must default to 0: only the non-Cayman indirect path sets it */
+	int r;
+	bool is_cm = (ctx->bc->chip_class == CAYMAN);
+
+	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
+	if (inst->Src[0].Register.Indirect) {
+		if (is_cm) {
+			struct r600_bytecode_alu alu;
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP2_LSHL_INT; /* temp_reg.x = indirect index << 2 */
+			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
+			alu.src[0].chan = 0;
+			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+			alu.src[1].value = 2;
+			alu.dst.sel = ctx->temp_reg;
+			alu.dst.chan = 0;
+			alu.dst.write = 1;
+			alu.last = 1;
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+			/* then add the counter's static offset: temp_reg.x += uav_id * 4 */
+			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+					   ctx->temp_reg, 0,
+					   ctx->temp_reg, 0,
+					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
+			if (r)
+				return r;
+		} else
+			uav_index_mode = 2;
+	} else if (is_cm) { /* direct access on Cayman: temp_reg.x = uav_id * 4 */
+		r = single_alu_op2(ctx, ALU_OP1_MOV,
+				   ctx->temp_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
+				   0, 0);
+		if (r)
+			return r;
+	}
+	*uav_id_p = uav_id;
+	*uav_index_mode_p = uav_index_mode;
+	return 0;
+}
 
 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
 {
@@ -7815,27 +7862,29 @@ static int tgsi_load_gds(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_gds gds;
 	int uav_id = 0;
 	int uav_index_mode = 0;
+	bool is_cm = (ctx->bc->chip_class == CAYMAN);
 
 	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
 
-	if (inst->Src[0].Register.Indirect)
-		uav_index_mode = 2;
+	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
+	if (r)
+		return r;
 
 	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
 	gds.op = FETCH_OP_GDS_READ_RET;
 	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
-	gds.uav_id = uav_id;
-	gds.uav_index_mode = uav_index_mode;
+	gds.uav_id = is_cm ? 0 : uav_id;
+	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
 	gds.src_gpr = ctx->temp_reg;
-	gds.src_sel_x = 4;
+	gds.src_sel_x = (is_cm) ? 0 : 4;
 	gds.src_sel_y = 4;
 	gds.src_sel_z = 4;
 	gds.dst_sel_x = 0;
 	gds.dst_sel_y = 7;
 	gds.dst_sel_z = 7;
 	gds.dst_sel_w = 7;
-	gds.src_gpr2 = ctx->temp_reg;
-	gds.alloc_consume = 1;
+	gds.src_gpr2 = 0;
+	gds.alloc_consume = !is_cm;
 	r = r600_bytecode_add_gds(ctx->bc, &gds);
 	if (r)
 		return r;
@@ -8195,16 +8244,16 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
 	int r;
 	int uav_id = 0;
 	int uav_index_mode = 0;
+	bool is_cm = (ctx->bc->chip_class == CAYMAN);
 
 	if (gds_op == -1) {
 		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
 		return -1;
 	}
 
-	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
-
-	if (inst->Src[0].Register.Indirect)
-		uav_index_mode = 2;
+	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
+	if (r)
+		return r;
 
 	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
 		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
@@ -8214,7 +8263,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP1_MOV;
 		alu.dst.sel = ctx->temp_reg;
-		alu.dst.chan = 0;
+		alu.dst.chan = is_cm ? 1 : 0;
 		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
 		alu.src[0].value = abs_value;
 		alu.last = 1;
@@ -8226,7 +8275,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP1_MOV;
 		alu.dst.sel = ctx->temp_reg;
-		alu.dst.chan = 0;
+		alu.dst.chan = is_cm ? 1 : 0;
 		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
 		alu.last = 1;
 		alu.dst.write = 1;
@@ -8235,21 +8284,23 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
 			return r;
 	}
 
+
 	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
 	gds.op = gds_op;
 	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
-	gds.uav_id = uav_id;
-	gds.uav_index_mode = uav_index_mode;
+	gds.uav_id = is_cm ? 0 : uav_id;
+	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
 	gds.src_gpr = ctx->temp_reg;
-	gds.src_gpr2 = ctx->temp_reg;
-	gds.src_sel_x = 4;
-	gds.src_sel_y = 0;
-	gds.src_sel_z = 4;
+	gds.src_gpr2 = 0;
+	gds.src_sel_x = is_cm ? 0 : 4;
+	gds.src_sel_y = is_cm ? 1 : 0;
+	gds.src_sel_z = 7;
 	gds.dst_sel_x = 0;
 	gds.dst_sel_y = 7;
 	gds.dst_sel_z = 7;
 	gds.dst_sel_w = 7;
-	gds.alloc_consume = 1;
+	gds.alloc_consume = !is_cm;
+
 	r = r600_bytecode_add_gds(ctx->bc, &gds);
 	if (r)
 		return r;
-- 
2.14.3



More information about the mesa-dev mailing list