Mesa (staging/22.0): r600: Implement memoryBarrier() in the non-SFN path.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Apr 21 16:23:33 UTC 2022


Module: Mesa
Branch: staging/22.0
Commit: b3d7e39b538bd0edb49e4b6ab07be2f672d938ed
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b3d7e39b538bd0edb49e4b6ab07be2f672d938ed

Author: Emma Anholt <emma at anholt.net>
Date:   Fri Feb 11 15:11:59 2022 -0800

r600: Implement memoryBarrier() in the non-SFN path.

Previously we were just doing a group barrier for both membar and barrier.
This sometimes worked out, because atomics and reads waited for ack
already, but writes were not waiting for ack.  Use the need_wait_ack
pattern that scratch writes used, with a little refactoring for
reusability.

The refactor also incidentally fixes the atomics path, which previously
waited only when the number of outstanding acks was > 1 instead of > 0.

Cc: mesa-stable
Fixes: #6028
Reviewed-by: Gert Wollny <gert.wollny at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14429>
(cherry picked from commit b8324a7387d5282454e8635cde2d362c873b8a2d)

Conflicts:
	src/gallium/drivers/r600/ci/r600-turks-fails.txt

---

 .pick_status.json                      |  2 +-
 src/gallium/drivers/r600/eg_sq.h       |  2 ++
 src/gallium/drivers/r600/r600_asm.c    | 48 ++++++++++++++++++++++++-----
 src/gallium/drivers/r600/r600_asm.h    |  7 +++--
 src/gallium/drivers/r600/r600_shader.c | 55 +++++++++++++++++++++-------------
 src/gallium/drivers/r600/r600_sq.h     |  5 ++++
 6 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index e565f4f5006..a9b98156b36 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -7548,7 +7548,7 @@
         "description": "r600: Implement memoryBarrier() in the non-SFN path.",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 5,
+        "resolution": 1,
         "because_sha": null
     },
     {
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index c6280167a92..bdf9b330ee9 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -165,7 +165,9 @@
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE               0x00000000
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND           0x00000001
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ                0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK           0x00000002
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND            0x00000003
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK       0x00000003
 
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((unsigned)(x) & 0x7F) << 15)
 #define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 321d5e3b555..c0a29e3d2f4 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -240,14 +240,48 @@ int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
 	return 0;
 }
 
-void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack)
+void
+r600_bytecode_add_ack(struct r600_bytecode *bc)
 {
-	bc->need_wait_ack = need_wait_ack;
+	bc->need_wait_ack = true;
 }
 
-boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc)
+int
+r600_bytecode_wait_acks(struct r600_bytecode *bc)
 {
-	return bc->need_wait_ack;
+	/* Store acks are an R700+ feature. */
+	if (bc->chip_class < R700)
+		return 0;
+
+	if (!bc->need_wait_ack)
+		return 0;
+
+	int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
+	if (ret != 0)
+		return ret;
+
+	struct r600_bytecode_cf *cf = bc->cf_last;
+	cf->barrier = 1;
+	/* Request a wait if the number of outstanding acks is > 0 */
+	cf->cf_addr = 0;
+
+	return 0;
+}
+
+uint32_t
+r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect)
+{
+	if (bc->chip_class >= R700) {
+		if (indirect)
+			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG;
+		else
+			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG;
+	} else {
+		if (indirect)
+			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+		else
+			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+	}
 }
 
 /* alu instructions that can ony exits once per group */
@@ -1536,10 +1570,8 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
 	int r;
 
 	/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
-	if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) {
-		bc->need_wait_ack = false;
-		r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
-	}
+	if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH)
+		r600_bytecode_wait_acks(bc);
 
 	r = r600_bytecode_add_cf(bc);
 	if (r)
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index a526993b318..501d827744a 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -313,8 +313,11 @@ int r600_bytecode_add_output(struct r600_bytecode *bc,
 		const struct r600_bytecode_output *output);
 int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
 		const struct r600_bytecode_output *output);
-void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean needed);
-boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc);
+
+void r600_bytecode_add_ack(struct r600_bytecode *bc);
+int r600_bytecode_wait_acks(struct r600_bytecode *bc);
+uint32_t r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect);
+
 int r600_bytecode_build(struct r600_bytecode *bc);
 int r600_bytecode_add_cf(struct r600_bytecode *bc);
 int r600_bytecode_add_cfinst(struct r600_bytecode *bc,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index f65b6b32d24..2257262cace 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -969,9 +969,18 @@ static int tgsi_barrier(struct r600_shader_ctx *ctx)
 	r = r600_bytecode_add_alu(ctx->bc, &alu);
 	if (r)
 		return r;
+
+	/* XXX: Need to implement GWS ops to sync across wavefronts */
+
 	return 0;
 }
 
+static int tgsi_membar(struct r600_shader_ctx *ctx)
+{
+	/* Wait for any SSBO/image stores to land. */
+	return r600_bytecode_wait_acks(ctx->bc);
+}
+
 static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
 {
 	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
@@ -1659,10 +1668,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 			else {
 				struct r600_bytecode_vtx vtx;
 
-				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
-					r600_bytecode_need_wait_ack(ctx->bc, false);
-					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
-				}
+				r600_bytecode_wait_acks(ctx->bc);
 
 				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
 				vtx.op = FETCH_OP_READ_SCRATCH;
@@ -4475,7 +4481,7 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
 				cf.op = CF_OP_MEM_SCRATCH;
 				cf.elem_size = 3;
 				cf.gpr = reg;
-				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+				cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect);
 				cf.mark = 1;
 				cf.comp_mask = inst->Dst[0].Register.WriteMask;
 				cf.swizzle_x = 0;
@@ -4485,10 +4491,6 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
 				cf.burst_count = 1;
 
 				if (tgsi_dst->Register.Indirect) {
-					if (ctx->bc->chip_class < R700)
-						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
-					else
-						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
 					cf.index_gpr = ctx->bc->ar_reg;
 			}
 			else {
@@ -4500,8 +4502,8 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
 			if (r)
 				return;
 
-			if (ctx->bc->chip_class >= R700)
-				r600_bytecode_need_wait_ack(ctx->bc, true);
+			r600_bytecode_add_ack(ctx->bc);
+
 			}
 			return;
 		}
@@ -8952,9 +8954,8 @@ static int tgsi_load_rat(struct r600_shader_ctx *ctx)
 	cf->mark = 1;
 	cf->output.elem_size = 0;
 
-	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
-	cf = ctx->bc->cf_last;
-	cf->barrier = 1;
+	r600_bytecode_add_ack(ctx->bc);
+	r600_bytecode_wait_acks(ctx->bc);
 
 	desc = util_format_description(inst->Memory.Format);
 	r600_vertex_data_type(inst->Memory.Format,
@@ -9055,6 +9056,7 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
 			return r;
 	}
 
+	cf = NULL;
 	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
 	for (i = 0; i <= lasti; i++) {
 		struct r600_bytecode_alu alu;
@@ -9095,6 +9097,14 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
 		cf->barrier = 1;
 		cf->output.elem_size = 0;
 	}
+
+	/* Request an ack from the last write emitted. */
+	if (cf) {
+		cf->mark = true;
+		cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
+		r600_bytecode_add_ack(ctx->bc);
+	}
+
 	return 0;
 }
 
@@ -9144,7 +9154,7 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
 	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
 	cf->rat.inst = V_RAT_INST_STORE_TYPED;
 	cf->rat.index_mode = rat_index_mode;
-	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+	cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
 	cf->output.gpr = val_gpr;
 	cf->output.index_gpr = idx_gpr;
 	cf->output.comp_mask = 0xf;
@@ -9152,6 +9162,10 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
 	cf->vpm = 1;
 	cf->barrier = 1;
 	cf->output.elem_size = 0;
+	cf->mark = 1;
+
+	r600_bytecode_add_ack(ctx->bc);
+
 	return 0;
 }
 
@@ -9324,10 +9338,9 @@ static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
 	cf->barrier = 1;
 	cf->mark = 1;
 	cf->output.elem_size = 0;
-	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
-	cf = ctx->bc->cf_last;
-	cf->barrier = 1;
-	cf->cf_addr = 1;
+
+	r600_bytecode_add_ack(ctx->bc);
+	r600_bytecode_wait_acks(ctx->bc);
 
 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
 	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
@@ -12084,7 +12097,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
-	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
+	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_NOP, tgsi_membar},
 	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
@@ -12311,7 +12324,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
-	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
+	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_NOP, tgsi_membar},
 	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index 6b07dc1ecfc..12c2c61150a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -118,6 +118,11 @@
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND           0x00000001
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ                0x00000002
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND            0x00000003
+
+/* R700+-only */
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG        0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG    0x00000003
+
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((unsigned)(x) & 0x7F) << 15)
 #define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
 #define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR                          0xFFC07FFF



More information about the mesa-commit mailing list