Mesa (staging/22.0): r600: Implement memoryBarrier() in the non-SFN path.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Thu Apr 21 16:23:33 UTC 2022
Module: Mesa
Branch: staging/22.0
Commit: b3d7e39b538bd0edb49e4b6ab07be2f672d938ed
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=b3d7e39b538bd0edb49e4b6ab07be2f672d938ed
Author: Emma Anholt <emma at anholt.net>
Date: Fri Feb 11 15:11:59 2022 -0800
r600: Implement memoryBarrier() in the non-SFN path.
Previously we were just doing a group barrier for both membar and barrier.
This sometimes worked out, because atomics and reads waited for ack
already, but writes were not waiting for ack. Use the need_wait_ack
pattern that scratch writes used, with a little refactoring for
reusability.
The refactor also incidentally fixes the atomics path, which previously
waited only when the number of outstanding acks was > 1 instead of > 0.
Cc: mesa-stable
Fixes: #6028
Reviewed-by: Gert Wollny <gert.wollny at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14429>
(cherry picked from commit b8324a7387d5282454e8635cde2d362c873b8a2d)
Conflicts:
src/gallium/drivers/r600/ci/r600-turks-fails.txt
---
.pick_status.json | 2 +-
src/gallium/drivers/r600/eg_sq.h | 2 ++
src/gallium/drivers/r600/r600_asm.c | 48 ++++++++++++++++++++++++-----
src/gallium/drivers/r600/r600_asm.h | 7 +++--
src/gallium/drivers/r600/r600_shader.c | 55 +++++++++++++++++++++-------------
src/gallium/drivers/r600/r600_sq.h | 5 ++++
6 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/.pick_status.json b/.pick_status.json
index e565f4f5006..a9b98156b36 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -7548,7 +7548,7 @@
"description": "r600: Implement memoryBarrier() in the non-SFN path.",
"nominated": true,
"nomination_type": 0,
- "resolution": 5,
+ "resolution": 1,
"because_sha": null
},
{
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index c6280167a92..bdf9b330ee9 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -165,7 +165,9 @@
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE 0x00000000
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK 0x00000003
#define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15)
#define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F)
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 321d5e3b555..c0a29e3d2f4 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -240,14 +240,48 @@ int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
return 0;
}
-void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack)
+void
+r600_bytecode_add_ack(struct r600_bytecode *bc)
{
- bc->need_wait_ack = need_wait_ack;
+ bc->need_wait_ack = true;
}
-boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc)
+int
+r600_bytecode_wait_acks(struct r600_bytecode *bc)
{
- return bc->need_wait_ack;
+ /* Store acks are an R700+ feature. */
+ if (bc->chip_class < R700)
+ return 0;
+
+ if (!bc->need_wait_ack)
+ return 0;
+
+ int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
+ if (ret != 0)
+ return ret;
+
+ struct r600_bytecode_cf *cf = bc->cf_last;
+ cf->barrier = 1;
+ /* Request a wait if the number of outstanding acks is > 0 */
+ cf->cf_addr = 0;
+
+ return 0;
+}
+
+uint32_t
+r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect)
+{
+ if (bc->chip_class >= R700) {
+ if (indirect)
+ return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG;
+ else
+ return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG;
+ } else {
+ if (indirect)
+ return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+ else
+ return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+ }
}
/* alu instructions that can ony exits once per group */
@@ -1536,10 +1570,8 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
int r;
/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
- if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) {
- bc->need_wait_ack = false;
- r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
- }
+ if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH)
+ r600_bytecode_wait_acks(bc);
r = r600_bytecode_add_cf(bc);
if (r)
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index a526993b318..501d827744a 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -313,8 +313,11 @@ int r600_bytecode_add_output(struct r600_bytecode *bc,
const struct r600_bytecode_output *output);
int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
const struct r600_bytecode_output *output);
-void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean needed);
-boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc);
+
+void r600_bytecode_add_ack(struct r600_bytecode *bc);
+int r600_bytecode_wait_acks(struct r600_bytecode *bc);
+uint32_t r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect);
+
int r600_bytecode_build(struct r600_bytecode *bc);
int r600_bytecode_add_cf(struct r600_bytecode *bc);
int r600_bytecode_add_cfinst(struct r600_bytecode *bc,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index f65b6b32d24..2257262cace 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -969,9 +969,18 @@ static int tgsi_barrier(struct r600_shader_ctx *ctx)
r = r600_bytecode_add_alu(ctx->bc, &alu);
if (r)
return r;
+
+ /* XXX: Need to implement GWS ops to sync across wavefronts */
+
return 0;
}
+static int tgsi_membar(struct r600_shader_ctx *ctx)
+{
+ /* Wait for any SSBO/image stores to land. */
+ return r600_bytecode_wait_acks(ctx->bc);
+}
+
static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
@@ -1659,10 +1668,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
else {
struct r600_bytecode_vtx vtx;
- if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
- r600_bytecode_need_wait_ack(ctx->bc, false);
- r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
- }
+ r600_bytecode_wait_acks(ctx->bc);
memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
vtx.op = FETCH_OP_READ_SCRATCH;
@@ -4475,7 +4481,7 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
cf.op = CF_OP_MEM_SCRATCH;
cf.elem_size = 3;
cf.gpr = reg;
- cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+ cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect);
cf.mark = 1;
cf.comp_mask = inst->Dst[0].Register.WriteMask;
cf.swizzle_x = 0;
@@ -4485,10 +4491,6 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
cf.burst_count = 1;
if (tgsi_dst->Register.Indirect) {
- if (ctx->bc->chip_class < R700)
- cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
- else
- cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
cf.index_gpr = ctx->bc->ar_reg;
}
else {
@@ -4500,8 +4502,8 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
if (r)
return;
- if (ctx->bc->chip_class >= R700)
- r600_bytecode_need_wait_ack(ctx->bc, true);
+ r600_bytecode_add_ack(ctx->bc);
+
}
return;
}
@@ -8952,9 +8954,8 @@ static int tgsi_load_rat(struct r600_shader_ctx *ctx)
cf->mark = 1;
cf->output.elem_size = 0;
- r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
- cf = ctx->bc->cf_last;
- cf->barrier = 1;
+ r600_bytecode_add_ack(ctx->bc);
+ r600_bytecode_wait_acks(ctx->bc);
desc = util_format_description(inst->Memory.Format);
r600_vertex_data_type(inst->Memory.Format,
@@ -9055,6 +9056,7 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
return r;
}
+ cf = NULL;
lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
for (i = 0; i <= lasti; i++) {
struct r600_bytecode_alu alu;
@@ -9095,6 +9097,14 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
cf->barrier = 1;
cf->output.elem_size = 0;
}
+
+ /* Request an ack from the last write emitted. */
+ if (cf) {
+ cf->mark = true;
+ cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
+ r600_bytecode_add_ack(ctx->bc);
+ }
+
return 0;
}
@@ -9144,7 +9154,7 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
cf->rat.inst = V_RAT_INST_STORE_TYPED;
cf->rat.index_mode = rat_index_mode;
- cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+ cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
cf->output.gpr = val_gpr;
cf->output.index_gpr = idx_gpr;
cf->output.comp_mask = 0xf;
@@ -9152,6 +9162,10 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
cf->vpm = 1;
cf->barrier = 1;
cf->output.elem_size = 0;
+ cf->mark = 1;
+
+ r600_bytecode_add_ack(ctx->bc);
+
return 0;
}
@@ -9324,10 +9338,9 @@ static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
cf->barrier = 1;
cf->mark = 1;
cf->output.elem_size = 0;
- r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
- cf = ctx->bc->cf_last;
- cf->barrier = 1;
- cf->cf_addr = 1;
+
+ r600_bytecode_add_ack(ctx->bc);
+ r600_bytecode_wait_acks(ctx->bc);
memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
@@ -12084,7 +12097,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
- [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
+ [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
[113] = { ALU_OP0_NOP, tgsi_unsupported},
[114] = { ALU_OP0_NOP, tgsi_unsupported},
[115] = { ALU_OP0_NOP, tgsi_unsupported},
@@ -12311,7 +12324,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
- [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
+ [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
[113] = { ALU_OP0_NOP, tgsi_unsupported},
[114] = { ALU_OP0_NOP, tgsi_unsupported},
[115] = { ALU_OP0_NOP, tgsi_unsupported},
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index 6b07dc1ecfc..12c2c61150a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -118,6 +118,11 @@
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003
+
+/* R700+-only */
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG 0x00000002
+#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG 0x00000003
+
#define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15)
#define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F)
#define C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR 0xFFC07FFF
More information about the mesa-commit mailing list