[Mesa-dev] [PATCH 2/3] radeonsi: implement TC L2 write-back (flush) without cache invalidation
Marek Olšák
maraeo at gmail.com
Tue Oct 11 14:48:02 UTC 2016
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_pipe.h | 21 ++++----
src/gallium/drivers/radeonsi/si_state_draw.c | 81 +++++++++++++++++++++-------
2 files changed, 74 insertions(+), 28 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3cefee7..e10d3fb 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -43,31 +43,34 @@
#define SI_GS_PER_ES 128
/* Instruction cache. */
#define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0)
/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
#define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1)
/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2)
/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
+/* Write dirty L2 lines back to memory (shader and CP DMA stores) without
+ * invalidating L2. SI-CIK can't do that, so they invalidate L2 entirely. */
+#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 4)
/* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
-#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
-#define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 6)
-#define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
+#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 6)
+#define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8)
/* Engine synchronization. */
-#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 8)
-#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
-#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
-#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
-#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
+#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
+#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
+#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)
#define SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER (SI_CONTEXT_FLUSH_AND_INV_CB | \
SI_CONTEXT_FLUSH_AND_INV_CB_META | \
SI_CONTEXT_FLUSH_AND_INV_DB | \
SI_CONTEXT_FLUSH_AND_INV_DB_META)
#define SI_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id) & 0xffff))
#define SI_IS_TRACE_POINT(x) (((x) & 0xcafe0000) == 0xcafe0000)
#define SI_GET_TRACE_POINT_ID(x) ((x) & 0xffff)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 38e5cb4..33b6b23 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -689,48 +689,52 @@ static void si_emit_draw_packets(struct si_context *sctx,
radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
} else {
radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
radeon_emit(cs, info->count);
radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
}
}
}
+static void si_emit_surface_sync(struct r600_common_context *rctx,
+ unsigned cp_coher_cntl)
+{
+ struct radeon_winsys_cs *cs = rctx->gfx.cs;
+
+ /* ACQUIRE_MEM is only required on a compute ring. */
+ radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+}
+
void si_emit_cache_flush(struct si_context *sctx)
{
struct r600_common_context *rctx = &sctx->b;
struct radeon_winsys_cs *cs = rctx->gfx.cs;
uint32_t cp_coher_cntl = 0;
/* SI has a bug that it always flushes ICACHE and KCACHE if either
* bit is set. An alternative way is to write SQC_CACHES, but that
* doesn't seem to work reliably. Since the bug doesn't affect
* correctness (it only does more work than necessary) and
* the performance impact is likely negligible, there is no plan
* to add a workaround for it.
*/
if (rctx->flags & SI_CONTEXT_INV_ICACHE)
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
if (rctx->flags & SI_CONTEXT_INV_SMEM_L1)
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
- if (rctx->flags & SI_CONTEXT_INV_VMEM_L1)
- cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
- if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
- cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-
- if (rctx->chip_class >= VI)
- cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
- }
-
if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
S_0085F0_CB0_DEST_BASE_ENA(1) |
S_0085F0_CB1_DEST_BASE_ENA(1) |
S_0085F0_CB2_DEST_BASE_ENA(1) |
S_0085F0_CB3_DEST_BASE_ENA(1) |
S_0085F0_CB4_DEST_BASE_ENA(1) |
S_0085F0_CB5_DEST_BASE_ENA(1) |
S_0085F0_CB6_DEST_BASE_ENA(1) |
S_0085F0_CB7_DEST_BASE_ENA(1);
@@ -799,37 +803,76 @@ void si_emit_cache_flush(struct si_context *sctx)
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
}
if (rctx->flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
}
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
- if (cp_coher_cntl || (rctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH)) {
+ if (cp_coher_cntl ||
+ (rctx->flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
+ SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
- /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
- * Therefore, it should be last. Done in PFP.
+ /* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+ * waits for idle. Therefore, it should be last. SURFACE_SYNC is done
+ * in PFP.
+ *
+ * cp_coher_cntl should contain all necessary flags except TC flags
+ * at this point.
+ *
+ * SI-CIK don't support L2 write-back.
*/
- if (cp_coher_cntl) {
- /* ACQUIRE_MEM is only required on a compute ring. */
- radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2 ||
+ (rctx->chip_class <= CIK &&
+ (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
+ /* Invalidate L1 & L2. (L1 is always invalidated)
+ * WB must be set on VI+ when TC_ACTION is set.
+ */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0085F0_TC_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
+ cp_coher_cntl = 0;
+ } else {
+ /* L1 invalidation and L2 write-back can't be combined into one
+ * operation, so each gets its own SURFACE_SYNC packet.
+ */
+ if (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
+ /* WB = write-back
+ * NC = apply to non-coherent MTYPEs
+ * (i.e. MTYPE <= 1, which is what we use everywhere)
+ *
+ * WB doesn't work without NC.
+ */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0301F0_TC_WB_ACTION_ENA(1) |
+ S_0301F0_TC_NC_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
+ if (rctx->flags & SI_CONTEXT_INV_VMEM_L1) {
+ /* Invalidate per-CU VMEM L1. */
+ si_emit_surface_sync(rctx, cp_coher_cntl |
+ S_0085F0_TCL1_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
}
+ /* Emit any remaining non-TC flags that the TC syncs above didn't consume. */
+ if (cp_coher_cntl)
+ si_emit_surface_sync(rctx, cp_coher_cntl);
+
if (rctx->flags & R600_CONTEXT_START_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
EVENT_INDEX(0));
} else if (rctx->flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
EVENT_INDEX(0));
}
--
2.7.4
More information about the mesa-dev
mailing list