[Mesa-dev] [PATCH 09/13] radeonsi: use TC L2 for CP DMA operations with shader resources on CIK
Marek Olšák
maraeo at gmail.com
Mon Jan 5 12:20:59 PST 2015
From: Marek Olšák <marek.olsak at amd.com>
This makes CP DMA go through TC L2 for these resources, so that TC L2 doesn't need to be flushed.
The only problem is with index buffers, which don't use TC.
A simple solution is added: a per-buffer flag (TC_L2_dirty) that makes a draw call flush TC L2 first if its index buffer has the flag set.
---
src/gallium/drivers/radeon/r600_pipe_common.h | 12 ++++++++++
src/gallium/drivers/radeonsi/si_descriptors.c | 32 ++++++++++++++++++---------
src/gallium/drivers/radeonsi/si_state_draw.c | 5 +++++
3 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a9416b6..60b8fae 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -138,6 +138,18 @@ struct r600_resource {
* the unsynchronized map flag and expect the driver to figure it out.
*/
struct util_range valid_buffer_range;
+
+ /* For buffers only. This indicates that a write operation has been
+ * performed by TC L2, but the cache hasn't been flushed.
+ * Any hw block which doesn't use or bypasses TC L2 should check this
+ * flag and flush the cache before using the buffer.
+ *
+ * For example, TC L2 must be flushed if a buffer which has been
+ * modified by a shader store instruction is about to be used as
+ * an index buffer. The reason is that VGT DMA index fetching doesn't
+ * use TC L2.
+ */
+ bool TC_L2_dirty;
};
struct r600_transfer {
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index e2da476..454e12c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1073,7 +1073,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
bool is_framebuffer)
{
struct si_context *sctx = (struct si_context*)ctx;
- unsigned flush_flags;
+ unsigned flush_flags, tc_l2_flag;
if (!size)
return;
@@ -1098,19 +1098,22 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
uint64_t va = r600_resource(dst)->gpu_address + offset;
/* Flush the caches where the resource is bound. */
- if (is_framebuffer)
+ if (is_framebuffer) {
flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- else
+ tc_l2_flag = 0;
+ } else {
flush_flags = SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
SI_CONTEXT_INV_KCACHE;
+ tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+ }
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
flush_flags;
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
- unsigned dma_flags = 0;
+ unsigned dma_flags = tc_l2_flag;
si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
FALSE);
@@ -1141,6 +1144,9 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
/* Flush the caches again in case the 3D engine has been prefetching
* the resource. */
sctx->b.flags |= flush_flags;
+
+ if (tc_l2_flag)
+ r600_resource(dst)->TC_L2_dirty = true;
}
void si_copy_buffer(struct si_context *sctx,
@@ -1148,7 +1154,7 @@ void si_copy_buffer(struct si_context *sctx,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
bool is_framebuffer)
{
- unsigned flush_flags;
+ unsigned flush_flags, tc_l2_flag;
if (!size)
return;
@@ -1163,18 +1169,21 @@ void si_copy_buffer(struct si_context *sctx,
src_offset += r600_resource(src)->gpu_address;
/* Flush the caches where the resource is bound. */
- if (is_framebuffer)
+ if (is_framebuffer) {
flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- else
+ tc_l2_flag = 0;
+ } else {
flush_flags = SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
SI_CONTEXT_INV_KCACHE;
+ tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+ }
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
flush_flags;
while (size) {
- unsigned sync_flags = 0;
+ unsigned sync_flags = tc_l2_flag;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
@@ -1206,6 +1215,9 @@ void si_copy_buffer(struct si_context *sctx,
/* Flush the caches again in case the 3D engine has been prefetching
* the resource. */
sctx->b.flags |= flush_flags;
+
+ if (tc_l2_flag)
+ r600_resource(dst)->TC_L2_dirty = true;
}
/* INIT/DEINIT */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ccc44d5..e6916c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -551,6 +551,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
}
}
+ if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
+ sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+ r600_resource(ib.buffer)->TC_L2_dirty = false;
+ }
+
/* Check flush flags. */
if (sctx->b.flags)
sctx->atoms.s.cache_flush->dirty = true;
--
2.1.0
More information about the mesa-dev
mailing list