[Mesa-dev] [PATCH 4/4] radeonsi: use SDMA for uploading data through const_uploader
Marek Olšák
maraeo at gmail.com
Thu Feb 7 01:22:01 UTC 2019
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_buffer.c | 56 ++++++++++++++++++++++--
src/gallium/drivers/radeonsi/si_dma_cs.c | 19 ++++----
src/gallium/drivers/radeonsi/si_gfx_cs.c | 42 +++++++++++++++---
src/gallium/drivers/radeonsi/si_pipe.c | 23 ++++++----
src/gallium/drivers/radeonsi/si_pipe.h | 17 +++++++
5 files changed, 131 insertions(+), 26 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index c01118ce96a..3f8db7cf4f0 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -433,21 +433,29 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
if (si_invalidate_buffer(sctx, buf)) {
/* At this point, the buffer is always idle. */
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
} else {
/* Fall back to a temporary buffer. */
usage |= PIPE_TRANSFER_DISCARD_RANGE;
}
}
- if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+ if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+ buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+ usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_PERSISTENT);
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ force_discard_range = true;
+ }
+
+ if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
PIPE_TRANSFER_PERSISTENT))) ||
(buf->flags & RADEON_FLAG_SPARSE))) {
assert(usage & PIPE_TRANSFER_WRITE);
/* Check if mapping this buffer would cause waiting for the GPU.
*/
if (buf->flags & RADEON_FLAG_SPARSE ||
force_discard_range ||
si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
@@ -514,32 +522,72 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
data += box->x;
return si_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, NULL, 0);
}
static void si_buffer_do_flush_region(struct pipe_context *ctx,
struct pipe_transfer *transfer,
const struct pipe_box *box)
{
+ struct si_context *sctx = (struct si_context*)ctx;
struct si_transfer *stransfer = (struct si_transfer*)transfer;
struct si_resource *buf = si_resource(transfer->resource);
if (stransfer->staging) {
unsigned src_offset = stransfer->offset +
transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
(box->x - transfer->box.x);
+ if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+ /* This should be true for all uploaders. */
+ assert(transfer->box.x == 0);
+
+ /* Find a previous upload and extend its range. The last
+ * upload is likely to be at the end of the list.
+ */
+ for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+ struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+ if (up->dst != buf)
+ continue;
+
+ assert(up->src == stransfer->staging);
+ assert(box->x > up->dst_offset);
+ up->size = box->x + box->width - up->dst_offset;
+ return;
+ }
+
+ /* Enlarge the array if it's full. */
+ if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+ unsigned size;
+
+ sctx->max_sdma_uploads += 4;
+ size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+ sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+ }
+
+ /* Add a new upload. */
+ struct si_sdma_upload *up =
+ &sctx->sdma_uploads[sctx->num_sdma_uploads++];
+ up->dst = up->src = NULL;
+ si_resource_reference(&up->dst, buf);
+ si_resource_reference(&up->src, stransfer->staging);
+ up->dst_offset = box->x;
+ up->src_offset = src_offset;
+ up->size = box->width;
+ return;
+ }
+
/* Copy the staging buffer into the original one. */
- si_copy_buffer((struct si_context*)ctx, transfer->resource,
- &stransfer->staging->b.b, box->x, src_offset,
- box->width);
+ si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+ box->x, src_offset, box->width);
}
util_range_add(&buf->valid_buffer_range, box->x,
box->x + box->width);
}
static void si_buffer_flush_region(struct pipe_context *ctx,
struct pipe_transfer *transfer,
const struct pipe_box *rel_box)
{
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 2aafc1f09a0..bba1bd95826 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -133,21 +133,22 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
if (dst) {
vram += dst->vram_usage;
gtt += dst->gart_usage;
}
if (src) {
vram += src->vram_usage;
gtt += src->gart_usage;
}
/* Flush the GFX IB if DMA depends on it. */
- if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+ if (!ctx->sdma_uploads_in_progress &&
+ radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
((dst &&
ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
RADEON_USAGE_READWRITE)) ||
(src &&
ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
RADEON_USAGE_WRITE))))
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
/* Flush if there's not enough space, or if the memory usage per IB
* is too large.
@@ -155,45 +156,47 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
* IBs using too little memory are limited by the IB submission overhead.
* IBs using too much memory are limited by the kernel/TTM overhead.
* Too long IBs create CPU-GPU pipeline bubbles and add latency.
*
* This heuristic makes sure that DMA requests are executed
* very soon after the call is made and lowers memory usage.
* It improves texture upload performance by keeping the DMA
* engine busy while uploads are being submitted.
*/
num_dw++; /* for emit_wait_idle below */
- if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
- ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
- !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+ if (!ctx->sdma_uploads_in_progress &&
+ (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+ ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+ !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
}
/* Wait for idle if either buffer has been used in the IB before to
* prevent read-after-write hazards.
*/
if ((dst &&
ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
RADEON_USAGE_READWRITE)) ||
(src &&
ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
RADEON_USAGE_WRITE)))
si_dma_emit_wait_idle(ctx);
+ unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
if (dst) {
- radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
- RADEON_USAGE_WRITE, 0);
+ ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
+ dst->domains, 0);
}
if (src) {
- radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
- RADEON_USAGE_READ, 0);
+ ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
+ src->domains, 0);
}
/* this function is called before all DMA calls, so increment this. */
ctx->num_dma_calls++;
}
void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence)
{
struct radeon_cmdbuf *cs = ctx->dma_cs;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 3d64587fa2b..13d5b5a959a 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -19,20 +19,21 @@
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "si_pipe.h"
#include "util/os_time.h"
+#include "util/u_upload_mgr.h"
/* initialize */
void si_need_gfx_cs_space(struct si_context *ctx)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;
/* There is no need to flush the DMA IB here, because
* si_need_dma_space always flushes the GFX IB if there is
* a conflict, which means any unflushed DMA commands automatically
* precede the GFX IB (= they had no dependency on the GFX IB when
@@ -57,20 +58,29 @@ void si_need_gfx_cs_space(struct si_context *ctx)
* and just flush if there is not enough space left.
*
* Also reserve space for stopping queries at the end of IB, because
* the number of active queries is mostly unlimited.
*/
unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
if (!ctx->ws->cs_check_space(cs, need_dwords))
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
+void si_unref_sdma_uploads(struct si_context *sctx)
+{
+ for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+ si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+ si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+ }
+ sctx->num_sdma_uploads = 0;
+}
+
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct radeon_winsys *ws = ctx->ws;
unsigned wait_flags = 0;
if (ctx->gfx_flush_in_progress)
return;
@@ -91,31 +101,51 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
(!wait_flags || !ctx->gfx_last_ib_is_busy))
return;
if (si_check_device_reset(ctx))
return;
if (ctx->screen->debug_flags & DBG(CHECK_VM))
flags &= ~PIPE_FLUSH_ASYNC;
+ ctx->gfx_flush_in_progress = true;
+
/* If the state tracker is flushing the GFX IB, si_flush_from_st is
* responsible for flushing the DMA IB and merging the fences from both.
- * This code is only needed when the driver flushes the GFX IB
- * internally, and it never asks for a fence handle.
+ * If the driver flushes the GFX IB internally, it should never ask
+ * for a fence handle.
*/
- if (radeon_emitted(ctx->dma_cs, 0)) {
- assert(fence == NULL); /* internal flushes only */
- si_flush_dma_cs(ctx, flags, NULL);
+ assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
+
+ /* Update the sdma_uploads list by flushing the uploader. */
+ u_upload_unmap(ctx->b.const_uploader);
+
+ /* Execute SDMA uploads. */
+ ctx->sdma_uploads_in_progress = true;
+ for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+ struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+ struct pipe_box box;
+
+ assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
+ up->size % 4 == 0);
+
+ u_box_1d(up->src_offset, up->size, &box);
+ ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
+ &up->src->b.b, 0, &box);
}
+ ctx->sdma_uploads_in_progress = false;
+ si_unref_sdma_uploads(ctx);
- ctx->gfx_flush_in_progress = true;
+ /* Flush SDMA (preamble IB). */
+ if (radeon_emitted(ctx->dma_cs, 0))
+ si_flush_dma_cs(ctx, flags, NULL);
if (!LIST_IS_EMPTY(&ctx->active_queries))
si_suspend_queries(ctx);
ctx->streamout.suspended = false;
if (ctx->streamout.begin_emitted) {
si_emit_streamout_end(ctx);
ctx->streamout.suspended = true;
}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c6f93e7b15e..c0ee2b1a6dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -257,20 +257,21 @@ static void si_destroy_context(struct pipe_context *context)
si_saved_cs_reference(&sctx->current_saved_cs, NULL);
_mesa_hash_table_destroy(sctx->tex_handles, NULL);
_mesa_hash_table_destroy(sctx->img_handles, NULL);
util_dynarray_fini(&sctx->resident_tex_handles);
util_dynarray_fini(&sctx->resident_img_handles);
util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+ si_unref_sdma_uploads(sctx);
FREE(sctx);
}
static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
{
struct si_context *sctx = (struct si_context *)ctx;
if (sctx->screen->info.has_gpu_reset_status_query)
return sctx->ws->ctx_query_reset_status(sctx->ctx);
@@ -436,43 +437,49 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
SI_RESOURCE_FLAG_CLEAR, false);
if (!sctx->allocator_zeroed_memory)
goto fail;
sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
0, PIPE_USAGE_STREAM,
SI_RESOURCE_FLAG_READ_ONLY);
if (!sctx->b.stream_uploader)
goto fail;
- sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
- 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_32BIT |
- (sscreen->cpdma_prefetch_writes_memory ?
- 0 : SI_RESOURCE_FLAG_READ_ONLY));
- if (!sctx->b.const_uploader)
- goto fail;
-
sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
0, PIPE_USAGE_STAGING, 0);
if (!sctx->cached_gtt_allocator)
goto fail;
sctx->ctx = sctx->ws->ctx_create(sctx->ws);
if (!sctx->ctx)
goto fail;
if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
(void*)si_flush_dma_cs,
sctx, stop_exec_on_failure);
}
+ bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs && debug_get_bool_option("SDMA", true);
+ sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
+ 0, PIPE_USAGE_DEFAULT,
+ SI_RESOURCE_FLAG_32BIT |
+ (use_sdma_upload ?
+ SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
+ (sscreen->cpdma_prefetch_writes_memory ?
+ 0 : SI_RESOURCE_FLAG_READ_ONLY)));
+ if (!sctx->b.const_uploader)
+ goto fail;
+
+ if (use_sdma_upload)
+ u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
si_init_buffer_functions(sctx);
si_init_clear_functions(sctx);
si_init_blit_functions(sctx);
si_init_compute_functions(sctx);
si_init_compute_blit_functions(sctx);
si_init_debug_functions(sctx);
si_init_msaa_functions(sctx);
si_init_streamout_functions(sctx);
if (sscreen->info.has_hw_decode) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b01d5744752..b208bdeb848 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -103,20 +103,22 @@
#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
#define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
enum si_clear_code
{
DCC_CLEAR_COLOR_0000 = 0x00000000,
DCC_CLEAR_COLOR_0001 = 0x40404040,
DCC_CLEAR_COLOR_1110 = 0x80808080,
DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0,
DCC_CLEAR_COLOR_REG = 0x20202020,
DCC_UNCOMPRESSED = 0xFFFFFFFF,
};
@@ -769,20 +771,28 @@ struct si_saved_cs {
struct si_context *ctx;
struct radeon_saved_cs gfx;
struct si_resource *trace_buf;
unsigned trace_id;
unsigned gfx_last_dw;
bool flushed;
int64_t time_flush;
};
+struct si_sdma_upload {
+ struct si_resource *dst;
+ struct si_resource *src;
+ unsigned src_offset;
+ unsigned dst_offset;
+ unsigned size;
+};
+
struct si_context {
struct pipe_context b; /* base class */
enum radeon_family family;
enum chip_class chip_class;
struct radeon_winsys *ws;
struct radeon_winsys_ctx *ctx;
struct radeon_cmdbuf *gfx_cs;
struct radeon_cmdbuf *dma_cs;
@@ -1074,20 +1084,26 @@ struct si_context {
int num_perfect_occlusion_queries;
struct list_head active_queries;
unsigned num_cs_dw_queries_suspend;
/* Render condition. */
struct pipe_query *render_cond;
unsigned render_cond_mode;
bool render_cond_invert;
bool render_cond_force_off; /* for u_blitter */
+ /* For uploading data via GTT and copying it to VRAM on context flush via SDMA. */
+ bool sdma_uploads_in_progress;
+ struct si_sdma_upload *sdma_uploads;
+ unsigned num_sdma_uploads;
+ unsigned max_sdma_uploads;
+
/* Statistics gathering for the DCC enablement heuristic. It can't be
* in si_texture because si_texture can be shared by multiple
* contexts. This is for back buffers only. We shouldn't get too many
* of those.
*
* X11 DRI3 rotates among a finite set of back buffers. They should
* all fit in this array. If they don't, separate DCC might never be
* enabled by DCC stat gathering.
*/
struct {
@@ -1273,20 +1289,21 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
struct tc_unflushed_batch_token *tc_token);
/* si_get.c */
void si_init_screen_get_functions(struct si_screen *sscreen);
/* si_gfx_cs.c */
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence);
void si_begin_new_gfx_cs(struct si_context *ctx);
void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
uint64_t begin);
/* si_compute.c */
void si_init_compute_functions(struct si_context *sctx);
--
2.17.1
More information about the mesa-dev
mailing list