[Mesa-dev] [PATCH 3/3] radeonsi: don't map big VRAM buffers for the first upload directly
Nicolai Hähnle
nhaehnle at gmail.com
Mon Nov 6 11:19:55 UTC 2017
For the series:
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
On 04.11.2017 14:03, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
>  src/gallium/drivers/radeon/r600_buffer_common.c | 20 ++++++++++++++++++++
>  src/gallium/drivers/radeon/r600_pipe_common.h   |  1 +
>  2 files changed, 21 insertions(+)
>
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index 92521f4..519e52e 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -187,20 +187,21 @@ void si_init_resource_fields(struct r600_common_screen *rscreen,
>  	if (rscreen->debug_flags & DBG(NO_WC))
>  		res->flags &= ~RADEON_FLAG_GTT_WC;
> 
>  	/* Set expected VRAM and GART usage for the buffer. */
>  	res->vram_usage = 0;
>  	res->gart_usage = 0;
> 
>  	if (res->domains & RADEON_DOMAIN_VRAM) {
>  		res->vram_usage = size;
> 
> +		res->max_forced_staging_uploads =
>  		res->b.max_forced_staging_uploads =
>  			rscreen->info.has_dedicated_vram &&
>  			size >= rscreen->info.vram_vis_size / 4 ? 1 : 0;
>  	} else if (res->domains & RADEON_DOMAIN_GTT) {
>  		res->gart_usage = size;
>  	}
>  }
> 
>  bool si_alloc_resource(struct r600_common_screen *rscreen,
>  		       struct r600_resource *res)
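
To restate the heuristic in this hunk: only GPUs with dedicated VRAM are
affected, and only buffers at least a quarter of the CPU-visible VRAM
window get their upload forced through staging (exactly once). A minimal
standalone sketch of that decision, with a hypothetical gpu_info struct
standing in for the rscreen->info fields used above:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for the radeon_info fields used in the hunk. */
struct gpu_info {
	bool has_dedicated_vram;   /* discrete GPU rather than an APU */
	uint64_t vram_vis_size;    /* CPU-visible VRAM window, in bytes */
};

/* One forced staging upload for "big" VRAM buffers, none otherwise.
 * "Big" means at least 1/4 of the CPU-visible window, where a direct
 * CPU mapping could push the buffer out of VRAM. */
static int forced_staging_uploads(const struct gpu_info *info, uint64_t size)
{
	return info->has_dedicated_vram &&
	       size >= info->vram_vis_size / 4 ? 1 : 0;
}
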
> @@ -288,20 +289,21 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
>  {
>  	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
>  	struct r600_resource *rdst = r600_resource(dst);
>  	struct r600_resource *rsrc = r600_resource(src);
>  	uint64_t old_gpu_address = rdst->gpu_address;
> 
>  	pb_reference(&rdst->buf, rsrc->buf);
>  	rdst->gpu_address = rsrc->gpu_address;
>  	rdst->b.b.bind = rsrc->b.b.bind;
>  	rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
> +	rdst->max_forced_staging_uploads = rsrc->max_forced_staging_uploads;
>  	rdst->flags = rsrc->flags;
> 
>  	assert(rdst->vram_usage == rsrc->vram_usage);
>  	assert(rdst->gart_usage == rsrc->gart_usage);
>  	assert(rdst->bo_size == rsrc->bo_size);
>  	assert(rdst->bo_alignment == rsrc->bo_alignment);
>  	assert(rdst->domains == rsrc->domains);
> 
>  	rctx->rebind_buffer(ctx, dst, old_gpu_address);
>  }
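
si_replace_buffer_storage makes dst adopt src's backing buffer (this is
how whole-resource invalidation is implemented), so every field derived
from the old storage has to be refreshed, including both copies of the
forced-staging budget. A toy model of that bookkeeping with a
hypothetical struct; the real code uses pb_reference and
rctx->rebind_buffer:

#include <stdint.h>

/* Hypothetical, reduced resource: only the storage-derived fields. */
struct buffer {
	uint64_t gpu_address;
	int max_forced_staging_uploads;
};

/* dst silently switches to src's storage; anything cached from the old
 * storage is copied over, then users of the old address are rebound. */
static void replace_storage(struct buffer *dst, const struct buffer *src,
			    void (*rebind)(uint64_t old_gpu_address))
{
	uint64_t old_gpu_address = dst->gpu_address;

	dst->gpu_address = src->gpu_address;
	dst->max_forced_staging_uploads = src->max_forced_staging_uploads;
	rebind(old_gpu_address);
}
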
> @@ -395,20 +397,37 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
>  	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
>  		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
>  	}
> 
>  	/* If discarding the entire range, discard the whole resource instead. */
>  	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
>  	    box->x == 0 && box->width == resource->width0) {
>  		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
>  	}
> 
> +	/* If a buffer in VRAM is too large and the range is discarded, don't
> +	 * map it directly. This makes sure that the buffer stays in VRAM.
> +	 */
> +	bool force_discard_range = false;
> +	if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
> +		     PIPE_TRANSFER_DISCARD_RANGE) &&
> +	    !(usage & PIPE_TRANSFER_PERSISTENT) &&
> +	    /* Try not to decrement the counter if it's not positive. Still racy,
> +	     * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
> +	    rbuffer->max_forced_staging_uploads > 0 &&
> +	    p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
> +		usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
> +			   PIPE_TRANSFER_UNSYNCHRONIZED);
> +		usage |= PIPE_TRANSFER_DISCARD_RANGE;
> +		force_discard_range = true;
> +	}
> +
>  	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
>  	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
>  		       TC_TRANSFER_MAP_NO_INVALIDATE))) {
>  		assert(usage & PIPE_TRANSFER_WRITE);
> 
>  		if (r600_invalidate_buffer(rctx, rbuffer)) {
>  			/* At this point, the buffer is always idle. */
>  			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
>  		} else {
>  			/* Fall back to a temporary buffer. */
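
The counter check in the new block is deliberately tolerant of races:
two threads can both pass the "> 0" pre-check and drive the counter
negative, which is harmless. The pre-check only exists so that repeated
mappings cannot decrement the counter from INT_MIN all the way around
to INT_MAX. A self-contained sketch of the same pattern, written with
C11 atomics instead of mesa's p_atomic_* wrappers:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int remaining_forced_uploads;

/* Returns true if the caller claimed one of the remaining
 * forced-staging slots. atomic_fetch_sub returns the value *before*
 * the decrement, so "old - 1 >= 0" matches p_atomic_dec_return() >= 0
 * in the hunk above. Overshooting below zero is fine; the pre-check
 * merely keeps the counter from drifting toward INT_MIN and wrapping. */
static bool claim_forced_staging_slot(void)
{
	return atomic_load(&remaining_forced_uploads) > 0 &&
	       atomic_fetch_sub(&remaining_forced_uploads, 1) - 1 >= 0;
}
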
> @@ -420,20 +439,21 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
>  		 !(rscreen->debug_flags & DBG(NO_DISCARD_RANGE)) &&
>  		 ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
>  			      PIPE_TRANSFER_PERSISTENT)) &&
>  		   r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) ||
>  		  (rbuffer->flags & RADEON_FLAG_SPARSE))) {
>  		assert(usage & PIPE_TRANSFER_WRITE);
> 
>  		/* Check if mapping this buffer would cause waiting for the GPU.
>  		 */
>  		if (rbuffer->flags & RADEON_FLAG_SPARSE ||
> +		    force_discard_range ||
>  		    si_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
>  		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
>  			/* Do a wait-free write-only transfer using a temporary buffer. */
>  			unsigned offset;
>  			struct r600_resource *staging = NULL;
> 
>  			u_upload_alloc(ctx->stream_uploader, 0,
>  				       box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
>  				       rctx->screen->info.tcc_cache_line_size,
>  				       &offset, (struct pipe_resource**)&staging,
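
In the staging path above, u_upload_alloc is asked for box->width plus
(box->x % R600_MAP_BUFFER_ALIGNMENT) bytes. The plausible reading: the
allocation is padded so that CPU writes can start at the same
sub-alignment offset as the destination range, keeping the source and
destination of the later DMA copy equally aligned. A small sketch of
that arithmetic, with MAP_ALIGN as a hypothetical stand-in for
R600_MAP_BUFFER_ALIGNMENT:

/* MAP_ALIGN is a hypothetical stand-in for R600_MAP_BUFFER_ALIGNMENT. */
#define MAP_ALIGN 64

/* Bytes to request from the staging allocator: the range itself plus
 * room to replicate the destination's misalignment. */
static unsigned staging_alloc_size(unsigned box_x, unsigned box_width)
{
	return box_width + box_x % MAP_ALIGN;
}

/* Offset into the staging allocation where CPU writes begin, chosen so
 * that (offset % MAP_ALIGN) == (box_x % MAP_ALIGN), i.e. the DMA copy
 * sees identically aligned source and destination. */
static unsigned staging_write_offset(unsigned box_x)
{
	return box_x % MAP_ALIGN;
}
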
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index a45921e..a7fec37 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -167,20 +167,21 @@ struct r600_resource {
>  	/* Memory usage if the buffer placement is optimal. */
>  	uint64_t			vram_usage;
>  	uint64_t			gart_usage;
> 
>  	/* Resource properties. */
>  	uint64_t			bo_size;
>  	unsigned			bo_alignment;
>  	enum radeon_bo_domain		domains;
>  	enum radeon_bo_flag		flags;
>  	unsigned			bind_history;
> +	int				max_forced_staging_uploads;
> 
>  	/* The buffer range which is initialized (with a write transfer,
>  	 * streamout, DMA, or as a random access target). The rest of
>  	 * the buffer is considered invalid and can be mapped unsynchronized.
>  	 *
>  	 * This allows unsynchronized mapping of a buffer range which hasn't
>  	 * been used yet. It's for applications which forget to use
>  	 * the unsynchronized map flag and expect the driver to figure it out.
>  	 */
>  	struct util_range		valid_buffer_range;
>
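
The comment on valid_buffer_range describes the trick that lets the map
function promote transfers to UNSYNCHRONIZED in the first place: track
the one byte range that has ever been written, and treat writes that do
not intersect it as safe to map without waiting for the GPU. A minimal
model of that bookkeeping; mesa's real implementation is struct
util_range with util_range_add() and util_ranges_intersect():

#include <stdbool.h>

/* Minimal model: a single initialized [start, end) byte range,
 * empty while start >= end. */
struct valid_range {
	unsigned start, end;
};

static bool range_intersects(const struct valid_range *r,
			     unsigned x, unsigned x_end)
{
	return r->start < r->end && x < r->end && r->start < x_end;
}

/* Every write (transfer, streamout, DMA) extends the range. */
static void range_add(struct valid_range *r, unsigned x, unsigned x_end)
{
	if (r->start >= r->end) {
		r->start = x;
		r->end = x_end;
	} else {
		if (x < r->start)
			r->start = x;
		if (x_end > r->end)
			r->end = x_end;
	}
}
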
--
Learn how the world really is,
But never forget how it should be.