[Mesa-dev] [PATCH 2/3] gallium/u_threaded: don't map big VRAM buffers for the first upload directly
Marek Olšák
maraeo at gmail.com
Sat Nov 4 13:03:21 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
Together with the radeonsi commit, this improves ParaView "many spheres"
performance by 4x.
---
src/gallium/auxiliary/util/u_threaded_context.c | 14 ++++++++++++++
src/gallium/auxiliary/util/u_threaded_context.h | 6 ++++++
src/gallium/drivers/radeon/r600_buffer_common.c | 10 ++++++++--
3 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 0f23258..ccce12b 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1277,20 +1277,34 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
unsigned offset, unsigned size)
{
/* Never invalidate inside the driver and never infer "unsynchronized". */
unsigned tc_flags = TC_TRANSFER_MAP_NO_INVALIDATE |
TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED;
/* Prevent a reentry. */
if (usage & tc_flags)
return usage;
+ /* Use the staging upload if it's preferred. */
+ if (usage & (PIPE_TRANSFER_DISCARD_RANGE |
+ PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ /* Try not to decrement the counter if it's not positive. Still racy,
+ * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+ tres->max_forced_staging_uploads > 0 &&
+ p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
+ usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+
+ return usage | tc_flags | PIPE_TRANSFER_DISCARD_RANGE;
+ }
+
/* Sparse buffers can't be mapped directly and can't be reallocated
* (fully invalidated). That may just be a radeonsi limitation, but
* the threaded context must obey it with radeonsi.
*/
if (tres->b.flags & PIPE_RESOURCE_FLAG_SPARSE) {
/* We can use DISCARD_RANGE instead of full discard. This is the only
* fast path for sparse buffers that doesn't need thread synchronization.
*/
if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)
usage |= PIPE_TRANSFER_DISCARD_RANGE;
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 8977b03..ac7bc3d 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -234,20 +234,26 @@ struct threaded_resource {
* the valid range of the base instance. It's used for transfers after
* a buffer invalidation, because such transfers operate on "latest", not
* the base instance. Initially it's set to &valid_buffer_range.
*/
struct util_range *base_valid_buffer_range;
/* Drivers are required to update this for shared resources and user
* pointers. */
bool is_shared;
bool is_user_ptr;
+
+ /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
+ * method of CPU access when map flags allow it; decremented on each such
+ * map. Useful for buffers that are too large for the visible VRAM window.
+ */
+ int max_forced_staging_uploads;
};
struct threaded_transfer {
struct pipe_transfer b;
/* Staging buffer for DISCARD_RANGE transfers. */
struct pipe_resource *staging;
/* Offset into the staging buffer, because the backing buffer is
* sub-allocated. */
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 67daaa4..92521f4 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -184,24 +184,29 @@ void si_init_resource_fields(struct r600_common_screen *rscreen,
res->flags &= ~RADEON_FLAG_NO_CPU_ACCESS; /* disallowed with VRAM_GTT */
}
if (rscreen->debug_flags & DBG(NO_WC))
res->flags &= ~RADEON_FLAG_GTT_WC;
/* Set expected VRAM and GART usage for the buffer. */
res->vram_usage = 0;
res->gart_usage = 0;
- if (res->domains & RADEON_DOMAIN_VRAM)
+ if (res->domains & RADEON_DOMAIN_VRAM) {
res->vram_usage = size;
- else if (res->domains & RADEON_DOMAIN_GTT)
+
+ res->b.max_forced_staging_uploads =
+ rscreen->info.has_dedicated_vram &&
+ size >= rscreen->info.vram_vis_size / 4 ? 1 : 0;
+ } else if (res->domains & RADEON_DOMAIN_GTT) {
res->gart_usage = size;
+ }
}
bool si_alloc_resource(struct r600_common_screen *rscreen,
struct r600_resource *res)
{
struct pb_buffer *old_buf, *new_buf;
/* Allocate a new resource. */
new_buf = rscreen->ws->buffer_create(rscreen->ws, res->bo_size,
res->bo_alignment,
@@ -282,20 +287,21 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
struct pipe_resource *src)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_resource *rdst = r600_resource(dst);
struct r600_resource *rsrc = r600_resource(src);
uint64_t old_gpu_address = rdst->gpu_address;
pb_reference(&rdst->buf, rsrc->buf);
rdst->gpu_address = rsrc->gpu_address;
rdst->b.b.bind = rsrc->b.b.bind;
+ rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
rdst->flags = rsrc->flags;
assert(rdst->vram_usage == rsrc->vram_usage);
assert(rdst->gart_usage == rsrc->gart_usage);
assert(rdst->bo_size == rsrc->bo_size);
assert(rdst->bo_alignment == rsrc->bo_alignment);
assert(rdst->domains == rsrc->domains);
rctx->rebind_buffer(ctx, dst, old_gpu_address);
}
--
2.7.4
More information about the mesa-dev
mailing list