[Mesa-dev] [PATCH v2 18/25] winsys/amdgpu: add sparse buffers to CS
Nicolai Hähnle
nhaehnle at gmail.com
Tue Mar 28 09:12:08 UTC 2017
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
... and implement the corresponding fence handling.
v2:
- add missing bit in amdgpu_bo_is_referenced_by_cs_with_usage
- remove pipe_mutex_*
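For readers skimming the diff below: the patch tracks sparse buffers in their own per-CS list, accounts their committed backing memory as soon as they are added, and only flattens the backing buffers into the real buffer list at submit time. The standalone C sketch that follows is purely illustrative and is not part of the patch; the types and function names (cs_model, add_sparse, flatten_backings) are made up, and locking (commit_lock) and fence bookkeeping are deliberately omitted.

/* Simplified, standalone model of the two-phase handling in this patch.
 * All names are illustrative; this is not the actual winsys API. */
#include <stdlib.h>

struct backing { struct backing *next; size_t size; };

struct sparse_bo {
   struct backing *backings;   /* currently committed backing allocations */
};

struct cs_model {
   struct sparse_bo **sparse;  /* per-CS list of referenced sparse BOs */
   unsigned num_sparse, max_sparse;
   size_t used_vram;           /* accounted eagerly when the BO is added */
   struct backing **real;      /* flat list built right before submit */
   unsigned num_real, max_real;
};

/* Phase 1: adding a sparse BO records only the BO itself, but memory use
 * of its backings is accounted immediately. */
static int add_sparse(struct cs_model *cs, struct sparse_bo *bo)
{
   if (cs->num_sparse == cs->max_sparse) {
      unsigned n = cs->max_sparse ? cs->max_sparse * 2 : 16;
      void *p = realloc(cs->sparse, n * sizeof(*cs->sparse));
      if (!p)
         return -1;
      cs->sparse = p;
      cs->max_sparse = n;
   }
   cs->sparse[cs->num_sparse++] = bo;

   for (struct backing *b = bo->backings; b; b = b->next)
      cs->used_vram += b->size;
   return 0;
}

/* Phase 2: only at submit time are the backing buffers appended to the
 * real buffer list that is handed to the kernel. */
static int flatten_backings(struct cs_model *cs)
{
   for (unsigned i = 0; i < cs->num_sparse; i++) {
      for (struct backing *b = cs->sparse[i]->backings; b; b = b->next) {
         if (cs->num_real == cs->max_real) {
            unsigned n = cs->max_real ? cs->max_real * 2 : 16;
            void *p = realloc(cs->real, n * sizeof(*cs->real));
            if (!p)
               return -1;
            cs->real = p;
            cs->max_real = n;
         }
         cs->real[cs->num_real++] = b;
      }
   }
   return 0;
}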
---
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 147 +++++++++++++++++++++++++++---
src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 9 +-
2 files changed, 140 insertions(+), 16 deletions(-)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index bffa725..3ae5d33 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -279,23 +279,26 @@ static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
{
unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
int i = cs->buffer_indices_hashlist[hash];
struct amdgpu_cs_buffer *buffers;
int num_buffers;
if (bo->bo) {
buffers = cs->real_buffers;
num_buffers = cs->num_real_buffers;
- } else {
+ } else if (!bo->sparse) {
buffers = cs->slab_buffers;
num_buffers = cs->num_slab_buffers;
+ } else {
+ buffers = cs->sparse_buffers;
+ num_buffers = cs->num_sparse_buffers;
}
/* not found or found */
if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
return i;
/* Hash collision, look for the BO in the list of buffers linearly. */
for (i = num_buffers - 1; i >= 0; i--) {
if (buffers[i].bo == bo) {
/* Put this buffer in the hash list.
@@ -418,20 +421,77 @@ static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
buffer->u.slab.real_idx = real_idx;
p_atomic_inc(&bo->num_cs_references);
cs->num_slab_buffers++;
hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
cs->buffer_indices_hashlist[hash] = idx;
return idx;
}
+static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs,
+ struct amdgpu_winsys_bo *bo)
+{
+ struct amdgpu_cs_context *cs = acs->csc;
+ struct amdgpu_cs_buffer *buffer;
+ unsigned hash;
+ int idx = amdgpu_lookup_buffer(cs, bo);
+
+ if (idx >= 0)
+ return idx;
+
+ /* New buffer, check if the backing array is large enough. */
+ if (cs->num_sparse_buffers >= cs->max_sparse_buffers) {
+ unsigned new_max =
+ MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3));
+ struct amdgpu_cs_buffer *new_buffers;
+
+ new_buffers = REALLOC(cs->sparse_buffers,
+ cs->max_sparse_buffers * sizeof(*new_buffers),
+ new_max * sizeof(*new_buffers));
+ if (!new_buffers) {
+ fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n");
+ return -1;
+ }
+
+ cs->max_sparse_buffers = new_max;
+ cs->sparse_buffers = new_buffers;
+ }
+
+ idx = cs->num_sparse_buffers;
+ buffer = &cs->sparse_buffers[idx];
+
+ memset(buffer, 0, sizeof(*buffer));
+ amdgpu_winsys_bo_reference(&buffer->bo, bo);
+ p_atomic_inc(&bo->num_cs_references);
+ cs->num_sparse_buffers++;
+
+ hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
+ cs->buffer_indices_hashlist[hash] = idx;
+
+ /* We delay adding the backing buffers until we really have to. However,
+ * we cannot delay accounting for memory use.
+ */
+ mtx_lock(&bo->u.sparse.commit_lock);
+
+ list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+ if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+ acs->main.base.used_vram += backing->bo->base.size;
+ else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+ acs->main.base.used_gart += backing->bo->base.size;
+ }
+
+ mtx_unlock(&bo->u.sparse.commit_lock);
+
+ return idx;
+}
+
static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
enum radeon_bo_priority priority)
{
/* Don't use the "domains" parameter. Amdgpu doesn't support changing
* the buffer placement during command submission.
*/
struct amdgpu_cs *acs = amdgpu_cs(rcs);
@@ -442,39 +502,49 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
/* Fast exit for no-op calls.
* This is very effective with suballocators and linear uploaders that
* are outside of the winsys.
*/
if (bo == cs->last_added_bo &&
(usage & cs->last_added_bo_usage) == usage &&
(1ull << priority) & cs->last_added_bo_priority_usage)
return cs->last_added_bo_index;
- if (!bo->bo) {
- index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
- if (index < 0)
- return 0;
+ if (!bo->sparse) {
+ if (!bo->bo) {
+ index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
+ if (index < 0)
+ return 0;
- buffer = &cs->slab_buffers[index];
- buffer->usage |= usage;
+ buffer = &cs->slab_buffers[index];
+ buffer->usage |= usage;
- usage &= ~RADEON_USAGE_SYNCHRONIZED;
- index = buffer->u.slab.real_idx;
+ usage &= ~RADEON_USAGE_SYNCHRONIZED;
+ index = buffer->u.slab.real_idx;
+ } else {
+ index = amdgpu_lookup_or_add_real_buffer(acs, bo);
+ if (index < 0)
+ return 0;
+ }
+
+ buffer = &cs->real_buffers[index];
+ buffer->u.real.priority_usage |= 1llu << priority;
+ buffer->usage |= usage;
} else {
- index = amdgpu_lookup_or_add_real_buffer(acs, bo);
+ index = amdgpu_lookup_or_add_sparse_buffer(acs, bo);
if (index < 0)
return 0;
- }
- buffer = &cs->real_buffers[index];
- buffer->u.real.priority_usage |= 1llu << priority;
- buffer->usage |= usage;
+ buffer = &cs->sparse_buffers[index];
+ buffer->usage |= usage;
+ buffer->u.real.priority_usage |= 1llu << priority;
+ }
cs->last_added_bo = bo;
cs->last_added_bo_index = index;
cs->last_added_bo_usage = buffer->usage;
cs->last_added_bo_priority_usage = buffer->u.real.priority_usage;
return index;
}
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
{
@@ -671,38 +741,44 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
unsigned i;
for (i = 0; i < cs->num_real_buffers; i++) {
p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references);
amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
}
for (i = 0; i < cs->num_slab_buffers; i++) {
p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
}
+ for (i = 0; i < cs->num_sparse_buffers; i++) {
+ p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
+ amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
+ }
cs->num_real_buffers = 0;
cs->num_slab_buffers = 0;
+ cs->num_sparse_buffers = 0;
amdgpu_fence_reference(&cs->fence, NULL);
for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
cs->buffer_indices_hashlist[i] = -1;
}
cs->last_added_bo = NULL;
}
static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
{
amdgpu_cs_context_cleanup(cs);
FREE(cs->flags);
FREE(cs->real_buffers);
FREE(cs->handles);
FREE(cs->slab_buffers);
+ FREE(cs->sparse_buffers);
FREE(cs->request.dependencies);
}
static struct radeon_winsys_cs *
amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
void *flush_ctx)
@@ -1011,20 +1087,56 @@ static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs,
* rings automatically, we have to add fence dependencies manually.
*/
static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
{
struct amdgpu_cs_context *cs = acs->csc;
cs->request.number_of_dependencies = 0;
amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
+ amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
+}
+
+/* Add backing of sparse buffers to the buffer list.
+ *
+ * This is done late, during submission, to keep the buffer list short before
+ * submit, and to avoid managing fences for the backing buffers.
+ */
+static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
+{
+ for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) {
+ struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i];
+ struct amdgpu_winsys_bo *bo = buffer->bo;
+
+ mtx_lock(&bo->u.sparse.commit_lock);
+
+ list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+ /* We can directly add the buffer here, because we know that each
+ * backing buffer occurs only once.
+ */
+ int idx = amdgpu_do_add_real_buffer(cs, backing->bo);
+ if (idx < 0) {
+ fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__);
+ mtx_unlock(&bo->u.sparse.commit_lock);
+ return false;
+ }
+
+ cs->real_buffers[idx].usage = buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
+ cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage;
+ p_atomic_inc(&backing->bo->num_active_ioctls);
+ }
+
+ mtx_unlock(&bo->u.sparse.commit_lock);
+ }
+
+ return true;
}
void amdgpu_cs_submit_ib(void *job, int thread_index)
{
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
struct amdgpu_winsys *ws = acs->ctx->ws;
struct amdgpu_cs_context *cs = acs->cst;
int i, r;
cs->request.fence_info.handle = NULL;
@@ -1055,20 +1167,25 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
assert(num < ws->num_buffers);
handles[num++] = bo->bo;
}
r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
handles, NULL,
&cs->request.resources);
free(handles);
mtx_unlock(&ws->global_bo_list_lock);
} else {
+ if (!amdgpu_add_sparse_backing_buffers(cs)) {
+ r = -ENOMEM;
+ goto bo_list_error;
+ }
+
if (cs->max_real_submit < cs->num_real_buffers) {
FREE(cs->handles);
FREE(cs->flags);
cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers);
cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers);
if (!cs->handles || !cs->flags) {
cs->max_real_submit = 0;
r = -ENOMEM;
@@ -1129,20 +1246,22 @@ bo_list_error:
/* Cleanup. */
if (cs->request.resources)
amdgpu_bo_list_destroy(cs->request.resources);
cleanup:
for (i = 0; i < cs->num_real_buffers; i++)
p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
for (i = 0; i < cs->num_slab_buffers; i++)
p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
+ for (i = 0; i < cs->num_sparse_buffers; i++)
+ p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls);
amdgpu_cs_context_cleanup(cs);
}
/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
/* Wait for any pending ioctl of this CS to complete. */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 242410f..d700b8c 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -87,20 +87,24 @@ struct amdgpu_cs_context {
struct amdgpu_cs_buffer *real_buffers;
unsigned max_real_submit;
amdgpu_bo_handle *handles;
uint8_t *flags;
unsigned num_slab_buffers;
unsigned max_slab_buffers;
struct amdgpu_cs_buffer *slab_buffers;
+ unsigned num_sparse_buffers;
+ unsigned max_sparse_buffers;
+ struct amdgpu_cs_buffer *sparse_buffers;
+
int buffer_indices_hashlist[4096];
struct amdgpu_winsys_bo *last_added_bo;
unsigned last_added_bo_index;
unsigned last_added_bo_usage;
uint64_t last_added_bo_priority_usage;
unsigned max_dependencies;
struct pipe_fence_handle *fence;
@@ -219,22 +223,23 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
int index;
struct amdgpu_cs_buffer *buffer;
if (!bo->num_cs_references)
return false;
index = amdgpu_lookup_buffer(cs->csc, bo);
if (index == -1)
return false;
- buffer = bo->bo ? &cs->csc->real_buffers[index]
- : &cs->csc->slab_buffers[index];
+ buffer = bo->bo ? &cs->csc->real_buffers[index] :
+ bo->sparse ? &cs->csc->sparse_buffers[index] :
+ &cs->csc->slab_buffers[index];
return (buffer->usage & usage) != 0;
}
static inline bool
amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
{
return bo->num_cs_references != 0;
}
--
2.9.3