[Mesa-dev] [PATCH] winsys/amdgpu: fix a deadlock when waiting for submission_in_progress
Marek Olšák
maraeo at gmail.com
Mon Jun 19 18:53:08 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
First this happens:
1) amdgpu_cs_flush (lock bo_fence_lock)
-> amdgpu_add_fence_dependency
-> os_wait_until_zero (wait for submission_in_progress) - WAITING
2) amdgpu_bo_create
-> pb_cache_reclaim_buffer (lock pb_cache::mutex)
-> pb_cache_is_buffer_compat
-> amdgpu_bo_wait (lock bo_fence_lock) - WAITING
So both bo_fence_lock and pb_cache::mutex are held. amdgpu_bo_create can't
continue. amdgpu_cs_flush is waiting for the CS ioctl to finish the job,
but the CS ioctl is trying to release a buffer:
3) amdgpu_cs_submit_ib (CS thread - job entrypoint)
-> amdgpu_cs_context_cleanup
-> pb_reference
-> pb_destroy
-> amdgpu_bo_destroy_or_cache
-> pb_cache_add_buffer (lock pb_cache::mutex) - DEADLOCK
The simple solution is not to wait for submission_in_progress, which we
need in order to create the list of dependencies for the CS ioctl. Instead
of building the list of dependencies as a direct input to the CS ioctl,
build the list of dependencies as a list of fences, and make the final list
of dependencies in the CS thread itself.
Therefore, amdgpu_cs_flush doesn't have to wait and can continue.
Then, amdgpu_bo_create can continue and return. And then amdgpu_cs_submit_ib
can continue.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101294
---
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 55 ++++++++++++++++++++++---------
src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 4 ++-
2 files changed, 43 insertions(+), 16 deletions(-)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a0ef826..c88be06 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -745,39 +745,42 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
}
for (i = 0; i < cs->num_slab_buffers; i++) {
p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
}
for (i = 0; i < cs->num_sparse_buffers; i++) {
p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
}
+ for (i = 0; i < cs->num_fence_dependencies; i++)
+ amdgpu_fence_reference(&cs->fence_dependencies[i], NULL);
cs->num_real_buffers = 0;
cs->num_slab_buffers = 0;
cs->num_sparse_buffers = 0;
+ cs->num_fence_dependencies = 0;
amdgpu_fence_reference(&cs->fence, NULL);
memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
cs->last_added_bo = NULL;
}
static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
{
amdgpu_cs_context_cleanup(cs);
FREE(cs->flags);
FREE(cs->real_buffers);
FREE(cs->handles);
FREE(cs->slab_buffers);
FREE(cs->sparse_buffers);
- FREE(cs->request.dependencies);
+ FREE(cs->fence_dependencies);
}
static struct radeon_winsys_cs *
amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
void *flush_ctx)
{
@@ -974,21 +977,20 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
return cs->num_real_buffers;
}
DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
struct amdgpu_cs_buffer *buffer)
{
struct amdgpu_cs_context *cs = acs->csc;
struct amdgpu_winsys_bo *bo = buffer->bo;
- struct amdgpu_cs_fence *dep;
unsigned new_num_fences = 0;
for (unsigned j = 0; j < bo->num_fences; ++j) {
struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
unsigned idx;
if (bo_fence->ctx == acs->ctx &&
bo_fence->fence.ip_type == cs->request.ip_type &&
bo_fence->fence.ip_instance == cs->request.ip_instance &&
bo_fence->fence.ring == cs->request.ring)
@@ -996,35 +998,35 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
if (amdgpu_fence_wait((void *)bo_fence, 0, false))
continue;
amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
new_num_fences++;
if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
continue;
- if (bo_fence->submission_in_progress)
- os_wait_until_zero(&bo_fence->submission_in_progress,
- PIPE_TIMEOUT_INFINITE);
-
- idx = cs->request.number_of_dependencies++;
- if (idx >= cs->max_dependencies) {
+ idx = cs->num_fence_dependencies++;
+ if (idx >= cs->max_fence_dependencies) {
unsigned size;
-
- cs->max_dependencies = idx + 8;
- size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
- cs->request.dependencies = realloc(cs->request.dependencies, size);
+ const unsigned increment = 8;
+
+ cs->max_fence_dependencies = idx + increment;
+ size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]);
+ cs->fence_dependencies = realloc(cs->fence_dependencies, size);
+ /* Clear the newly-allocated elements. */
+ memset(cs->fence_dependencies + idx, 0,
+ increment * sizeof(cs->fence_dependencies[0]));
}
- dep = &cs->request.dependencies[idx];
- memcpy(dep, &bo_fence->fence, sizeof(*dep));
+ amdgpu_fence_reference(&cs->fence_dependencies[idx],
+ (struct pipe_fence_handle*)bo_fence);
}
for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
amdgpu_fence_reference(&bo->fences[j], NULL);
bo->num_fences = new_num_fences;
}
/* Add the given list of fences to the buffer's fence list.
*
@@ -1081,21 +1083,21 @@ static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs,
}
}
/* Since the kernel driver doesn't synchronize execution between different
* rings automatically, we have to add fence dependencies manually.
*/
static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
{
struct amdgpu_cs_context *cs = acs->csc;
- cs->request.number_of_dependencies = 0;
+ cs->num_fence_dependencies = 0;
amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
}
/* Add backing of sparse buffers to the buffer list.
*
* This is done late, during submission, to keep the buffer list short before
* submit, and to avoid managing fences for the backing buffers.
@@ -1129,21 +1131,44 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
return true;
}
void amdgpu_cs_submit_ib(void *job, int thread_index)
{
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
struct amdgpu_winsys *ws = acs->ctx->ws;
struct amdgpu_cs_context *cs = acs->cst;
int i, r;
+ struct amdgpu_cs_fence *dependencies = NULL;
+
+ /* Set dependencies (input fences). */
+ if (cs->num_fence_dependencies) {
+ dependencies = alloca(sizeof(dependencies[0]) *
+ cs->num_fence_dependencies);
+ unsigned num = 0;
+
+ for (i = 0; i < cs->num_fence_dependencies; i++) {
+ struct amdgpu_fence *fence =
+ (struct amdgpu_fence*)cs->fence_dependencies[i];
+
+ /* Past fences can't be unsubmitted because we have only 1 CS thread. */
+ assert(!fence->submission_in_progress);
+ memcpy(&dependencies[num++], &fence->fence, sizeof(dependencies[0]));
+ }
+ cs->request.dependencies = dependencies;
+ cs->request.number_of_dependencies = num;
+ } else {
+ cs->request.dependencies = NULL;
+ cs->request.number_of_dependencies = 0;
+ }
+ /* Set the output fence. */
cs->request.fence_info.handle = NULL;
if (amdgpu_cs_has_user_fence(cs)) {
cs->request.fence_info.handle = acs->ctx->user_fence_bo;
cs->request.fence_info.offset = acs->ring_type;
}
/* Create the buffer list.
* Use a buffer list containing all allocated buffers if requested.
*/
if (debug_get_option_all_bos()) {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index d700b8c..d83c1e0 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -98,21 +98,23 @@ struct amdgpu_cs_context {
unsigned max_sparse_buffers;
struct amdgpu_cs_buffer *sparse_buffers;
int buffer_indices_hashlist[4096];
struct amdgpu_winsys_bo *last_added_bo;
unsigned last_added_bo_index;
unsigned last_added_bo_usage;
uint64_t last_added_bo_priority_usage;
- unsigned max_dependencies;
+ struct pipe_fence_handle **fence_dependencies;
+ unsigned num_fence_dependencies;
+ unsigned max_fence_dependencies;
struct pipe_fence_handle *fence;
/* the error returned from cs_flush for non-async submissions */
int error_code;
};
struct amdgpu_cs {
struct amdgpu_ib main; /* must be first because this is inherited */
struct amdgpu_ib const_ib; /* optional constant engine IB */
--
2.7.4
More information about the mesa-dev
mailing list