[Mesa-dev] [PATCH] winsys/amdgpu: fix a deadlock when waiting for submission_in_progress

Tue Jun 20 10:43:26 UTC 2017

On 19.06.2017 20:53, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> First this happens:
> 
> 1) amdgpu_cs_flush (lock bo_fence_lock)
>     -> amdgpu_add_fence_dependency
>     -> os_wait_until_zero (wait for submission_in_progress) - WAITING
> 
> 2) amdgpu_bo_create
>     -> pb_cache_reclaim_buffer (lock pb_cache::mutex)
>     -> pb_cache_is_buffer_compat
>     -> amdgpu_bo_wait (lock bo_fence_lock) - WAITING
> 
> So both bo_fence_lock and pb_cache::mutex are held. amdgpu_bo_create can't
> continue. amdgpu_cs_flush is waiting for the CS ioctl to finish the job,
> but the CS ioctl is trying to release a buffer:
> 
> 3) amdgpu_cs_submit_ib (CS thread - job entrypoint)
>     -> amdgpu_cs_context_cleanup
>     -> pb_reference
>     -> pb_destroy
>     -> amdgpu_bo_destroy_or_cache
>     -> pb_cache_add_buffer (lock pb_cache::mutex) - DEADLOCK
>
> The simple solution is not to wait for submission_in_progress, which we
> need in order to create the list of dependencies for the CS ioctl. Instead
> of building the list of dependencies as a direct input to the CS ioctl,
> build the list of dependencies as a list of fences, and make the final list
> of dependencies in the CS thread itself.
> 
> Therefore, amdgpu_cs_flush doesn't have to wait and can continue.
> Then, amdgpu_bo_create can continue and return. And then amdgpu_cs_submit_ib
> can continue.

Nice find! And I agree that this is the best solution. In addition to 
fixing the deadlock, it even gets rid of a busy-loop.

> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101294

Cc: 17.1 <mesa-stable at lists.freedesktop.org>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> ---
>   src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 55 ++++++++++++++++++++++---------
>   src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  4 ++-
>   2 files changed, 43 insertions(+), 16 deletions(-)
> 
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index a0ef826..c88be06 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -745,39 +745,42 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
>         amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
>      }
>      for (i = 0; i < cs->num_slab_buffers; i++) {
>         p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
>         amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
>      }
>      for (i = 0; i < cs->num_sparse_buffers; i++) {
>         p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
>         amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
>      }
> +   for (i = 0; i < cs->num_fence_dependencies; i++)
> +      amdgpu_fence_reference(&cs->fence_dependencies[i], NULL);
>   
>      cs->num_real_buffers = 0;
>      cs->num_slab_buffers = 0;
>      cs->num_sparse_buffers = 0;
> +   cs->num_fence_dependencies = 0;
>      amdgpu_fence_reference(&cs->fence, NULL);
>   
>      memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
>      cs->last_added_bo = NULL;
>   }
>   
>   static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
>   {
>      amdgpu_cs_context_cleanup(cs);
>      FREE(cs->flags);
>      FREE(cs->real_buffers);
>      FREE(cs->handles);
>      FREE(cs->slab_buffers);
>      FREE(cs->sparse_buffers);
> -   FREE(cs->request.dependencies);
> +   FREE(cs->fence_dependencies);
>   }
>   
>   
>   static struct radeon_winsys_cs *
>   amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
>                    enum ring_type ring_type,
>                    void (*flush)(void *ctx, unsigned flags,
>                                  struct pipe_fence_handle **fence),
>                    void *flush_ctx)
>   {
> @@ -974,21 +977,20 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
>       return cs->num_real_buffers;
>   }
>   
>   DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
>   
>   static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
>                                           struct amdgpu_cs_buffer *buffer)
>   {
>      struct amdgpu_cs_context *cs = acs->csc;
>      struct amdgpu_winsys_bo *bo = buffer->bo;
> -   struct amdgpu_cs_fence *dep;
>      unsigned new_num_fences = 0;
>   
>      for (unsigned j = 0; j < bo->num_fences; ++j) {
>         struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
>         unsigned idx;
>   
>         if (bo_fence->ctx == acs->ctx &&
>            bo_fence->fence.ip_type == cs->request.ip_type &&
>            bo_fence->fence.ip_instance == cs->request.ip_instance &&
>            bo_fence->fence.ring == cs->request.ring)
> @@ -996,35 +998,35 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
>   
>         if (amdgpu_fence_wait((void *)bo_fence, 0, false))
>            continue;
>   
>         amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
>         new_num_fences++;
>   
>         if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
>            continue;
>   
> -      if (bo_fence->submission_in_progress)
> -         os_wait_until_zero(&bo_fence->submission_in_progress,
> -                            PIPE_TIMEOUT_INFINITE);
> -
> -      idx = cs->request.number_of_dependencies++;
> -      if (idx >= cs->max_dependencies) {
> +      idx = cs->num_fence_dependencies++;
> +      if (idx >= cs->max_fence_dependencies) {
>            unsigned size;
> -
> -         cs->max_dependencies = idx + 8;
> -         size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
> -         cs->request.dependencies = realloc(cs->request.dependencies, size);
> +         const unsigned increment = 8;
> +
> +         cs->max_fence_dependencies = idx + increment;
> +         size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]);
> +         cs->fence_dependencies = realloc(cs->fence_dependencies, size);
> +         /* Clear the newly-allocated elements. */
> +         memset(cs->fence_dependencies + idx, 0,
> +                increment * sizeof(cs->fence_dependencies[0]));
>         }
>   
> -      dep = &cs->request.dependencies[idx];
> -      memcpy(dep, &bo_fence->fence, sizeof(*dep));
> +      amdgpu_fence_reference(&cs->fence_dependencies[idx],
> +                             (struct pipe_fence_handle*)bo_fence);
>      }
>   
>      for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
>         amdgpu_fence_reference(&bo->fences[j], NULL);
>   
>      bo->num_fences = new_num_fences;
>   }
>   
>   /* Add the given list of fences to the buffer's fence list.
>    *
> @@ -1081,21 +1083,21 @@ static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs,
>      }
>   }
>   
>   /* Since the kernel driver doesn't synchronize execution between different
>    * rings automatically, we have to add fence dependencies manually.
>    */
>   static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
>   {
>      struct amdgpu_cs_context *cs = acs->csc;
>   
> -   cs->request.number_of_dependencies = 0;
> +   cs->num_fence_dependencies = 0;
>   
>      amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
>      amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
>      amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
>   }
>   
>   /* Add backing of sparse buffers to the buffer list.
>    *
>    * This is done late, during submission, to keep the buffer list short before
>    * submit, and to avoid managing fences for the backing buffers.
> @@ -1129,21 +1131,44 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
>   
>      return true;
>   }
>   
>   void amdgpu_cs_submit_ib(void *job, int thread_index)
>   {
>      struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
>      struct amdgpu_winsys *ws = acs->ctx->ws;
>      struct amdgpu_cs_context *cs = acs->cst;
>      int i, r;
> +   struct amdgpu_cs_fence *dependencies = NULL;
> +
> +   /* Set dependencies (input fences). */
> +   if (cs->num_fence_dependencies) {
> +      dependencies = alloca(sizeof(dependencies[0]) *
> +                            cs->num_fence_dependencies);
> +      unsigned num = 0;
> +
> +      for (i = 0; i < cs->num_fence_dependencies; i++) {
> +         struct amdgpu_fence *fence =
> +            (struct amdgpu_fence*)cs->fence_dependencies[i];
> +
> +         /* Past fences can't be unsubmitted because we have only 1 CS thread. */
> +         assert(!fence->submission_in_progress);
> +         memcpy(&dependencies[num++], &fence->fence, sizeof(dependencies[0]));
> +      }
> +      cs->request.dependencies = dependencies;
> +      cs->request.number_of_dependencies = num;
> +   } else {
> +      cs->request.dependencies = NULL;
> +      cs->request.number_of_dependencies = 0;
> +   }
>   
> +   /* Set the output fence. */
>      cs->request.fence_info.handle = NULL;
>      if (amdgpu_cs_has_user_fence(cs)) {
>   	cs->request.fence_info.handle = acs->ctx->user_fence_bo;
>   	cs->request.fence_info.offset = acs->ring_type;
>      }
>   
>      /* Create the buffer list.
>       * Use a buffer list containing all allocated buffers if requested.
>       */
>      if (debug_get_option_all_bos()) {
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> index d700b8c..d83c1e0 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> @@ -98,21 +98,23 @@ struct amdgpu_cs_context {
>      unsigned                    max_sparse_buffers;
>      struct amdgpu_cs_buffer     *sparse_buffers;
>   
>      int                         buffer_indices_hashlist[4096];
>   
>      struct amdgpu_winsys_bo     *last_added_bo;
>      unsigned                    last_added_bo_index;
>      unsigned                    last_added_bo_usage;
>      uint64_t                    last_added_bo_priority_usage;
>   
> -   unsigned                    max_dependencies;
> +   struct pipe_fence_handle    **fence_dependencies;
> +   unsigned                    num_fence_dependencies;
> +   unsigned                    max_fence_dependencies;
>   
>      struct pipe_fence_handle    *fence;
>   
>      /* the error returned from cs_flush for non-async submissions */
>      int                         error_code;
>   };
>   
>   struct amdgpu_cs {
>      struct amdgpu_ib main; /* must be first because this is inherited */
>      struct amdgpu_ib const_ib; /* optional constant engine IB */
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.