[Mesa-dev] [PATCH] winsys/amdgpu: fix a race condition between fence updates and IB submissions
Nicolai Hähnle
nhaehnle at gmail.com
Fri Jan 6 12:02:38 UTC 2017
On 02.01.2017 21:20, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> The CS thread is needed to ensure proper ordering of operations and can't
> be disabled (without complicating the code).
>
> Discovered by Nine CSMT, which ended up in a deadlock.
I'm curious why the thread makes a difference for the deadlock. Why
isn't it enough in the un-threaded case to extend the scope of the
ws->bo_fence_lock to cover the submit ioctl call?
Then again, I'm happy with simplifying the code to eliminate the
un-threaded path, so...
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> ---
> src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 31 +++++++++++++++------------
> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 9 ++++----
> 2 files changed, 22 insertions(+), 18 deletions(-)
>
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index 95402bf..87246f7 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -1060,25 +1060,23 @@ cleanup:
> for (i = 0; i < cs->num_slab_buffers; i++)
> p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
>
> amdgpu_cs_context_cleanup(cs);
> }
>
> /* Make sure the previous submission is completed. */
> void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
> {
> struct amdgpu_cs *cs = amdgpu_cs(rcs);
> - struct amdgpu_winsys *ws = cs->ctx->ws;
>
> /* Wait for any pending ioctl of this CS to complete. */
> - if (util_queue_is_initialized(&ws->cs_queue))
> - util_queue_job_wait(&cs->flush_completed);
> + util_queue_job_wait(&cs->flush_completed);
> }
>
> static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
> unsigned flags,
> struct pipe_fence_handle **fence)
> {
> struct amdgpu_cs *cs = amdgpu_cs(rcs);
> struct amdgpu_winsys *ws = cs->ctx->ws;
> int error_code = 0;
>
> @@ -1150,53 +1148,58 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
> cs->next_fence = NULL;
> } else {
> cur->fence = amdgpu_fence_create(cs->ctx,
> cur->request.ip_type,
> cur->request.ip_instance,
> cur->request.ring);
> }
> if (fence)
> amdgpu_fence_reference(fence, cur->fence);
>
> - /* Prepare buffers. */
> + amdgpu_cs_sync_flush(rcs);
> +
> + /* Prepare buffers.
> + *
> + * This lock must be held until the submission is queued to ensure
> + * that the order of fence dependency updates matches the order of
> + * submissions.
> + */
> pipe_mutex_lock(ws->bo_fence_lock);
> amdgpu_add_fence_dependencies(cs);
>
> num_buffers = cur->num_real_buffers;
> for (i = 0; i < num_buffers; i++) {
> struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo;
> p_atomic_inc(&bo->num_active_ioctls);
> amdgpu_add_fence(bo, cur->fence);
> }
>
> num_buffers = cur->num_slab_buffers;
> for (i = 0; i < num_buffers; i++) {
> struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo;
> p_atomic_inc(&bo->num_active_ioctls);
> amdgpu_add_fence(bo, cur->fence);
> }
> - pipe_mutex_unlock(ws->bo_fence_lock);
> -
> - amdgpu_cs_sync_flush(rcs);
>
> /* Swap command streams. "cst" is going to be submitted. */
> cs->csc = cs->cst;
> cs->cst = cur;
>
> /* Submit. */
> - if ((flags & RADEON_FLUSH_ASYNC) &&
> - util_queue_is_initialized(&ws->cs_queue)) {
> - util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
> - amdgpu_cs_submit_ib, NULL);
> - } else {
> - amdgpu_cs_submit_ib(cs, 0);
> - error_code = cs->cst->error_code;
> + util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
> + amdgpu_cs_submit_ib, NULL);
> + /* The submission has been queued, unlock the mutex now. */
> + pipe_mutex_unlock(ws->bo_fence_lock);
> +
> + if (!(flags & RADEON_FLUSH_ASYNC)) {
> + amdgpu_cs_sync_flush(rcs);
> + error_code = cur->error_code;
> }
> } else {
> amdgpu_cs_context_cleanup(cs->csc);
> }
>
> amdgpu_get_new_ib(&ws->base, cs, IB_MAIN);
> if (cs->const_ib.ib_mapped)
> amdgpu_get_new_ib(&ws->base, cs, IB_CONST);
> if (cs->const_preamble_ib.ib_mapped)
> amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE);
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> index b950d37..e944e62 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> @@ -471,22 +471,20 @@ static unsigned hash_dev(void *key)
> #else
> return pointer_to_intptr(key);
> #endif
> }
>
> static int compare_dev(void *key1, void *key2)
> {
> return key1 != key2;
> }
>
> -DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true)
> -
> static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
> {
> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
> bool destroy;
>
> /* When the reference counter drops to zero, remove the device pointer
> * from the table.
> * This must happen while the mutex is locked, so that
> * amdgpu_winsys_create in another thread doesn't get the winsys
> * from the table when the counter drops to 0. */
> @@ -577,22 +575,25 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
> ws->base.read_registers = amdgpu_read_registers;
>
> amdgpu_bo_init_functions(ws);
> amdgpu_cs_init_functions(ws);
> amdgpu_surface_init_functions(ws);
>
> LIST_INITHEAD(&ws->global_bo_list);
> pipe_mutex_init(ws->global_bo_list_lock);
> pipe_mutex_init(ws->bo_fence_lock);
>
> - if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread())
> - util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1);
> + if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1)) {
> + amdgpu_winsys_destroy(&ws->base);
> + pipe_mutex_unlock(dev_tab_mutex);
> + return NULL;
> + }
>
> /* Create the screen at the end. The winsys must be initialized
> * completely.
> *
> * Alternatively, we could create the screen based on "ws->gen"
> * and link all drivers into one binary blob. */
> ws->base.screen = screen_create(&ws->base);
> if (!ws->base.screen) {
> amdgpu_winsys_destroy(&ws->base);
> pipe_mutex_unlock(dev_tab_mutex);
>
More information about the mesa-dev
mailing list