[Mesa-dev] [PATCH] winsys/amdgpu: fix a race condition between fence updates and IB submissions
Marek Olšák
maraeo at gmail.com
Mon Jan 2 20:20:34 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
The CS thread is needed to ensure that fence updates and IB submissions happen in the same order, so it can no longer be disabled (not without complicating the code).
Discovered by Nine CSMT, which ended up in a deadlock.
---
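For context (not part of the patch): a minimal, self-contained sketch of the invariant this change enforces, written with plain pthreads instead of Mesa's pipe_mutex/util_queue wrappers. The names (flush_cs, submit_queue, next_fence) are illustrative stand-ins only. The point is that recording fence updates and queueing the submission happen under one lock, so the queue order always matches the fence order; releasing the lock before queueing (as the old code did) lets another thread's flush slip in between.

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t fence_lock = PTHREAD_MUTEX_INITIALIZER;
  static unsigned next_fence;       /* stands in for fence creation/updates */
  static unsigned submit_queue[64]; /* stands in for ws->cs_queue           */
  static unsigned num_queued;

  static void flush_cs(void)
  {
     pthread_mutex_lock(&fence_lock);
     unsigned fence = next_fence++;      /* fence dependency updates   */
     submit_queue[num_queued++] = fence; /* queue the submission       */
     pthread_mutex_unlock(&fence_lock);  /* unlock only after queueing */
  }

  static void *worker(void *arg)
  {
     for (int i = 0; i < 8; i++)
        flush_cs();
     return arg;
  }

  int main(void)
  {
     pthread_t a, b;
     pthread_create(&a, NULL, worker, NULL);
     pthread_create(&b, NULL, worker, NULL);
     pthread_join(a, NULL);
     pthread_join(b, NULL);
     /* With the lock held across both steps, this is always increasing. */
     for (unsigned i = 0; i < num_queued; i++)
        printf("%u ", submit_queue[i]);
     printf("\n");
     return 0;
  }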
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 31 +++++++++++++++------------
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 9 ++++----
2 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 95402bf..87246f7 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1060,25 +1060,23 @@ cleanup:
for (i = 0; i < cs->num_slab_buffers; i++)
p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
amdgpu_cs_context_cleanup(cs);
}
/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
- struct amdgpu_winsys *ws = cs->ctx->ws;
/* Wait for any pending ioctl of this CS to complete. */
- if (util_queue_is_initialized(&ws->cs_queue))
- util_queue_job_wait(&cs->flush_completed);
+ util_queue_job_wait(&cs->flush_completed);
}
static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
unsigned flags,
struct pipe_fence_handle **fence)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ctx->ws;
int error_code = 0;
@@ -1150,53 +1148,58 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
cs->next_fence = NULL;
} else {
cur->fence = amdgpu_fence_create(cs->ctx,
cur->request.ip_type,
cur->request.ip_instance,
cur->request.ring);
}
if (fence)
amdgpu_fence_reference(fence, cur->fence);
- /* Prepare buffers. */
+ amdgpu_cs_sync_flush(rcs);
+
+ /* Prepare buffers.
+ *
+ * The fence lock must be held until the submission is queued to ensure
+ * that the order of fence dependency updates matches the order of
+ * submissions.
+ */
pipe_mutex_lock(ws->bo_fence_lock);
amdgpu_add_fence_dependencies(cs);
num_buffers = cur->num_real_buffers;
for (i = 0; i < num_buffers; i++) {
struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo;
p_atomic_inc(&bo->num_active_ioctls);
amdgpu_add_fence(bo, cur->fence);
}
num_buffers = cur->num_slab_buffers;
for (i = 0; i < num_buffers; i++) {
struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo;
p_atomic_inc(&bo->num_active_ioctls);
amdgpu_add_fence(bo, cur->fence);
}
- pipe_mutex_unlock(ws->bo_fence_lock);
-
- amdgpu_cs_sync_flush(rcs);
/* Swap command streams. "cst" is going to be submitted. */
cs->csc = cs->cst;
cs->cst = cur;
/* Submit. */
- if ((flags & RADEON_FLUSH_ASYNC) &&
- util_queue_is_initialized(&ws->cs_queue)) {
- util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
- amdgpu_cs_submit_ib, NULL);
- } else {
- amdgpu_cs_submit_ib(cs, 0);
- error_code = cs->cst->error_code;
+ util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
+ amdgpu_cs_submit_ib, NULL);
+ /* The submission has been queued; drop the fence lock now. */
+ pipe_mutex_unlock(ws->bo_fence_lock);
+
+ if (!(flags & RADEON_FLUSH_ASYNC)) {
+ amdgpu_cs_sync_flush(rcs);
+ error_code = cur->error_code;
}
} else {
amdgpu_cs_context_cleanup(cs->csc);
}
amdgpu_get_new_ib(&ws->base, cs, IB_MAIN);
if (cs->const_ib.ib_mapped)
amdgpu_get_new_ib(&ws->base, cs, IB_CONST);
if (cs->const_preamble_ib.ib_mapped)
amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index b950d37..e944e62 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -471,22 +471,20 @@ static unsigned hash_dev(void *key)
#else
return pointer_to_intptr(key);
#endif
}
static int compare_dev(void *key1, void *key2)
{
return key1 != key2;
}
-DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true)
-
static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
{
struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
bool destroy;
/* When the reference counter drops to zero, remove the device pointer
* from the table.
* This must happen while the mutex is locked, so that
* amdgpu_winsys_create in another thread doesn't get the winsys
* from the table when the counter drops to 0. */
@@ -577,22 +575,25 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
ws->base.read_registers = amdgpu_read_registers;
amdgpu_bo_init_functions(ws);
amdgpu_cs_init_functions(ws);
amdgpu_surface_init_functions(ws);
LIST_INITHEAD(&ws->global_bo_list);
pipe_mutex_init(ws->global_bo_list_lock);
pipe_mutex_init(ws->bo_fence_lock);
- if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread())
- util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1);
+ if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1)) {
+ amdgpu_winsys_destroy(&ws->base);
+ pipe_mutex_unlock(dev_tab_mutex);
+ return NULL;
+ }
/* Create the screen at the end. The winsys must be initialized
* completely.
*
* Alternatively, we could create the screen based on "ws->gen"
* and link all drivers into one binary blob. */
ws->base.screen = screen_create(&ws->base);
if (!ws->base.screen) {
amdgpu_winsys_destroy(&ws->base);
pipe_mutex_unlock(dev_tab_mutex);
--
2.7.4