[Mesa-dev] [PATCH 12/14] winsys/radeon: add fine-grained fences for slab buffers

Nicolai Hähnle nhaehnle at gmail.com
Tue Sep 13 09:56:23 UTC 2016


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

Note that the logic for adding fences is somewhat different from amdgpu's,
because radeon has no scheduler and we therefore have no guarantee about
the order in which submissions from multiple threads are processed.

(Ironically, this is only an issue when "multi-threaded submission" is
disabled, because "multi-threaded submission" actually means that all
submissions happen from a single thread that is separate from the
application's threads. If we only supported "multi-threaded submission",
the fence handling could be simplified by adding the fences in that
thread, where everything is serialized.)
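
For illustration, here is a minimal standalone sketch of the idea, using
hypothetical names (slab_buffer, fence_ref, buffer_add_fence) rather than
the actual winsys code: each slab buffer keeps a growable list of fences,
one per submission that might still touch it. Fences whose submission
ioctl has already returned (num_cs_references == 0 in the real code) are
pruned whenever a new fence is added, because the new submission is
queued in the kernel behind them and its fence will signal later.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct fence_ref {
    int refcount;
    bool ioctl_returned;   /* stands in for num_cs_references == 0 */
};

struct slab_buffer {
    struct fence_ref **fences;
    unsigned num_fences;
    unsigned max_fences;
};

static void buffer_add_fence(struct slab_buffer *bo, struct fence_ref *fence)
{
    /* Prune fences whose submission ioctl has already returned; the new
     * fence is guaranteed to signal after them. */
    unsigned dst = 0;
    for (unsigned src = 0; src < bo->num_fences; ++src) {
        if (!bo->fences[src]->ioctl_returned)
            bo->fences[dst++] = bo->fences[src];
        else
            bo->fences[src]->refcount--;
    }
    bo->num_fences = dst;

    /* Grow the array on demand, then append the new fence. */
    if (bo->num_fences >= bo->max_fences) {
        unsigned new_max = bo->max_fences + 1;
        struct fence_ref **tmp = realloc(bo->fences, new_max * sizeof(*tmp));
        if (!tmp)
            return;   /* drop the fence on allocation failure */
        bo->fences = tmp;
        bo->max_fences = new_max;
    }
    fence->refcount++;
    bo->fences[bo->num_fences++] = fence;
}

int main(void)
{
    struct slab_buffer bo = {0};
    struct fence_ref f1 = {1, false}, f2 = {1, false};

    buffer_add_fence(&bo, &f1);
    f1.ioctl_returned = true;     /* submission 1 returned from its ioctl */
    buffer_add_fence(&bo, &f2);   /* f1 gets pruned here */
    printf("fences kept: %u\n", bo.num_fences);   /* prints 1 */
    free(bo.fences);
    return 0;
}
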
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c     | 55 ++++++++++++++-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h     |  4 ++
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c     | 86 ++++++++++++++++++++---
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c |  2 +
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.h |  1 +
 5 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1725080..4b7dbdc 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -46,38 +46,89 @@ static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo)
 {
     return (struct radeon_bo *)bo;
 }
 
 struct radeon_bo_va_hole {
     struct list_head list;
     uint64_t         offset;
     uint64_t         size;
 };
 
-static bool radeon_bo_is_busy(struct radeon_bo *bo)
+static bool radeon_real_bo_is_busy(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_busy args = {0};
 
     args.handle = bo->handle;
     return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
                                &args, sizeof(args)) != 0;
 }
 
-static void radeon_bo_wait_idle(struct radeon_bo *bo)
+static bool radeon_bo_is_busy(struct radeon_bo *bo)
+{
+    unsigned num_idle;
+    bool busy = false;
+
+    if (bo->handle)
+        return radeon_real_bo_is_busy(bo);
+
+    pipe_mutex_lock(bo->rws->bo_fence_lock);
+    for (num_idle = 0; num_idle < bo->u.slab.num_fences; ++num_idle) {
+        if (radeon_real_bo_is_busy(bo->u.slab.fences[num_idle])) {
+            busy = true;
+            break;
+        }
+        radeon_bo_reference(&bo->u.slab.fences[num_idle], NULL);
+    }
+    memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[num_idle],
+            (bo->u.slab.num_fences - num_idle) * sizeof(bo->u.slab.fences[0]));
+    bo->u.slab.num_fences -= num_idle;
+    pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+    return busy;
+}
+
+static void radeon_real_bo_wait_idle(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_wait_idle args = {0};
 
     args.handle = bo->handle;
     while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
                            &args, sizeof(args)) == -EBUSY);
 }
 
+static void radeon_bo_wait_idle(struct radeon_bo *bo)
+{
+    if (bo->handle) {
+        radeon_real_bo_wait_idle(bo);
+    } else {
+        pipe_mutex_lock(bo->rws->bo_fence_lock);
+        while (bo->u.slab.num_fences) {
+            struct radeon_bo *fence = NULL;
+            radeon_bo_reference(&fence, bo->u.slab.fences[0]);
+            pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+            /* Wait without holding the fence lock. */
+            radeon_real_bo_wait_idle(fence);
+
+            pipe_mutex_lock(bo->rws->bo_fence_lock);
+            if (bo->u.slab.num_fences && fence == bo->u.slab.fences[0]) {
+                radeon_bo_reference(&bo->u.slab.fences[0], NULL);
+                memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[1],
+                        (bo->u.slab.num_fences - 1) * sizeof(bo->u.slab.fences[0]));
+                bo->u.slab.num_fences--;
+            }
+            radeon_bo_reference(&fence, NULL);
+        }
+        pipe_mutex_unlock(bo->rws->bo_fence_lock);
+    }
+}
+
 static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
     struct radeon_bo *bo = radeon_bo(_buf);
     int64_t abs_timeout;
 
     /* No timeout. Just query. */
     if (timeout == 0)
         return !bo->num_active_ioctls && !radeon_bo_is_busy(bo);
 
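A note on the slab path of radeon_bo_wait_idle above: it pins the head
fence, drops bo_fence_lock while waiting, and re-checks the list head
after reacquiring the lock, because another thread may have popped the
fence in the meantime. A distilled, compilable sketch of that pattern,
with stand-in names (fence_wait, wait_all_fences) rather than the real
winsys API:

#include <pthread.h>
#include <string.h>

struct fence { int refcount; };

static void fence_wait(struct fence *f)
{
    (void)f;   /* stand-in for the blocking GEM_WAIT_IDLE ioctl */
}

static struct fence *fences[8];
static unsigned num_fences;
static pthread_mutex_t fence_lock = PTHREAD_MUTEX_INITIALIZER;

static void wait_all_fences(void)
{
    pthread_mutex_lock(&fence_lock);
    while (num_fences) {
        struct fence *fence = fences[0];
        fence->refcount++;                 /* pin the head fence */
        pthread_mutex_unlock(&fence_lock);

        fence_wait(fence);                 /* wait with the lock dropped */

        pthread_mutex_lock(&fence_lock);
        /* Pop it only if no other thread removed it while we waited. */
        if (num_fences && fences[0] == fence) {
            num_fences--;
            memmove(&fences[0], &fences[1],
                    num_fences * sizeof(fences[0]));
        }
        fence->refcount--;                 /* drop our pin */
    }
    pthread_mutex_unlock(&fence_lock);
}

int main(void)
{
    wait_all_fences();   /* trivially returns with an empty list */
    return 0;
}
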
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index 8e35a38..8f767fd 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -43,20 +43,24 @@ struct radeon_bo {
             struct pb_cache_entry cache_entry;
 
             void *ptr;
             pipe_mutex map_mutex;
             unsigned map_count;
             bool use_reusable_pool;
         } real;
         struct {
             struct pb_slab_entry entry;
             struct radeon_bo *real;
+
+            unsigned num_fences;
+            unsigned max_fences;
+            struct radeon_bo **fences;
         } slab;
     } u;
 
     struct radeon_drm_winsys *rws;
     void *user_ptr; /* from buffer_from_ptr */
 
     uint32_t handle; /* 0 for slab entries */
     uint32_t flink_name;
     uint64_t va;
     uint32_t hash;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 9fbd378..79c09e2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -464,41 +464,93 @@ void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
                 fprintf(stderr, "0x%08X\n", csc->buf[i]);
             }
         } else {
             fprintf(stderr, "radeon: The kernel rejected CS, "
                     "see dmesg for more information (%i).\n", r);
         }
     }
 
     for (i = 0; i < csc->num_relocs; i++)
         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
+    for (i = 0; i < csc->num_slab_buffers; i++)
+        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
 
     radeon_cs_context_cleanup(csc);
 }
 
 /*
 * Make sure previous submissions of this cs are completed
  */
 void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
     /* Wait for any pending ioctl of this CS to complete. */
     if (util_queue_is_initialized(&cs->ws->cs_queue))
         util_queue_job_wait(&cs->flush_completed);
 }
 
+/* Add the given fence to a slab buffer fence list.
+ *
+ * There is a potential race condition when a bo participates in submissions on
+ * two or more threads simultaneously. Since we do not know which of the
+ * submissions will be sent to the GPU first, we have to keep the fences
+ * of all submissions.
+ *
+ * However, fences that belong to submissions that have already returned from
+ * their respective ioctl do not have to be kept, because we know that they
+ * will signal earlier.
+ */
+static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
+{
+    unsigned dst;
+
+    assert(fence->num_cs_references);
+
+    /* Clean up old fences */
+    dst = 0;
+    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
+        if (bo->u.slab.fences[src]->num_cs_references) {
+            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
+            dst++;
+        } else {
+            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
+        }
+    }
+    bo->u.slab.num_fences = dst;
+
+    /* Check available space for the new fence */
+    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
+        unsigned new_max_fences = bo->u.slab.max_fences + 1;
+        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
+                                                bo->u.slab.max_fences * sizeof(*new_fences),
+                                                new_max_fences * sizeof(*new_fences));
+        if (!new_fences) {
+            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
+            return;
+        }
+
+        bo->u.slab.fences = new_fences;
+        bo->u.slab.max_fences = new_max_fences;
+    }
+
+    /* Add the new fence */
+    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
+    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
+    bo->u.slab.num_fences++;
+}
+
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
 static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                unsigned flags,
-                               struct pipe_fence_handle **fence)
+                               struct pipe_fence_handle **pfence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
 
     switch (cs->ring_type) {
     case RING_DMA:
         /* pad DMA ring to 8 DWs */
         if (cs->ws->info.chip_class <= SI) {
             while (rcs->current.cdw & 7)
                 radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
@@ -524,29 +576,45 @@ static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
             radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
         break;
     default:
         break;
     }
 
     if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
-    if (fence) {
-       if (cs->next_fence) {
-          radeon_fence_reference(fence, cs->next_fence);
-       } else {
-          radeon_fence_reference(fence, NULL);
-          *fence = radeon_cs_create_fence(rcs);
-       }
+    if (pfence || cs->csc->num_slab_buffers) {
+        struct pipe_fence_handle *fence;
+
+        if (cs->next_fence) {
+            fence = cs->next_fence;
+            cs->next_fence = NULL;
+        } else {
+            fence = radeon_cs_create_fence(rcs);
+        }
+
+        if (pfence)
+            radeon_fence_reference(pfence, fence);
+
+        pipe_mutex_lock(cs->ws->bo_fence_lock);
+        for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
+            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
+            p_atomic_inc(&bo->num_active_ioctls);
+            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
+        }
+        pipe_mutex_unlock(cs->ws->bo_fence_lock);
+
+        radeon_fence_reference(&fence, NULL);
+    } else {
+        radeon_fence_reference(&cs->next_fence, NULL);
     }
-    radeon_fence_reference(&cs->next_fence, NULL);
 
     radeon_drm_cs_sync_flush(rcs);
 
     /* Swap command streams. */
     tmp = cs->csc;
     cs->csc = cs->cst;
     cs->cst = tmp;
 
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
     if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
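
One more note on the ordering in the flush path above: num_active_ioctls
is raised and the fence attached under bo_fence_lock before the CS is
handed to the submission thread, so a concurrent radeon_bo_wait() either
sees a nonzero counter or finds the fence already on the list; it can
never miss an in-flight submission. A hypothetical condensed sketch of
just that step (attach_fence and flush_slab_fences are stand-ins, not
Mesa functions):

#include <pthread.h>
#include <stdatomic.h>

struct slab_bo { atomic_int num_active_ioctls; };
struct cs_fence { int dummy; };

static void attach_fence(struct slab_bo *bo, struct cs_fence *f)
{
    (void)bo; (void)f;   /* stand-in for radeon_bo_slab_fence() */
}

static void flush_slab_fences(struct slab_bo **bufs, unsigned n,
                              struct cs_fence *fence,
                              pthread_mutex_t *bo_fence_lock)
{
    pthread_mutex_lock(bo_fence_lock);
    for (unsigned i = 0; i < n; ++i) {
        /* Raise the counter before the CS is queued; it is dropped
         * again once the submission ioctl returns. */
        atomic_fetch_add(&bufs[i]->num_active_ioctls, 1);
        attach_fence(bufs[i], fence);
    }
    pthread_mutex_unlock(bo_fence_lock);
    /* ...the CS is handed to the submission thread only after this. */
}

int main(void)
{
    struct slab_bo bo = {0};
    struct slab_bo *bufs[] = { &bo };
    struct cs_fence f = {0};
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    flush_slab_fences(bufs, 1, &f, &lock);
    return 0;
}
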
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index aa4bf5f..e02f286 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -549,20 +549,21 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
 
     if (ws->gen >= DRV_R600) {
         radeon_surface_manager_free(ws->surf_man);
     }
 
     util_hash_table_destroy(ws->bo_names);
     util_hash_table_destroy(ws->bo_handles);
     util_hash_table_destroy(ws->bo_vas);
     pipe_mutex_destroy(ws->bo_handles_mutex);
     pipe_mutex_destroy(ws->bo_va_mutex);
+    pipe_mutex_destroy(ws->bo_fence_lock);
 
     if (ws->fd >= 0)
         close(ws->fd);
 
     FREE(rws);
 }
 
 static void radeon_query_info(struct radeon_winsys *rws,
                               struct radeon_info *info)
 {
@@ -780,20 +781,21 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     radeon_surface_init_functions(ws);
 
     pipe_mutex_init(ws->hyperz_owner_mutex);
     pipe_mutex_init(ws->cmask_owner_mutex);
 
     ws->bo_names = util_hash_table_create(handle_hash, handle_compare);
     ws->bo_handles = util_hash_table_create(handle_hash, handle_compare);
     ws->bo_vas = util_hash_table_create(handle_hash, handle_compare);
     pipe_mutex_init(ws->bo_handles_mutex);
     pipe_mutex_init(ws->bo_va_mutex);
+    pipe_mutex_init(ws->bo_fence_lock);
     ws->va_offset = ws->va_start;
     list_inithead(&ws->va_holes);
 
     /* TTM aligns the BO size to the CPU page size */
     ws->info.gart_page_size = sysconf(_SC_PAGESIZE);
 
     if (ws->num_cpus > 1 && debug_get_option_thread())
         util_queue_init(&ws->cs_queue, "radeon_cs", 8, 1);
 
     /* Create the screen at the end. The winsys must be initialized
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 5514980..b30055c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -84,20 +84,21 @@ struct radeon_drm_winsys {
     uint32_t accel_working2;
 
     /* List of buffer GEM names. Protected by bo_handles_mutex. */
     struct util_hash_table *bo_names;
    /* List of buffer handles. Protected by bo_handles_mutex. */
     struct util_hash_table *bo_handles;
    /* List of buffer virtual memory ranges. Protected by bo_handles_mutex. */
     struct util_hash_table *bo_vas;
     pipe_mutex bo_handles_mutex;
     pipe_mutex bo_va_mutex;
+    pipe_mutex bo_fence_lock;
 
     uint64_t va_offset;
     struct list_head va_holes;
     bool check_vm;
 
     struct radeon_surface_manager *surf_man;
 
     uint32_t num_cpus;      /* Number of CPUs. */
 
     struct radeon_drm_cs *hyperz_owner;
-- 
2.7.4


