[Mesa-dev] [PATCH 05/12] winsys/amdgpu: use only one fence per BO

Nicolai Hähnle nhaehnle at gmail.com
Fri Sep 9 17:34:07 UTC 2016


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

The fence that is added to the BO during flush is guaranteed to be
signaled after all the fences that were in the fences array of the BO
before the flush, because those fences are added as dependencies for the
submission (and all this happens atomically under the bo_fence_lock).

Therefore, keeping only the last fence around is sufficient.
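
In code terms, the flush path does the following under bo_fence_lock
(condensed from the amdgpu_cs_flush hunk below, with comments added for
illustration):

   pipe_mutex_lock(ws->bo_fence_lock);
   /* Each buffer's current bo->fence becomes a dependency of the new
    * submission... */
   amdgpu_add_fence_dependencies(cs);
   for (i = 0; i < num_buffers; i++) {
      p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
      /* ...so the new fence cannot signal before the old one, and
       * overwriting the old fence loses no synchronization information. */
      amdgpu_fence_reference(&cur->buffers[i].bo->fence, cur->fence);
   }
   pipe_mutex_unlock(ws->bo_fence_lock);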
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 54 +++++++++++--------------
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  4 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 66 +++++++++++++++----------------
 3 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 32df0be..a6d4aa4 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -37,21 +37,20 @@
 #include <xf86drm.h>
 #include <stdio.h>
 #include <inttypes.h>
 
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_winsys *ws = bo->ws;
    int64_t abs_timeout;
-   int i;
 
    if (timeout == 0) {
       if (p_atomic_read(&bo->num_active_ioctls))
          return false;
 
    } else {
       abs_timeout = os_time_get_absolute_timeout(timeout);
 
       /* Wait if any ioctl is being submitted with this buffer. */
       if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
@@ -68,91 +67,82 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
 
       r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
       if (r)
          fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
                  r);
       return !buffer_busy;
    }
 
    if (timeout == 0) {
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         if (bo->fence[i]) {
-            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
-               /* Release the idle fence to avoid checking it again later. */
-               amdgpu_fence_reference(&bo->fence[i], NULL);
-            } else {
-               pipe_mutex_unlock(ws->bo_fence_lock);
-               return false;
-            }
+      if (bo->fence) {
+         if (amdgpu_fence_wait(bo->fence, 0, false)) {
+            /* Release the idle fence to avoid checking it again later. */
+            amdgpu_fence_reference(&bo->fence, NULL);
+         } else {
+            pipe_mutex_unlock(ws->bo_fence_lock);
+            return false;
          }
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
       return true;
 
    } else {
-      struct pipe_fence_handle *fence[RING_LAST] = {};
-      bool fence_idle[RING_LAST] = {};
+      struct pipe_fence_handle *fence = NULL;
+      bool fence_idle = false;
       bool buffer_idle = true;
 
-      /* Take references to all fences, so that we can wait for them
+      /* Take a reference to the fence, so that we can wait for it
        * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      amdgpu_fence_reference(&fence, bo->fence);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
-      /* Now wait for the fences. */
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i]) {
-            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
-               fence_idle[i] = true;
-            else
-               buffer_idle = false;
-         }
+      /* Now wait for the fence. */
+      if (fence) {
+         if (amdgpu_fence_wait(fence, abs_timeout, true))
+            fence_idle = true;
+         else
+            buffer_idle = false;
       }
 
       /* Release idle fences to avoid checking them again later. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i] == bo->fence[i] && fence_idle[i])
-            amdgpu_fence_reference(&bo->fence[i], NULL);
-
-         amdgpu_fence_reference(&fence[i], NULL);
-      }
+      if (fence == bo->fence && fence_idle)
+         amdgpu_fence_reference(&bo->fence, NULL);
+      amdgpu_fence_reference(&fence, NULL);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
    }
 }
 
 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
       struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
 }
 
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
-   int i;
 
    pipe_mutex_lock(bo->ws->global_bo_list_lock);
    LIST_DEL(&bo->global_list_item);
    bo->ws->num_buffers--;
    pipe_mutex_unlock(bo->ws->global_bo_list_lock);
 
    amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   for (i = 0; i < RING_LAST; i++)
-      amdgpu_fence_reference(&bo->fence[i], NULL);
+   amdgpu_fence_reference(&bo->fence, NULL);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
       bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
 
    if (bo->map_count >= 1) {
       if (bo->initial_domain & RADEON_DOMAIN_VRAM)
          bo->ws->mapped_vram -= bo->base.size;
       else
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 70d9854..07403dd 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -55,22 +55,22 @@ struct amdgpu_winsys_bo {
 
    /* how many command streams, which are being emitted in a separate
     * thread, is this bo referenced in? */
    volatile int num_active_ioctls;
 
    /* whether buffer_get_handle or buffer_from_handle was called,
     * it can only transition from false to true
     */
    volatile int is_shared; /* bool (int for atomicity) */
 
-   /* Fences for buffer synchronization. */
-   struct pipe_fence_handle *fence[RING_LAST];
+   /* Fence for buffer synchronization. */
+   struct pipe_fence_handle *fence;
 
    struct list_head global_list_item;
 };
 
 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
 void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
 static inline
 struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 73c8a97..7cbf19b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -822,58 +822,56 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
 }
 
 DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
 
 /* Since the kernel driver doesn't synchronize execution between different
  * rings automatically, we have to add fence dependencies manually.
  */
 static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
 {
    struct amdgpu_cs_context *cs = acs->csc;
-   int i, j;
+   int i;
 
    cs->request.number_of_dependencies = 0;
 
    for (i = 0; i < cs->num_buffers; i++) {
-      for (j = 0; j < RING_LAST; j++) {
-         struct amdgpu_cs_fence *dep;
-         unsigned idx;
-
-         struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence[j];
-         if (!bo_fence)
-            continue;
-
-         if (bo_fence->ctx == acs->ctx &&
-             bo_fence->fence.ip_type == cs->request.ip_type &&
-             bo_fence->fence.ip_instance == cs->request.ip_instance &&
-             bo_fence->fence.ring == cs->request.ring)
-            continue;
-
-         if (amdgpu_fence_wait((void *)bo_fence, 0, false))
-            continue;
-
-         if (bo_fence->submission_in_progress)
-            os_wait_until_zero(&bo_fence->submission_in_progress,
-                               PIPE_TIMEOUT_INFINITE);
-
-         idx = cs->request.number_of_dependencies++;
-         if (idx >= cs->max_dependencies) {
-            unsigned size;
-
-            cs->max_dependencies = idx + 8;
-            size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
-            cs->request.dependencies = realloc(cs->request.dependencies, size);
-         }
-
-         dep = &cs->request.dependencies[idx];
-         memcpy(dep, &bo_fence->fence, sizeof(*dep));
+      struct amdgpu_cs_fence *dep;
+      unsigned idx;
+
+      struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence;
+      if (!bo_fence)
+         continue;
+
+      if (bo_fence->ctx == acs->ctx &&
+          bo_fence->fence.ip_type == cs->request.ip_type &&
+          bo_fence->fence.ip_instance == cs->request.ip_instance &&
+          bo_fence->fence.ring == cs->request.ring)
+         continue;
+
+      if (amdgpu_fence_wait((void *)bo_fence, 0, false))
+         continue;
+
+      if (bo_fence->submission_in_progress)
+         os_wait_until_zero(&bo_fence->submission_in_progress,
+                            PIPE_TIMEOUT_INFINITE);
+
+      idx = cs->request.number_of_dependencies++;
+      if (idx >= cs->max_dependencies) {
+         unsigned size;
+
+         cs->max_dependencies = idx + 8;
+         size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
+         cs->request.dependencies = realloc(cs->request.dependencies, size);
       }
+
+      dep = &cs->request.dependencies[idx];
+      memcpy(dep, &bo_fence->fence, sizeof(*dep));
    }
 }
 
 void amdgpu_cs_submit_ib(void *job, int thread_index)
 {
    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
    struct amdgpu_winsys *ws = acs->ctx->ws;
    struct amdgpu_cs_context *cs = acs->cst;
    int i, r;
 
@@ -1049,21 +1047,21 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                                           cur->request.ring);
       }
       if (fence)
          amdgpu_fence_reference(fence, cur->fence);
 
       /* Prepare buffers. */
       pipe_mutex_lock(ws->bo_fence_lock);
       amdgpu_add_fence_dependencies(cs);
       for (i = 0; i < num_buffers; i++) {
          p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
-         amdgpu_fence_reference(&cur->buffers[i].bo->fence[cs->ring_type],
+         amdgpu_fence_reference(&cur->buffers[i].bo->fence,
                                 cur->fence);
       }
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       amdgpu_cs_sync_flush(rcs);
 
       /* Swap command streams. "cst" is going to be submitted. */
       cs->csc = cs->cst;
       cs->cst = cur;
 
-- 
2.7.4


