Mesa (master): winsys/amdgpu: use only one fence per BO

Mon Sep 12 12:03:40 UTC 2016

Module: Mesa
Branch: master
Commit: 11cbf4d7aea861e37067407ba7a660ea566c1593
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=11cbf4d7aea861e37067407ba7a660ea566c1593

Author: Nicolai Hähnle <nicolai.haehnle at amd.com>
Date:   Wed Sep  7 10:50:14 2016 +0200

winsys/amdgpu: use only one fence per BO

The fence that is added to the BO during flush is guaranteed to be
signaled after all the fences that were in the fences array of the BO
before the flush, because those fences are added as dependencies for the
submission (and all this happens atomically under the bo_fence_lock).

Therefore, keeping only the last fence around is sufficient.

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

---

 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 54 +++++++++++--------------
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  4 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 66 +++++++++++++++----------------
 3 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 32df0be..a6d4aa4 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -44,7 +44,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_winsys *ws = bo->ws;
    int64_t abs_timeout;
-   int i;
 
    if (timeout == 0) {
       if (p_atomic_read(&bo->num_active_ioctls))
@@ -75,49 +74,42 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
 
    if (timeout == 0) {
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         if (bo->fence[i]) {
-            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
-               /* Release the idle fence to avoid checking it again later. */
-               amdgpu_fence_reference(&bo->fence[i], NULL);
-            } else {
-               pipe_mutex_unlock(ws->bo_fence_lock);
-               return false;
-            }
+      if (bo->fence) {
+         if (amdgpu_fence_wait(bo->fence, 0, false)) {
+            /* Release the idle fence to avoid checking it again later. */
+            amdgpu_fence_reference(&bo->fence, NULL);
+         } else {
+            pipe_mutex_unlock(ws->bo_fence_lock);
+            return false;
          }
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
       return true;
 
    } else {
-      struct pipe_fence_handle *fence[RING_LAST] = {};
-      bool fence_idle[RING_LAST] = {};
+      struct pipe_fence_handle *fence = NULL;
+      bool fence_idle = false;
       bool buffer_idle = true;
 
-      /* Take references to all fences, so that we can wait for them
+      /* Take a reference to the fence, so that we can wait for it
        * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      amdgpu_fence_reference(&fence, bo->fence);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
-      /* Now wait for the fences. */
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i]) {
-            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
-               fence_idle[i] = true;
-            else
-               buffer_idle = false;
-         }
+      /* Now wait for the fence. */
+      if (fence) {
+         if (amdgpu_fence_wait(fence, abs_timeout, true))
+            fence_idle = true;
+         else
+            buffer_idle = false;
       }
 
       /* Release idle fences to avoid checking them again later. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i] == bo->fence[i] && fence_idle[i])
-            amdgpu_fence_reference(&bo->fence[i], NULL);
-
-         amdgpu_fence_reference(&fence[i], NULL);
-      }
+      if (fence == bo->fence && fence_idle)
+         amdgpu_fence_reference(&bo->fence, NULL);
+      amdgpu_fence_reference(&fence, NULL);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
@@ -133,7 +125,6 @@ static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
-   int i;
 
    pipe_mutex_lock(bo->ws->global_bo_list_lock);
    LIST_DEL(&bo->global_list_item);
@@ -144,8 +135,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   for (i = 0; i < RING_LAST; i++)
-      amdgpu_fence_reference(&bo->fence[i], NULL);
+   amdgpu_fence_reference(&bo->fence, NULL);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 70d9854..07403dd 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -62,8 +62,8 @@ struct amdgpu_winsys_bo {
     */
    volatile int is_shared; /* bool (int for atomicity) */
 
-   /* Fences for buffer synchronization. */
-   struct pipe_fence_handle *fence[RING_LAST];
+   /* Fence for buffer synchronization. */
+   struct pipe_fence_handle *fence;
 
    struct list_head global_list_item;
 };
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0bb916e..16dd45a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -827,44 +827,42 @@ DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
 static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
 {
    struct amdgpu_cs_context *cs = acs->csc;
-   int i, j;
+   int i;
 
    cs->request.number_of_dependencies = 0;
 
    for (i = 0; i < cs->num_buffers; i++) {
-      for (j = 0; j < RING_LAST; j++) {
-         struct amdgpu_cs_fence *dep;
-         unsigned idx;
-
-         struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence[j];
-         if (!bo_fence)
-            continue;
-
-         if (bo_fence->ctx == acs->ctx &&
-             bo_fence->fence.ip_type == cs->request.ip_type &&
-             bo_fence->fence.ip_instance == cs->request.ip_instance &&
-             bo_fence->fence.ring == cs->request.ring)
-            continue;
-
-         if (amdgpu_fence_wait((void *)bo_fence, 0, false))
-            continue;
-
-         if (bo_fence->submission_in_progress)
-            os_wait_until_zero(&bo_fence->submission_in_progress,
-                               PIPE_TIMEOUT_INFINITE);
-
-         idx = cs->request.number_of_dependencies++;
-         if (idx >= cs->max_dependencies) {
-            unsigned size;
-
-            cs->max_dependencies = idx + 8;
-            size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
-            cs->request.dependencies = realloc(cs->request.dependencies, size);
-         }
-
-         dep = &cs->request.dependencies[idx];
-         memcpy(dep, &bo_fence->fence, sizeof(*dep));
+      struct amdgpu_cs_fence *dep;
+      unsigned idx;
+
+      struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence;
+      if (!bo_fence)
+         continue;
+
+      if (bo_fence->ctx == acs->ctx &&
+          bo_fence->fence.ip_type == cs->request.ip_type &&
+          bo_fence->fence.ip_instance == cs->request.ip_instance &&
+          bo_fence->fence.ring == cs->request.ring)
+         continue;
+
+      if (amdgpu_fence_wait((void *)bo_fence, 0, false))
+         continue;
+
+      if (bo_fence->submission_in_progress)
+         os_wait_until_zero(&bo_fence->submission_in_progress,
+                            PIPE_TIMEOUT_INFINITE);
+
+      idx = cs->request.number_of_dependencies++;
+      if (idx >= cs->max_dependencies) {
+         unsigned size;
+
+         cs->max_dependencies = idx + 8;
+         size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
+         cs->request.dependencies = realloc(cs->request.dependencies, size);
       }
+
+      dep = &cs->request.dependencies[idx];
+      memcpy(dep, &bo_fence->fence, sizeof(*dep));
    }
 }
 
@@ -1054,7 +1052,7 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
       amdgpu_add_fence_dependencies(cs);
       for (i = 0; i < num_buffers; i++) {
          p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
-         amdgpu_fence_reference(&cur->buffers[i].bo->fence[cs->ring_type],
+         amdgpu_fence_reference(&cur->buffers[i].bo->fence,
                                 cur->fence);
       }
       pipe_mutex_unlock(ws->bo_fence_lock);