[Mesa-dev] [PATCH 05/14] winsys/amdgpu: do not synchronize unsynchronized buffers
Nicolai Hähnle
nhaehnle at gmail.com
Tue Sep 13 09:56:16 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
When a buffer is added to a CS without the SYNCHRONIZED usage flag, we now
no longer add a dependency on the buffer's fence(s).
However, we still need to add a fence to the buffer during flush, so that
cache reclaim works correctly (and in the hypothetical case that the buffer
is later added to a CS _with_ the SYNCHRONIZED flag).
It is now possible that the submissions refererring to a buffer are no longer
linearly ordered, and so we may have to keep multiple fences around. We keep
the fences in a FIFO. It should usually stay quite short (# of contexts * 2,
for gfx + dma rings).
While we're at it, extract amdgpu_add_fence_dependency for a single buffer,
which will make adding the distinction between real buffer and slab cases
easier.
---
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 73 ++++++++++++++++++---------
src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 6 ++-
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 84 ++++++++++++++++++++++++-------
3 files changed, 118 insertions(+), 45 deletions(-)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 0dbd0fb..37a7ba1 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -66,83 +66,108 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
int r;
r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
if (r)
fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
r);
return !buffer_busy;
}
if (timeout == 0) {
+ unsigned idle_fences;
+ bool buffer_idle;
+
pipe_mutex_lock(ws->bo_fence_lock);
- if (bo->fence) {
- if (amdgpu_fence_wait(bo->fence, 0, false)) {
- /* Release the idle fence to avoid checking it again later. */
- amdgpu_fence_reference(&bo->fence, NULL);
- } else {
- pipe_mutex_unlock(ws->bo_fence_lock);
- return false;
- }
+
+ for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
+ if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
+ break;
}
+
+ /* Release the idle fences to avoid checking them again later. */
+ for (unsigned i = 0; i < idle_fences; ++i)
+ amdgpu_fence_reference(&bo->fences[i], NULL);
+
+ memmove(&bo->fences[0], &bo->fences[idle_fences],
+ (bo->num_fences - idle_fences) * sizeof(*bo->fences));
+ bo->num_fences -= idle_fences;
+
+ buffer_idle = !bo->num_fences;
pipe_mutex_unlock(ws->bo_fence_lock);
- return true;
+ return buffer_idle;
} else {
- struct pipe_fence_handle *fence = NULL;
- bool fence_idle = false;
bool buffer_idle = true;
- /* Take a reference to the fences, so that we can wait for it
- * without the lock. */
pipe_mutex_lock(ws->bo_fence_lock);
- amdgpu_fence_reference(&fence, bo->fence);
- pipe_mutex_unlock(ws->bo_fence_lock);
+ while (bo->num_fences && buffer_idle) {
+ struct pipe_fence_handle *fence = NULL;
+ bool fence_idle = false;
- /* Now wait for the fence. */
- if (fence) {
+ amdgpu_fence_reference(&fence, bo->fences[0]);
+
+ /* Wait for the fence. */
+ pipe_mutex_unlock(ws->bo_fence_lock);
if (amdgpu_fence_wait(fence, abs_timeout, true))
fence_idle = true;
else
buffer_idle = false;
- }
+ pipe_mutex_lock(ws->bo_fence_lock);
+
+ /* Release an idle fence to avoid checking it again later, keeping in
+ * mind that the fence array may have been modified by other threads.
+ */
+ if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
+ amdgpu_fence_reference(&bo->fences[0], NULL);
+ memmove(&bo->fences[0], &bo->fences[1],
+ (bo->num_fences - 1) * sizeof(*bo->fences));
+ bo->num_fences--;
+ }
- /* Release idle fences to avoid checking them again later. */
- pipe_mutex_lock(ws->bo_fence_lock);
- if (fence == bo->fence && fence_idle)
- amdgpu_fence_reference(&bo->fence, NULL);
- amdgpu_fence_reference(&fence, NULL);
+ amdgpu_fence_reference(&fence, NULL);
+ }
pipe_mutex_unlock(ws->bo_fence_lock);
return buffer_idle;
}
}
static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
struct pb_buffer *buf)
{
return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
}
+static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
+{
+ for (unsigned i = 0; i < bo->num_fences; ++i)
+ amdgpu_fence_reference(&bo->fences[i], NULL);
+
+ FREE(bo->fences);
+ bo->num_fences = 0;
+ bo->max_fences = 0;
+}
+
void amdgpu_bo_destroy(struct pb_buffer *_buf)
{
struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
pipe_mutex_lock(bo->ws->global_bo_list_lock);
LIST_DEL(&bo->global_list_item);
bo->ws->num_buffers--;
pipe_mutex_unlock(bo->ws->global_bo_list_lock);
amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
amdgpu_va_range_free(bo->va_handle);
amdgpu_bo_free(bo->bo);
- amdgpu_fence_reference(&bo->fence, NULL);
+ amdgpu_bo_remove_fences(bo);
if (bo->initial_domain & RADEON_DOMAIN_VRAM)
bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
else if (bo->initial_domain & RADEON_DOMAIN_GTT)
bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
if (bo->map_count >= 1) {
if (bo->initial_domain & RADEON_DOMAIN_VRAM)
bo->ws->mapped_vram -= bo->base.size;
else if (bo->initial_domain & RADEON_DOMAIN_GTT)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 07403dd..93cc83a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -55,22 +55,24 @@ struct amdgpu_winsys_bo {
/* how many command streams, which are being emitted in a separate
* thread, is this bo referenced in? */
volatile int num_active_ioctls;
/* whether buffer_get_handle or buffer_from_handle was called,
* it can only transition from false to true
*/
volatile int is_shared; /* bool (int for atomicity) */
- /* Fence for buffer synchronization. */
- struct pipe_fence_handle *fence;
+ /* Fences for buffer synchronization. */
+ unsigned num_fences;
+ unsigned max_fences;
+ struct pipe_fence_handle **fences;
struct list_head global_list_item;
};
bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
void amdgpu_bo_destroy(struct pb_buffer *_buf);
void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
static inline
struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index e2f2974..d5ea705 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -794,63 +794,109 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
list[i].bo_size = cs->buffers[i].bo->base.size;
list[i].vm_address = cs->buffers[i].bo->va;
list[i].priority_usage = cs->buffers[i].priority_usage;
}
}
return cs->num_buffers;
}
DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
-/* Since the kernel driver doesn't synchronize execution between different
- * rings automatically, we have to add fence dependencies manually.
- */
-static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
+ struct amdgpu_cs_buffer *buffer)
{
struct amdgpu_cs_context *cs = acs->csc;
- int i;
+ struct amdgpu_winsys_bo *bo = buffer->bo;
+ struct amdgpu_cs_fence *dep;
+ unsigned new_num_fences = 0;
- cs->request.number_of_dependencies = 0;
-
- for (i = 0; i < cs->num_buffers; i++) {
- struct amdgpu_cs_fence *dep;
+ for (unsigned j = 0; j < bo->num_fences; ++j) {
+ struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
unsigned idx;
- struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence;
- if (!bo_fence)
- continue;
-
if (bo_fence->ctx == acs->ctx &&
- bo_fence->fence.ip_type == cs->request.ip_type &&
- bo_fence->fence.ip_instance == cs->request.ip_instance &&
- bo_fence->fence.ring == cs->request.ring)
+ bo_fence->fence.ip_type == cs->request.ip_type &&
+ bo_fence->fence.ip_instance == cs->request.ip_instance &&
+ bo_fence->fence.ring == cs->request.ring)
continue;
if (amdgpu_fence_wait((void *)bo_fence, 0, false))
continue;
+ amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
+ new_num_fences++;
+
+ if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
+ continue;
+
if (bo_fence->submission_in_progress)
os_wait_until_zero(&bo_fence->submission_in_progress,
PIPE_TIMEOUT_INFINITE);
idx = cs->request.number_of_dependencies++;
if (idx >= cs->max_dependencies) {
unsigned size;
cs->max_dependencies = idx + 8;
size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
cs->request.dependencies = realloc(cs->request.dependencies, size);
}
dep = &cs->request.dependencies[idx];
memcpy(dep, &bo_fence->fence, sizeof(*dep));
}
+
+ for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
+ amdgpu_fence_reference(&bo->fences[j], NULL);
+
+ bo->num_fences = new_num_fences;
+}
+
+/* Since the kernel driver doesn't synchronize execution between different
+ * rings automatically, we have to add fence dependencies manually.
+ */
+static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+{
+ struct amdgpu_cs_context *cs = acs->csc;
+ int i;
+
+ cs->request.number_of_dependencies = 0;
+
+ for (i = 0; i < cs->num_buffers; i++)
+ amdgpu_add_fence_dependency(acs, &cs->buffers[i]);
+}
+
+static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo,
+ struct pipe_fence_handle *fence)
+{
+ if (bo->num_fences >= bo->max_fences) {
+ unsigned new_max_fences = MAX2(1, bo->max_fences * 2);
+ struct pipe_fence_handle **new_fences =
+ REALLOC(bo->fences,
+ bo->num_fences * sizeof(*new_fences),
+ new_max_fences * sizeof(*new_fences));
+ if (new_fences) {
+ bo->fences = new_fences;
+ bo->max_fences = new_max_fences;
+ } else {
+ fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n");
+ if (!bo->num_fences)
+ return;
+
+ bo->num_fences--; /* prefer to keep a more recent fence if possible */
+ amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
+ }
+ }
+
+ bo->fences[bo->num_fences] = NULL;
+ amdgpu_fence_reference(&bo->fences[bo->num_fences], fence);
+ bo->num_fences++;
}
void amdgpu_cs_submit_ib(void *job, int thread_index)
{
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
struct amdgpu_winsys *ws = acs->ctx->ws;
struct amdgpu_cs_context *cs = acs->cst;
int i, r;
cs->request.fence_info.handle = NULL;
@@ -1024,23 +1070,23 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
cur->request.ip_instance,
cur->request.ring);
}
if (fence)
amdgpu_fence_reference(fence, cur->fence);
/* Prepare buffers. */
pipe_mutex_lock(ws->bo_fence_lock);
amdgpu_add_fence_dependencies(cs);
for (i = 0; i < num_buffers; i++) {
- p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
- amdgpu_fence_reference(&cur->buffers[i].bo->fence,
- cur->fence);
+ struct amdgpu_winsys_bo *bo = cur->buffers[i].bo;
+ p_atomic_inc(&bo->num_active_ioctls);
+ amdgpu_add_fence(bo, cur->fence);
}
pipe_mutex_unlock(ws->bo_fence_lock);
amdgpu_cs_sync_flush(rcs);
/* Swap command streams. "cst" is going to be submitted. */
cs->csc = cs->cst;
cs->cst = cur;
/* Submit. */
--
2.7.4
More information about the mesa-dev
mailing list