[Mesa-dev] [PATCH v2 07/26] winsys/amdgpu: handle cs_add_fence_dependency for deferred/unsubmitted fences
Marek Olšák
maraeo at gmail.com
Mon Nov 6 18:21:46 UTC 2017
On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> The idea is to fix the following interleaving of operations
> that can arise from deferred fences:
>
> Thread 1 / Context 1 Thread 2 / Context 2
> -------------------- --------------------
> f = deferred flush
> <------- application-side synchronization ------->
> fence_server_sync(f)
> ...
> flush()
> flush()
>
> We will now stall in fence_server_sync until the flush of context 1
> has completed.
>
> This scenario was unlikely to occur previously, because applications
> seem to be doing
>
> Thread 1 / Context 1 Thread 2 / Context 2
> -------------------- --------------------
> f = glFenceSync()
> glFlush()
> <------- application-side synchronization ------->
> glWaitSync(f)
>
> ... and indeed they probably *have* to use this ordering to avoid
> deadlocks in the GLX model, where all GL operations conceptually
> go through a single connection to the X server. However, it's less
> clear whether applications have to do this with other WSI (i.e. EGL).
> Besides, even this sequence of GL commands can be translated into
> the Gallium-level sequence outlined above when Gallium threading
> and asynchronous flushes are used. So it makes sense to be more
> robust.
>
> As a side effect, we no longer busy-wait on submission_in_progress.
>
> We won't enable asynchronous flushes on radeon, but add a
> cs_add_fence_dependency stub anyway to document the potential
> issue.
> ---
> src/gallium/drivers/radeon/radeon_winsys.h | 4 +++-
> src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 21 +++++++++++++--------
> src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 9 ++++++---
> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 19 +++++++++++++++++++
> 4 files changed, 41 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
> index 2d3f646dc65..e8c486cb7f4 100644
> --- a/src/gallium/drivers/radeon/radeon_winsys.h
> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
> @@ -536,21 +536,23 @@ struct radeon_winsys {
> * \return Negative POSIX error code or 0 for success.
> * Asynchronous submissions never return an error.
> */
> int (*cs_flush)(struct radeon_winsys_cs *cs,
> unsigned flags,
> struct pipe_fence_handle **fence);
>
> /**
> * Create a fence before the CS is flushed.
> * The user must flush manually to complete the initialization of the fence.
> - * The fence must not be used before the flush.
> + *
> + * The fence must not be used for anything except \ref cs_add_fence_dependency
> + * before the flush.
> */
> struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
>
> /**
> * Return true if a buffer is referenced by a command stream.
> *
> * \param cs A command stream.
> * \param buf A winsys buffer.
> */
> bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index 0450ccc3596..0628e547351 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -43,21 +43,22 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
> {
> struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
>
> fence->reference.count = 1;
> fence->ws = ctx->ws;
> fence->ctx = ctx;
> fence->fence.context = ctx->ctx;
> fence->fence.ip_type = ip_type;
> fence->fence.ip_instance = ip_instance;
> fence->fence.ring = ring;
> - fence->submission_in_progress = true;
> + util_queue_fence_init(&fence->submitted);
> + util_queue_fence_reset(&fence->submitted);
> p_atomic_inc(&ctx->refcount);
> return (struct pipe_fence_handle *)fence;
> }
>
> static struct pipe_fence_handle *
> amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
> {
> struct amdgpu_winsys *ws = amdgpu_winsys(rws);
> struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
>
> @@ -74,66 +75,69 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
> FREE(fence);
> return NULL;
> }
>
> r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
> if (r) {
> amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
> FREE(fence);
> return NULL;
> }
> +
> + util_queue_fence_init(&fence->submitted);
> +
> return (struct pipe_fence_handle*)fence;
> }
>
> static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
> struct pipe_fence_handle *pfence)
> {
> struct amdgpu_winsys *ws = amdgpu_winsys(rws);
> struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
>
> if (amdgpu_fence_is_syncobj(fence)) {
> int fd, r;
>
> /* Convert syncobj into sync_file. */
> r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
> return r ? -1 : fd;
> }
>
> - os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE);
> + util_queue_fence_wait(&fence->submitted);
>
> /* Convert the amdgpu fence into a fence FD. */
> int fd;
> if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
> AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
> (uint32_t*)&fd))
> return -1;
>
> return fd;
> }
>
> static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
> uint64_t seq_no,
> uint64_t *user_fence_cpu_address)
> {
> struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>
> rfence->fence.fence = seq_no;
> rfence->user_fence_cpu_address = user_fence_cpu_address;
> - rfence->submission_in_progress = false;
> + util_queue_fence_signal(&rfence->submitted);
> }
>
> static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
> {
> struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>
> rfence->signalled = true;
> - rfence->submission_in_progress = false;
> + util_queue_fence_signal(&rfence->submitted);
> }
>
> bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
> bool absolute)
> {
> struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
> uint32_t expired;
> int64_t abs_timeout;
> uint64_t *user_fence_cpu;
> int r;
> @@ -157,22 +161,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
> }
>
> if (absolute)
> abs_timeout = timeout;
> else
> abs_timeout = os_time_get_absolute_timeout(timeout);
>
> /* The fence might not have a number assigned if its IB is being
> * submitted in the other thread right now. Wait until the submission
> * is done. */
> - if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
> - abs_timeout))
> + if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
> return false;
>
> user_fence_cpu = rfence->user_fence_cpu_address;
> if (user_fence_cpu) {
> if (*user_fence_cpu >= rfence->fence.fence) {
> rfence->signalled = true;
> return true;
> }
>
> /* No timeout, just query: no need for the ioctl. */
> @@ -1022,20 +1025,22 @@ static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
> return amdgpu_fence_wait((void *)fence, 0, false);
> }
>
> static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
> struct pipe_fence_handle *pfence)
> {
> struct amdgpu_cs *acs = amdgpu_cs(rws);
> struct amdgpu_cs_context *cs = acs->csc;
> struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
>
> + util_queue_fence_wait(&fence->submitted);
> +
> if (is_noop_fence_dependency(acs, fence))
> return;
>
> unsigned idx = add_fence_dependency_entry(cs);
> amdgpu_fence_reference(&cs->fence_dependencies[idx],
> (struct pipe_fence_handle*)fence);
> }
>
> static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
> struct amdgpu_cs_buffer *buffer)
> @@ -1297,21 +1302,21 @@ bo_list_error:
>
> for (unsigned i = 0; i < num_dependencies; i++) {
> struct amdgpu_fence *fence =
> (struct amdgpu_fence*)cs->fence_dependencies[i];
>
> if (amdgpu_fence_is_syncobj(fence)) {
> num_syncobj_dependencies++;
> continue;
> }
>
> - assert(!fence->submission_in_progress);
> + assert(util_queue_fence_is_signalled(&fence->submitted));
> amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]);
> }
>
> chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
> chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num;
> chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
> num_chunks++;
> }
>
> /* Syncobj dependencies. */
> @@ -1320,21 +1325,21 @@ bo_list_error:
> alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
> unsigned num = 0;
>
> for (unsigned i = 0; i < num_dependencies; i++) {
> struct amdgpu_fence *fence =
> (struct amdgpu_fence*)cs->fence_dependencies[i];
>
> if (!amdgpu_fence_is_syncobj(fence))
> continue;
>
> - assert(!fence->submission_in_progress);
> + assert(util_queue_fence_is_signalled(&fence->submitted));
> sem_chunk[num++].handle = fence->syncobj;
> }
>
> chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
> chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num;
> chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
> num_chunks++;
> }
>
> assert(num_chunks <= ARRAY_SIZE(chunks));
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> index b7bc9a20bbf..fbf44b36610 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> @@ -137,23 +137,25 @@ struct amdgpu_cs {
> struct amdgpu_fence {
> struct pipe_reference reference;
> /* If ctx == NULL, this fence is syncobj-based. */
> uint32_t syncobj;
>
> struct amdgpu_winsys *ws;
> struct amdgpu_ctx *ctx; /* submission context */
> struct amdgpu_cs_fence fence;
> uint64_t *user_fence_cpu_address;
>
> - /* If the fence is unknown due to an IB still being submitted
> - * in the other thread. */
> - volatile int submission_in_progress; /* bool (int for atomicity) */
> + /* If the fence has been submitted. This is unsignalled for deferred fences
> + * (cs->next_fence) and while an IB is still being submitted in the submit
> + * thread. */
> + struct util_queue_fence submitted;
> +
> volatile int signalled; /* bool (int for atomicity) */
> };
>
> static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
> {
> return fence->ctx == NULL;
> }
>
> static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
> {
> @@ -171,20 +173,21 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
> struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;
>
> if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
> struct amdgpu_fence *fence = *rdst;
>
> if (amdgpu_fence_is_syncobj(fence))
> amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
> else
> amdgpu_ctx_unref(fence->ctx);
>
> + util_queue_fence_destroy(&fence->submitted);
> FREE(fence);
> }
> *rdst = rsrc;
> }
>
> int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
>
> static inline struct amdgpu_ib *
> amdgpu_ib(struct radeon_winsys_cs *base)
> {
> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> index 7220f3a0240..add88f80aae 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> @@ -779,28 +779,47 @@ radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
> }
>
> fence = radeon_cs_create_fence(rcs);
> if (!fence)
> return NULL;
>
> radeon_fence_reference(&cs->next_fence, fence);
> return fence;
> }
>
> +static void
> +radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
> + struct pipe_fence_handle *fence)
> +{
> + /* TODO: Handle the following unlikely multi-threaded scenario:
> + *
> + * Thread 1 / Context 1 Thread 2 / Context 2
> + * -------------------- --------------------
> + * f = cs_get_next_fence()
> + * cs_add_fence_dependency(f)
> + * cs_flush()
> + * cs_flush()
> + *
> + * We currently assume that this does not happen because we don't support
> + * asynchronous flushes on Radeon.
> + */
We do support asynchronous flushes and deferred fences on Radeon. Not
that it matters.
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
More information about the mesa-dev
mailing list