[Mesa-dev] [PATCH v2 07/26] winsys/amdgpu: handle cs_add_fence_dependency for deferred/unsubmitted fences

Nicolai Hähnle nhaehnle at gmail.com
Thu Nov 9 13:18:59 UTC 2017


On 06.11.2017 19:21, Marek Olšák wrote:
> On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> The idea is to fix the following interleaving of operations
>> that can arise from deferred fences:
>>
>>   Thread 1 / Context 1          Thread 2 / Context 2
>>   --------------------          --------------------
>>   f = deferred flush
>>   <------- application-side synchronization ------->
>>                                 fence_server_sync(f)
>>                                 ...
>>                                 flush()
>>   flush()
>>
>> We will now stall in fence_server_sync until the flush of context 1
>> has completed.
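
At the Gallium level, fence_server_sync reduces to the winsys hook this
patch extends; a minimal sketch of the driver side (illustrative names,
not the exact radeonsi code):

   /* Sketch: a driver's pipe_context::fence_server_sync implemented on
    * top of the winsys hook patched below. Names are illustrative. */
   static void example_fence_server_sync(struct pipe_context *ctx,
                                         struct pipe_fence_handle *fence)
   {
      struct example_context *ectx = (struct example_context *)ctx;

      /* With this patch, the winsys blocks in here until the deferred
       * flush of the other context has actually submitted its IB. */
      ectx->ws->cs_add_fence_dependency(ectx->cs, fence);
   }
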
>>
>> This scenario was unlikely to occur previously, because applications
>> seem to be doing
>>
>>   Thread 1 / Context 1          Thread 2 / Context 2
>>   --------------------          --------------------
>>   f = glFenceSync()
>>   glFlush()
>>   <------- application-side synchronization ------->
>>                                 glWaitSync(f)
>>
>> ... and indeed they probably *have* to use this ordering to avoid
>> deadlocks in the GLX model, where all GL operations conceptually
>> go through a single connection to the X server. However, it's less
>> clear whether applications have to do this with other WSI (e.g. EGL).
>> Besides, even this sequence of GL commands can be translated into
>> the Gallium-level sequence outlined above when Gallium threading
>> and asynchronous flushes are used. So it makes sense to be more
>> robust.
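
Spelled out with the GL sync object API (GL 3.2 / ARB_sync), the safe
ordering above is:

   /* Thread 1 / Context 1 */
   GLsync f = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
   glFlush();  /* makes the fence visible to other contexts */

   /* ... application-side synchronization hands f over ... */

   /* Thread 2 / Context 2 */
   glWaitSync(f, 0, GL_TIMEOUT_IGNORED);  /* server-side wait */
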
>>
>> As a side effect, we no longer busy-wait on submission_in_progress.
>>
>> We won't enable asynchronous flushes on radeon, but add a
>> cs_add_fence_dependency stub anyway to document the potential
>> issue.
>> ---
>>   src/gallium/drivers/radeon/radeon_winsys.h    |  4 +++-
>>   src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 21 +++++++++++++--------
>>   src/gallium/winsys/amdgpu/drm/amdgpu_cs.h     |  9 ++++++---
>>   src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 19 +++++++++++++++++++
>>   4 files changed, 41 insertions(+), 12 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
>> index 2d3f646dc65..e8c486cb7f4 100644
>> --- a/src/gallium/drivers/radeon/radeon_winsys.h
>> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
>> @@ -536,21 +536,23 @@ struct radeon_winsys {
>>        * \return Negative POSIX error code or 0 for success.
>>        *         Asynchronous submissions never return an error.
>>        */
>>       int (*cs_flush)(struct radeon_winsys_cs *cs,
>>                       unsigned flags,
>>                       struct pipe_fence_handle **fence);
>>
>>       /**
>>        * Create a fence before the CS is flushed.
>>      * The user must flush manually to complete the initialization of the fence.
>> -     * The fence must not be used before the flush.
>> +     *
>> +     * The fence must not be used for anything except \ref cs_add_fence_dependency
>> +     * before the flush.
>>        */
>>       struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
>>
>>       /**
>>        * Return true if a buffer is referenced by a command stream.
>>        *
>>        * \param cs        A command stream.
>>        * \param buf       A winsys buffer.
>>        */
>>       bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>> index 0450ccc3596..0628e547351 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>> @@ -43,21 +43,22 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
>>   {
>>      struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
>>
>>      fence->reference.count = 1;
>>      fence->ws = ctx->ws;
>>      fence->ctx = ctx;
>>      fence->fence.context = ctx->ctx;
>>      fence->fence.ip_type = ip_type;
>>      fence->fence.ip_instance = ip_instance;
>>      fence->fence.ring = ring;
>> -   fence->submission_in_progress = true;
>> +   util_queue_fence_init(&fence->submitted);
>> +   util_queue_fence_reset(&fence->submitted);
>>      p_atomic_inc(&ctx->refcount);
>>      return (struct pipe_fence_handle *)fence;
>>   }
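
For context: util_queue_fence_init (util/u_queue.h) leaves the fence in
the signalled state, which is why it is reset immediately here, while
the sync_file import path below only needs the init (an imported fence
is already past submission). The lifecycle is:

   util_queue_fence_init(&fence->submitted);   /* starts signalled */
   util_queue_fence_reset(&fence->submitted);  /* pending: IB not submitted */
   /* ... later, from the submit thread ... */
   util_queue_fence_signal(&fence->submitted); /* wakes all waiters */
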
>>
>>   static struct pipe_fence_handle *
>>   amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
>>   {
>>      struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>      struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
>>
>> @@ -74,66 +75,69 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
>>         FREE(fence);
>>         return NULL;
>>      }
>>
>>      r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
>>      if (r) {
>>         amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
>>         FREE(fence);
>>         return NULL;
>>      }
>> +
>> +   util_queue_fence_init(&fence->submitted);
>> +
>>      return (struct pipe_fence_handle*)fence;
>>   }
>>
>>   static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
>>                                           struct pipe_fence_handle *pfence)
>>   {
>>      struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>      struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
>>
>>      if (amdgpu_fence_is_syncobj(fence)) {
>>         int fd, r;
>>
>>         /* Convert syncobj into sync_file. */
>>         r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
>>         return r ? -1 : fd;
>>      }
>>
>> -   os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE);
>> +   util_queue_fence_wait(&fence->submitted);
>>
>>      /* Convert the amdgpu fence into a fence FD. */
>>      int fd;
>>      if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
>>                                    AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
>>                                    (uint32_t*)&fd))
>>         return -1;
>>
>>      return fd;
>>   }
>>
>>   static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
>>                                      uint64_t seq_no,
>>                                      uint64_t *user_fence_cpu_address)
>>   {
>>      struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>>
>>      rfence->fence.fence = seq_no;
>>      rfence->user_fence_cpu_address = user_fence_cpu_address;
>> -   rfence->submission_in_progress = false;
>> +   util_queue_fence_signal(&rfence->submitted);
>>   }
>>
>>   static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
>>   {
>>      struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>>
>>      rfence->signalled = true;
>> -   rfence->submission_in_progress = false;
>> +   util_queue_fence_signal(&rfence->submitted);
>>   }
>>
>>   bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
>>                          bool absolute)
>>   {
>>      struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>>      uint32_t expired;
>>      int64_t abs_timeout;
>>      uint64_t *user_fence_cpu;
>>      int r;
>> @@ -157,22 +161,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
>>      }
>>
>>      if (absolute)
>>         abs_timeout = timeout;
>>      else
>>         abs_timeout = os_time_get_absolute_timeout(timeout);
>>
>>      /* The fence might not have a number assigned if its IB is being
>>       * submitted in the other thread right now. Wait until the submission
>>       * is done. */
>> -   if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
>> -                                       abs_timeout))
>> +   if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
>>         return false;
>>
>>      user_fence_cpu = rfence->user_fence_cpu_address;
>>      if (user_fence_cpu) {
>>         if (*user_fence_cpu >= rfence->fence.fence) {
>>            rfence->signalled = true;
>>            return true;
>>         }
>>
>>         /* No timeout, just query: no need for the ioctl. */
>> @@ -1022,20 +1025,22 @@ static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
>>      return amdgpu_fence_wait((void *)fence, 0, false);
>>   }
>>
>>   static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
>>                                              struct pipe_fence_handle *pfence)
>>   {
>>      struct amdgpu_cs *acs = amdgpu_cs(rws);
>>      struct amdgpu_cs_context *cs = acs->csc;
>>      struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
>>
>> +   util_queue_fence_wait(&fence->submitted);
>> +
>>      if (is_noop_fence_dependency(acs, fence))
>>         return;
>>
>>      unsigned idx = add_fence_dependency_entry(cs);
>>      amdgpu_fence_reference(&cs->fence_dependencies[idx],
>>                             (struct pipe_fence_handle*)fence);
>>   }
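
Note that the wait has to happen before is_noop_fence_dependency():
that helper goes through amdgpu_fence_wait(), which reads the kernel
sequence number in fence->fence.fence, and that number is only valid
once amdgpu_fence_submitted() has run. Schematically:

   util_queue_fence_wait(&fence->submitted);  /* seq_no is valid after this */
   if (is_noop_fence_dependency(acs, fence))  /* may read fence->fence.fence */
      return;
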
>>
>>   static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
>>                                                struct amdgpu_cs_buffer *buffer)
>> @@ -1297,21 +1302,21 @@ bo_list_error:
>>
>>            for (unsigned i = 0; i < num_dependencies; i++) {
>>               struct amdgpu_fence *fence =
>>                  (struct amdgpu_fence*)cs->fence_dependencies[i];
>>
>>               if (amdgpu_fence_is_syncobj(fence)) {
>>                  num_syncobj_dependencies++;
>>                  continue;
>>               }
>>
>> -            assert(!fence->submission_in_progress);
>> +            assert(util_queue_fence_is_signalled(&fence->submitted));
>>               amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]);
>>            }
>>
>>            chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
>>            chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num;
>>            chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
>>            num_chunks++;
>>         }
>>
>>         /* Syncobj dependencies. */
>> @@ -1320,21 +1325,21 @@ bo_list_error:
>>               alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
>>            unsigned num = 0;
>>
>>            for (unsigned i = 0; i < num_dependencies; i++) {
>>               struct amdgpu_fence *fence =
>>                  (struct amdgpu_fence*)cs->fence_dependencies[i];
>>
>>               if (!amdgpu_fence_is_syncobj(fence))
>>                  continue;
>>
>> -            assert(!fence->submission_in_progress);
>> +            assert(util_queue_fence_is_signalled(&fence->submitted));
>>               sem_chunk[num++].handle = fence->syncobj;
>>            }
>>
>>            chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
>>            chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num;
>>            chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
>>            num_chunks++;
>>         }
>>
>>         assert(num_chunks <= ARRAY_SIZE(chunks));
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>> index b7bc9a20bbf..fbf44b36610 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>> @@ -137,23 +137,25 @@ struct amdgpu_cs {
>>   struct amdgpu_fence {
>>      struct pipe_reference reference;
>>      /* If ctx == NULL, this fence is syncobj-based. */
>>      uint32_t syncobj;
>>
>>      struct amdgpu_winsys *ws;
>>      struct amdgpu_ctx *ctx;  /* submission context */
>>      struct amdgpu_cs_fence fence;
>>      uint64_t *user_fence_cpu_address;
>>
>> -   /* If the fence is unknown due to an IB still being submitted
>> -    * in the other thread. */
>> -   volatile int submission_in_progress; /* bool (int for atomicity) */
>> +   /* If the fence has been submitted. This is unsignalled for deferred fences
>> +    * (cs->next_fence) and while an IB is still being submitted in the submit
>> +    * thread. */
>> +   struct util_queue_fence submitted;
>> +
>>      volatile int signalled;              /* bool (int for atomicity) */
>>   };
>>
>>   static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
>>   {
>>      return fence->ctx == NULL;
>>   }
>>
>>   static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
>>   {
>> @@ -171,20 +173,21 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
>>      struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;
>>
>>      if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
>>         struct amdgpu_fence *fence = *rdst;
>>
>>         if (amdgpu_fence_is_syncobj(fence))
>>            amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
>>         else
>>            amdgpu_ctx_unref(fence->ctx);
>>
>> +      util_queue_fence_destroy(&fence->submitted);
>>         FREE(fence);
>>      }
>>      *rdst = rsrc;
>>   }
>>
>>   int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
>>
>>   static inline struct amdgpu_ib *
>>   amdgpu_ib(struct radeon_winsys_cs *base)
>>   {
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> index 7220f3a0240..add88f80aae 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> @@ -779,28 +779,47 @@ radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
>>      }
>>
>>      fence = radeon_cs_create_fence(rcs);
>>      if (!fence)
>>         return NULL;
>>
>>      radeon_fence_reference(&cs->next_fence, fence);
>>      return fence;
>>   }
>>
>> +static void
>> +radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
>> +                                   struct pipe_fence_handle *fence)
>> +{
>> +   /* TODO: Handle the following unlikely multi-threaded scenario:
>> +    *
>> +    *  Thread 1 / Context 1                   Thread 2 / Context 2
>> +    *  --------------------                   --------------------
>> +    *  f = cs_get_next_fence()
>> +    *                                         cs_add_fence_dependency(f)
>> +    *                                         cs_flush()
>> +    *  cs_flush()
>> +    *
>> +    * We currently assume that this does not happen because we don't support
>> +    * asynchronous flushes on Radeon.
>> +    */
> 
> We do support asynchronous flushes and deferred fences on Radeon. Not
> that it matters.

Well, they're not as asynchronous as on amdgpu, because 
u_threaded_context will sync unconditionally on radeon. You're right 
about deferred fences though.
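
If we do eventually enable them fully, the fix would presumably mirror
the amdgpu side. Purely hypothetical sketch (radeon fences don't
currently carry a "submitted" util_queue_fence; the cast target is an
illustrative name):

   static void
   radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
                                      struct pipe_fence_handle *fence)
   {
      /* Hypothetical: assumes the radeon fence object gained a
       * "submitted" util_queue_fence, reset at creation and signalled
       * after the CS ioctl, exactly like amdgpu_fence in this patch. */
      struct radeon_fence *rfence = (struct radeon_fence *)fence;

      util_queue_fence_wait(&rfence->submitted);
   }
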


> Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Thanks!

> 
> Marek
> 


-- 
Learn what the world is really like,
But never forget what it ought to be.

