[Mesa-dev] [PATCH v2 10/26] gallium/u_threaded: implement asynchronous flushes

Mon Nov 6 20:55:03 UTC 2017

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Asynchronous flushes require out-of-band creation of fences. A special
> TC_FLUSH_ASYNC flag tells the pipe_context::flush implementation that the
> fence it receives has been pre-created.
>
> v2:
> - remove an incorrect assertion
> - handle fence_server_sync for unsubmitted fences by
>   relying on the improved cs_add_fence_dependency
> - only implement asynchronous flushes on amdgpu
> ---
>  src/gallium/auxiliary/util/u_threaded_context.c    |  96 ++++++++++++++++++-
>  src/gallium/auxiliary/util/u_threaded_context.h    |  59 ++++++++++++
>  .../auxiliary/util/u_threaded_context_calls.h      |   1 +
>  src/gallium/drivers/radeonsi/si_fence.c            | 104 ++++++++++++++++-----
>  src/gallium/drivers/radeonsi/si_pipe.c             |   3 +
>  src/gallium/drivers/radeonsi/si_pipe.h             |   2 +
>  6 files changed, 238 insertions(+), 27 deletions(-)
>
> diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
> index 24fab7f5cb6..0bb645e8522 100644
> --- a/src/gallium/auxiliary/util/u_threaded_context.c
> +++ b/src/gallium/auxiliary/util/u_threaded_context.c
> @@ -81,40 +81,47 @@ tc_debug_check(struct threaded_context *tc)
>
>  static void
>  tc_batch_execute(void *job, int thread_index)
>  {
>     struct tc_batch *batch = job;
>     struct pipe_context *pipe = batch->pipe;
>     struct tc_call *last = &batch->call[batch->num_total_call_slots];
>
>     tc_batch_check(batch);
>
> +   assert(!batch->token);
> +
>     for (struct tc_call *iter = batch->call; iter != last;
>          iter += iter->num_call_slots) {
>        tc_assert(iter->sentinel == TC_SENTINEL);
>        execute_func[iter->call_id](pipe, &iter->payload);
>     }
>
>     tc_batch_check(batch);
>     batch->num_total_call_slots = 0;
>  }
>
>  static void
>  tc_batch_flush(struct threaded_context *tc)
>  {
>     struct tc_batch *next = &tc->batch_slots[tc->next];
>
>     tc_assert(next->num_total_call_slots != 0);
>     tc_batch_check(next);
>     tc_debug_check(tc);
>     p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots);
>
> +   if (next->token) {
> +      next->token->tc = NULL;
> +      tc_unflushed_batch_token_reference(&next->token, NULL);
> +   }
> +
>     util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
>                        NULL);
>     tc->last = tc->next;
>     tc->next = (tc->next + 1) % TC_MAX_BATCHES;
>  }
>
>  /* This is the function that adds variable-sized calls into the current
>   * batch. It also flushes the batch if there is not enough space there.
>   * All other higher-level "add" functions use it.
>   */
> @@ -172,40 +179,63 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
>     tc_debug_check(tc);
>
>     /* Only wait for queued calls... */
>     if (!util_queue_fence_is_signalled(&last->fence)) {
>        util_queue_fence_wait(&last->fence);
>        synced = true;
>     }
>
>     tc_debug_check(tc);
>
> +   if (next->token) {
> +      next->token->tc = NULL;
> +      tc_unflushed_batch_token_reference(&next->token, NULL);
> +   }
> +
>     /* .. and execute unflushed calls directly. */
>     if (next->num_total_call_slots) {
>        p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots);
>        tc_batch_execute(next, 0);
>        synced = true;
>     }
>
>     if (synced) {
>        p_atomic_inc(&tc->num_syncs);
>
>        if (tc_strcmp(func, "tc_destroy") != 0)
>           tc_printf("sync %s %s\n", func, info);
>     }
>
>     tc_debug_check(tc);
>  }
>
>  #define tc_sync(tc) _tc_sync(tc, "", __func__)
>  #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
>
> +/**
> + * Call this from fence_finish for same-context fence waits of deferred fences
> + * that haven't been flushed yet.
> + *
> + * The passed pipe_context must be the one passed to pipe_screen::fence_finish,
> + * i.e., the wrapped one.
> + */
> +void
> +threaded_context_flush(struct pipe_context *_pipe,
> +                       struct tc_unflushed_batch_token *token)
> +{
> +   struct threaded_context *tc = threaded_context(_pipe);
> +
> +   /* This is called from the state-tracker / application thread. */
> +   if (token->tc && token->tc == tc)
> +      tc_sync(token->tc);
> +}
> +
>  static void
>  tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
>  {
>     *dst = NULL;
>     pipe_resource_reference(dst, src);
>  }
>
>  void
>  threaded_resource_init(struct pipe_resource *res)
>  {
> @@ -1775,36 +1805,94 @@ tc_create_video_buffer(struct pipe_context *_pipe,
>  {
>     unreachable("Threaded context should not be enabled for video APIs");
>     return NULL;
>  }
>
>
>  /********************************************************************
>   * draw, launch, clear, blit, copy, flush
>   */
>
> +struct tc_flush_payload {
> +   struct pipe_fence_handle *fence;
> +   unsigned flags;
> +};
> +
> +static void
> +tc_call_flush(struct pipe_context *pipe, union tc_payload *payload)
> +{
> +   struct tc_flush_payload *p = (struct tc_flush_payload *)payload;
> +   struct pipe_screen *screen = pipe->screen;
> +
> +   pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
> +   screen->fence_reference(screen, &p->fence, NULL);
> +}
> +
>  static void
>  tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
>           unsigned flags)
>  {
>     struct threaded_context *tc = threaded_context(_pipe);
>     struct pipe_context *pipe = tc->pipe;
> +   struct pipe_screen *screen = pipe->screen;
>     struct threaded_query *tq, *tmp;
> +   bool async = flags & PIPE_FLUSH_DEFERRED;
> +
> +   if (flags & PIPE_FLUSH_ASYNC) {
> +      struct tc_batch *last = &tc->batch_slots[tc->last];
> +
> +      /* Prefer to do the flush in the driver thread, but avoid the inter-thread
> +       * communication overhead if the driver thread is currently idle and the
> +       * caller is going to wait for the fence immediately anyway.
> +       */
> +      if (!(util_queue_fence_is_signalled(&last->fence) &&
> +            (flags & PIPE_FLUSH_HINT_FINISH)))
> +         async = true;
> +   }
> +
> +   if (async && tc->create_fence) {
> +      if (fence) {
> +         struct tc_unflushed_batch_token *token = NULL;
> +         struct tc_batch *next = &tc->batch_slots[tc->next];
> +
> +         if (!next->token) {
> +            next->token = malloc(sizeof(*next->token));
> +            if (!next->token)
> +               goto out_of_memory;
>
> +            pipe_reference_init(&next->token->ref, 1);
> +            next->token->tc = tc;
> +         }
> +
> +         screen->fence_reference(screen, fence, tc->create_fence(pipe, token));
> +         if (!*fence)
> +            goto out_of_memory;
> +      }
> +
> +      struct tc_flush_payload *p =
> +         tc_add_struct_typed_call(tc, TC_CALL_flush, tc_flush_payload);
> +      p->fence = fence ? *fence : NULL;
> +      p->flags = flags | TC_FLUSH_ASYNC;
> +
> +      if (!(flags & PIPE_FLUSH_DEFERRED))
> +         tc_batch_flush(tc);
> +      return;
> +   }
> +
> +out_of_memory:
>     if (!(flags & PIPE_FLUSH_DEFERRED)) {
>        LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
>           tq->flushed = true;
>           LIST_DEL(&tq->head_unflushed);
>        }
>     }
>
> -   /* TODO: deferred flushes? */
>     tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
>                     flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
>     pipe->flush(pipe, fence, flags);
>  }
>
>  /* This is actually variable-sized, because indirect isn't allocated if it's
>   * not needed. */
>  struct tc_full_draw_info {
>     struct pipe_draw_info draw;
>     struct pipe_draw_indirect_info indirect;
> @@ -2240,22 +2328,24 @@ tc_destroy(struct pipe_context *_pipe)
>        u_upload_destroy(tc->base.const_uploader);
>
>     if (tc->base.stream_uploader)
>        u_upload_destroy(tc->base.stream_uploader);
>
>     tc_sync(tc);
>
>     if (util_queue_is_initialized(&tc->queue)) {
>        util_queue_destroy(&tc->queue);
>
> -      for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
> +      for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
>           util_queue_fence_destroy(&tc->batch_slots[i].fence);
> +         assert(!tc->batch_slots[i].token);
> +      }
>     }
>
>     slab_destroy_child(&tc->pool_transfers);
>     assert(tc->batch_slots[tc->next].num_total_call_slots == 0);
>     pipe->destroy(pipe);
>     os_free_aligned(tc);
>  }
>
>  static const tc_execute execute_func[TC_NUM_CALLS] = {
>  #define CALL(name) tc_call_##name,
> @@ -2272,20 +2362,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = {
>   *                             in pipe_screen.
>   * \param replace_buffer  callback for replacing a pipe_resource's storage
>   *                        with another pipe_resource's storage.
>   * \param out  if successful, the threaded_context will be returned here in
>   *             addition to the return value if "out" != NULL
>   */
>  struct pipe_context *
>  threaded_context_create(struct pipe_context *pipe,
>                          struct slab_parent_pool *parent_transfer_pool,
>                          tc_replace_buffer_storage_func replace_buffer,
> +                        tc_create_fence_func create_fence,
>                          struct threaded_context **out)
>  {
>     struct threaded_context *tc;
>
>     STATIC_ASSERT(sizeof(union tc_payload) <= 8);
>     STATIC_ASSERT(sizeof(struct tc_call) <= 16);
>
>     if (!pipe)
>        return NULL;
>
> @@ -2306,20 +2397,21 @@ threaded_context_create(struct pipe_context *pipe,
>     assert(offsetof(struct threaded_context, batch_slots) % 16 == 0);
>     assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0);
>     assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
>     assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0);
>
>     /* The driver context isn't wrapped, so set its "priv" to NULL. */
>     pipe->priv = NULL;
>
>     tc->pipe = pipe;
>     tc->replace_buffer_storage = replace_buffer;
> +   tc->create_fence = create_fence;
>     tc->map_buffer_alignment =
>        pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
>     tc->base.priv = pipe; /* priv points to the wrapped driver context */
>     tc->base.screen = pipe->screen;
>     tc->base.destroy = tc_destroy;
>
>     tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
>     if (pipe->stream_uploader == pipe->const_uploader)
>        tc->base.const_uploader = tc->base.stream_uploader;
>     else
> diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
> index 57805ee4a1e..7642a39dc3a 100644
> --- a/src/gallium/auxiliary/util/u_threaded_context.h
> +++ b/src/gallium/auxiliary/util/u_threaded_context.h
> @@ -101,20 +101,43 @@
>   * 3) The driver isn't allowed to do buffer invalidations by itself under any
>   *    circumstances. This is necessary for unsychronized maps to map the latest
>   *    version of the buffer. (because invalidations can be queued, while
>   *    unsychronized maps are not queued and they should return the latest
>   *    storage after invalidation). The threaded context always sends
>   *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
>   *    indicate this. Ignoring the flag will lead to failures.
>   *    The threaded context uses its own buffer invalidation mechanism.
>   *
>   *
> + * Rules for fences
> + * ----------------
> + *
> + * Flushes will be executed asynchronously in the driver thread if a
> + * create_fence callback is provided. This affects fence semantics as follows.
> + *
> + * When the threaded context wants to perform an asynchronous flush, it will
> + * use the create_fence callback to pre-create the fence from the calling
> + * thread. This pre-created fence will be passed to pipe_context::flush
> + * together with the TC_FLUSH_ASYNC flag.
> + *
> + * The callback receives the unwrapped context as a parameter, but must use it
> + * in a thread-safe way because it is called from a non-driver thread.
> + *
> + * If the threaded_context does not immediately flush the current batch, the
> + * callback also receives a tc_unflushed_batch_token. If fence_finish is then
> + * called on the returned fence from the context that created it, the driver
> + * must call threaded_context_flush with that token before waiting.
> + *
> + * The driver must implement pipe_context::fence_server_sync properly, since
> + * the threaded context handles PIPE_FLUSH_ASYNC.
> + *
> + *
>   * Additional requirements
>   * -----------------------
>   *
>   * get_query_result:
>   *    If threaded_query::flushed == true, get_query_result should assume that
>   *    it's called from a non-driver thread, in which case the driver shouldn't
>   *    use the context in an unsafe way.
>   *
>   * replace_buffer_storage:
>   *    The driver has to implement this callback, which will be called when
> @@ -153,32 +176,40 @@
>   * The batches are ordered in a ring and reused once they are idle again.
>   * The batching is necessary for low queue/mutex overhead.
>   *
>   */
>
>  #ifndef U_THREADED_CONTEXT_H
>  #define U_THREADED_CONTEXT_H
>
>  #include "pipe/p_context.h"
>  #include "pipe/p_state.h"
> +#include "util/u_inlines.h"
>  #include "util/u_queue.h"
>  #include "util/u_range.h"
>  #include "util/slab.h"
>
> +struct threaded_context;
> +struct tc_unflushed_batch_token;
> +
>  /* These are transfer flags sent to drivers. */
>  /* Never infer whether it's safe to use unsychronized mappings: */
>  #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
>  /* Don't invalidate buffers: */
>  #define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
>  /* transfer_map is called from a non-driver thread: */
>  #define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
>
> +/* Custom flush flags sent to drivers. */
> +/* fence is pre-populated with a fence created by the create_fence callback */
> +#define TC_FLUSH_ASYNC        (1u << 31)
> +
>  /* Size of the queue = number of batch slots in memory.
>   * - 1 batch is always idle and records new commands
>   * - 1 batch is being executed
>   * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
>   *
>   * Use a size as small as possible for low CPU L2 cache usage but large enough
>   * so that the queue isn't stalled too often for not having enough idle batch
>   * slots.
>   */
>  #define TC_MAX_BATCHES        10
> @@ -197,20 +228,22 @@
>  /* Threshold for when to enqueue buffer/texture_subdata as-is.
>   * If the upload size is greater than this, it will do instead:
>   * - for buffers: DISCARD_RANGE is done by the threaded context
>   * - for textures: sync and call the driver directly
>   */
>  #define TC_MAX_SUBDATA_BYTES        320
>
>  typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
>                                                 struct pipe_resource *dst,
>                                                 struct pipe_resource *src);
> +typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
> +                                                          struct tc_unflushed_batch_token *token);
>
>  struct threaded_resource {
>     struct pipe_resource b;
>     const struct u_resource_vtbl *vtbl;
>
>     /* Since buffer invalidations are queued, we can't use the base resource
>      * for unsychronized mappings. This points to the latest version of
>      * the buffer after the latest invalidation. It's only used for unsychro-
>      * nized mappings in the non-driver thread. Initially it's set to &b.
>      */
> @@ -280,33 +313,45 @@ union tc_payload {
>  #endif
>
>  /* Each call slot should be aligned to its own size for optimal cache usage. */
>  struct ALIGN16 tc_call {
>     unsigned sentinel;
>     ushort num_call_slots;
>     ushort call_id;
>     union tc_payload payload;
>  };
>
> +/**
> + * A token representing an unflushed batch.
> + *
> + * See the general rules for fences for an explanation.
> + */
> +struct tc_unflushed_batch_token {
> +   struct pipe_reference ref;
> +   struct threaded_context *tc;
> +};
> +
>  struct tc_batch {
>     struct pipe_context *pipe;
>     unsigned sentinel;
>     unsigned num_total_call_slots;
> +   struct tc_unflushed_batch_token *token;
>     struct util_queue_fence fence;
>     struct tc_call call[TC_CALLS_PER_BATCH];
>  };
>
>  struct threaded_context {
>     struct pipe_context base;
>     struct pipe_context *pipe;
>     struct slab_child_pool pool_transfers;
>     tc_replace_buffer_storage_func replace_buffer_storage;
> +   tc_create_fence_func create_fence;
>     unsigned map_buffer_alignment;
>
>     struct list_head unflushed_queries;
>
>     /* Counters for the HUD. */
>     unsigned num_offloaded_slots;
>     unsigned num_direct_slots;
>     unsigned num_syncs;
>
>     struct util_queue queue;
> @@ -317,22 +362,27 @@ struct threaded_context {
>  };
>
>  void threaded_resource_init(struct pipe_resource *res);
>  void threaded_resource_deinit(struct pipe_resource *res);
>  struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
>
>  struct pipe_context *
>  threaded_context_create(struct pipe_context *pipe,
>                          struct slab_parent_pool *parent_transfer_pool,
>                          tc_replace_buffer_storage_func replace_buffer,
> +                        tc_create_fence_func create_fence,
>                          struct threaded_context **out);
>
> +void
> +threaded_context_flush(struct pipe_context *_pipe,
> +                       struct tc_unflushed_batch_token *token);
> +
>  static inline struct threaded_context *
>  threaded_context(struct pipe_context *pipe)
>  {
>     return (struct threaded_context*)pipe;
>  }
>
>  static inline struct threaded_resource *
>  threaded_resource(struct pipe_resource *res)
>  {
>     return (struct threaded_resource*)res;
> @@ -343,11 +393,20 @@ threaded_query(struct pipe_query *q)
>  {
>     return (struct threaded_query*)q;
>  }
>
>  static inline struct threaded_transfer *
>  threaded_transfer(struct pipe_transfer *transfer)
>  {
>     return (struct threaded_transfer*)transfer;
>  }
>
> +static inline void
> +tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
> +                                   struct tc_unflushed_batch_token *src)
> +{
> +   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
> +      free(*dst);
> +   *dst = src;
> +}
> +
>  #endif
> diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h
> index 546819a2580..1356c54baf2 100644
> --- a/src/gallium/auxiliary/util/u_threaded_context_calls.h
> +++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h
> @@ -1,10 +1,11 @@
> +CALL(flush)
>  CALL(destroy_query)
>  CALL(begin_query)
>  CALL(end_query)
>  CALL(get_query_result_resource)
>  CALL(render_condition)
>  CALL(bind_sampler_states)
>  CALL(set_framebuffer_state)
>  CALL(set_tess_state)
>  CALL(set_constant_buffer)
>  CALL(set_scissor_states)
> diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
> index b416c47aa30..701e8df9cfc 100644
> --- a/src/gallium/drivers/radeonsi/si_fence.c
> +++ b/src/gallium/drivers/radeonsi/si_fence.c
> @@ -19,27 +19,30 @@
>   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>   * SOFTWARE.
>   *
>   */
>
>  #include <libsync.h>
>
>  #include "util/os_time.h"
>  #include "util/u_memory.h"
> +#include "util/u_queue.h"
>
>  #include "si_pipe.h"
>
>  struct si_multi_fence {
>         struct pipe_reference reference;
>         struct pipe_fence_handle *gfx;
>         struct pipe_fence_handle *sdma;
> +       struct tc_unflushed_batch_token *tc_token;
> +       struct util_queue_fence ready;
>
>         /* If the context wasn't flushed at fence creation, this is non-NULL. */
>         struct {
>                 struct r600_common_context *ctx;
>                 unsigned ib_index;
>         } gfx_unflushed;
>  };
>
>  static void si_add_fence_dependency(struct r600_common_context *rctx,
>                                     struct pipe_fence_handle *fence)
> @@ -55,46 +58,62 @@ static void si_fence_reference(struct pipe_screen *screen,
>                                struct pipe_fence_handle **dst,
>                                struct pipe_fence_handle *src)
>  {
>         struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
>         struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
>         struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
>
>         if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
>                 ws->fence_reference(&(*rdst)->gfx, NULL);
>                 ws->fence_reference(&(*rdst)->sdma, NULL);
> +               tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
>                 FREE(*rdst);
>         }
>          *rdst = rsrc;
>  }
>
> +static struct si_multi_fence *si_create_multi_fence()
> +{
> +       struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
> +       if (!fence)
> +               return NULL;
> +
> +       pipe_reference_init(&fence->reference, 1);
> +       util_queue_fence_init(&fence->ready);
> +
> +       return fence;
> +}
> +
> +struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
> +                                         struct tc_unflushed_batch_token *tc_token)
> +{
> +       struct si_multi_fence *fence = si_create_multi_fence();
> +       if (!fence)
> +               return NULL;
> +
> +       util_queue_fence_reset(&fence->ready);
> +       tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
> +
> +       return (struct pipe_fence_handle *)fence;
> +}
> +
>  static void si_fence_server_sync(struct pipe_context *ctx,
>                                  struct pipe_fence_handle *fence)
>  {
>         struct r600_common_context *rctx = (struct r600_common_context *)ctx;
>         struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
>
> -       /* Only amdgpu needs to handle fence dependencies (for fence imports).
> -        * radeon synchronizes all rings by default and will not implement
> -        * fence imports.
> -        */
> -       if (rctx->screen->info.drm_major == 2)
> -               return;
> +       util_queue_fence_wait(&rfence->ready);
>
> -       /* Only imported fences need to be handled by fence_server_sync,
> -        * because the winsys handles synchronizations automatically for BOs
> -        * within the process.
> -        *
> -        * Simply skip unflushed fences here, and the winsys will drop no-op
> -        * dependencies (i.e. dependencies within the same ring).
> -        */
> -       if (rfence->gfx_unflushed.ctx)
> +       /* Unflushed fences from the same context are no-ops. */
> +       if (rfence->gfx_unflushed.ctx &&
> +           rfence->gfx_unflushed.ctx == rctx)
>                 return;
>
>         /* All unflushed commands will not start execution before
>          * this fence dependency is signalled.
>          *
>          * Should we flush the context to allow more GPU parallelism?
>          */
>         if (rfence->sdma)
>                 si_add_fence_dependency(rctx, rfence->sdma);
>         if (rfence->gfx)
> @@ -107,20 +126,44 @@ static boolean si_fence_finish(struct pipe_screen *screen,
>                                uint64_t timeout)
>  {
>         struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
>         struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
>         struct r600_common_context *rctx;
>         int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
>
>         ctx = threaded_context_unwrap_sync(ctx);
>         rctx = ctx ? (struct r600_common_context*)ctx : NULL;
>
> +       if (!util_queue_fence_is_signalled(&rfence->ready)) {
> +               if (!timeout)
> +                       return false;
> +
> +               if (rfence->tc_token) {
> +                       /* Ensure that si_flush_from_st will be called for
> +                        * this fence, but only if we're in the API thread
> +                        * where the context is current.
> +                        *
> +                        * Note that the batch containing the flush may already
> +                        * be in flight in the driver thread, so the fence
> +                        * may not be ready yet when this call returns.
> +                        */
> +                       threaded_context_flush(ctx, rfence->tc_token);
> +               }
> +
> +               if (timeout == PIPE_TIMEOUT_INFINITE) {
> +                       util_queue_fence_wait(&rfence->ready);
> +               } else {
> +                       if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout))
> +                               return false;
> +               }
> +       }
> +
>         if (rfence->sdma) {
>                 if (!rws->fence_wait(rws, rfence->sdma, timeout))
>                         return false;
>
>                 /* Recompute the timeout after waiting. */
>                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
>                         int64_t time = os_time_get_nano();
>                         timeout = abs_timeout > time ? abs_timeout - time : 0;
>                 }
>         }
> @@ -153,45 +196,46 @@ static void si_create_fence_fd(struct pipe_context *ctx,
>  {
>         struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
>         struct radeon_winsys *ws = rscreen->ws;
>         struct si_multi_fence *rfence;
>
>         *pfence = NULL;
>
>         if (!rscreen->info.has_sync_file)
>                 return;
>
> -       rfence = CALLOC_STRUCT(si_multi_fence);
> +       rfence = si_create_multi_fence();
>         if (!rfence)
>                 return;
>
> -       pipe_reference_init(&rfence->reference, 1);
>         rfence->gfx = ws->fence_import_sync_file(ws, fd);
>         if (!rfence->gfx) {
>                 FREE(rfence);
>                 return;
>         }
>
>         *pfence = (struct pipe_fence_handle*)rfence;
>  }
>
>  static int si_fence_get_fd(struct pipe_screen *screen,
>                            struct pipe_fence_handle *fence)
>  {
>         struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
>         struct radeon_winsys *ws = rscreen->ws;
>         struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
>         int gfx_fd = -1, sdma_fd = -1;
>
>         if (!rscreen->info.has_sync_file)
>                 return -1;
>
> +       util_queue_fence_wait(&rfence->ready);
> +
>         /* Deferred fences aren't supported. */
>         assert(!rfence->gfx_unflushed.ctx);
>         if (rfence->gfx_unflushed.ctx)
>                 return -1;
>
>         if (rfence->sdma) {
>                 sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
>                 if (sdma_fd == -1)
>                         return -1;
>         }
> @@ -253,40 +297,50 @@ static void si_flush_from_st(struct pipe_context *ctx,
>                     fence) {
>                         gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
>                         deferred_fence = true;
>                 } else {
>                         rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
>                 }
>         }
>
>         /* Both engines can signal out of order, so we need to keep both fences. */
>         if (fence) {
> -               struct si_multi_fence *multi_fence =
> -                               CALLOC_STRUCT(si_multi_fence);
> -               if (!multi_fence) {
> -                       ws->fence_reference(&sdma_fence, NULL);
> -                       ws->fence_reference(&gfx_fence, NULL);
> -                       goto finish;
> +               struct si_multi_fence *multi_fence;
> +
> +               if (flags & TC_FLUSH_ASYNC) {
> +                       multi_fence = (struct si_multi_fence *)*fence;
> +                       assert(multi_fence);
> +               } else {
> +                       multi_fence = si_create_multi_fence();
> +                       if (!multi_fence) {
> +                               ws->fence_reference(&sdma_fence, NULL);
> +                               ws->fence_reference(&gfx_fence, NULL);
> +                               goto finish;
> +                       }
> +
> +                       screen->fence_reference(screen, fence, NULL);
> +                       *fence = (struct pipe_fence_handle*)multi_fence;
>                 }
>
> -               multi_fence->reference.count = 1;
>                 /* If both fences are NULL, fence_finish will always return true. */
>                 multi_fence->gfx = gfx_fence;
>                 multi_fence->sdma = sdma_fence;
>
>                 if (deferred_fence) {
>                         multi_fence->gfx_unflushed.ctx = rctx;
>                         multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
>                 }
>
> -               screen->fence_reference(screen, fence, NULL);
> -               *fence = (struct pipe_fence_handle*)multi_fence;
> +               if (flags & TC_FLUSH_ASYNC) {
> +                       util_queue_fence_signal(&multi_fence->ready);
> +                       tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
> +               }
>         }
>  finish:
>         if (!(flags & PIPE_FLUSH_DEFERRED)) {
>                 if (rctx->dma.cs)
>                         ws->cs_sync_flush(rctx->dma.cs);
>                 ws->cs_sync_flush(rctx->gfx.cs);
>         }
>  }
>
>  void si_init_fence_functions(struct si_context *ctx)
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 8d7fb52350f..10225353907 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -398,22 +398,25 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
>          * those.
>          */
>         if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG))
>                 return ctx;
>
>         /* When shaders are logged to stderr, asynchronous compilation is
>          * disabled too. */
>         if (sscreen->b.debug_flags & DBG_ALL_SHADERS)
>                 return ctx;
>
> +       /* Use asynchronous flushes only on amdgpu, since the radeon
> +        * implementation for fence_server_sync is incomplete. */
>         return threaded_context_create(ctx, &sscreen->b.pool_transfers,
>                                        si_replace_buffer_storage,
> +                                      sscreen->b.info.drm_major >= 3 ? si_create_fence : NULL,
>                                        &((struct si_context*)ctx)->b.tc);
>  }
>
>  /*
>   * pipe_screen
>   */
>  static bool si_have_tgsi_compute(struct si_screen *sscreen)
>  {
>         /* Old kernels disallowed some register writes for SI
>          * that are used for indirect dispatches. */
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 5253dbc43ea..fc6197ab886 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -596,20 +596,22 @@ void si_init_debug_functions(struct si_context *sctx);
>  void si_check_vm_faults(struct r600_common_context *ctx,
>                         struct radeon_saved_cs *saved, enum ring_type ring);
>  bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
>
>  /* si_dma.c */
>  void si_init_dma_functions(struct si_context *sctx);
>
>  /* si_fence.c */
>  void si_init_fence_functions(struct si_context *ctx);
>  void si_init_screen_fence_functions(struct si_screen *screen);
> +struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
> +                                         struct tc_unflushed_batch_token *tc_token);
>
>  /* si_hw_context.c */
>  void si_destroy_saved_cs(struct si_saved_cs *scs);
>  void si_context_gfx_flush(void *context, unsigned flags,
>                           struct pipe_fence_handle **fence);
>  void si_begin_new_cs(struct si_context *ctx);
>  void si_need_cs_space(struct si_context *ctx);
>
>  /* si_compute.c */
>  void si_init_compute_functions(struct si_context *sctx);
> --
> 2.11.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev