[Mesa-dev] [PATCH] winsys/amdgpu: add back multithreaded command submission

Nicolai Hähnle nhaehnle at gmail.com
Wed May 18 17:48:34 UTC 2016


On 18.05.2016 11:58, Marek Olšák wrote:
> On Sat, May 7, 2016 at 5:12 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> Looks good to me, just two remarks below...
>>
>>
>> On 06.05.2016 13:31, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> Ported from the initial amdgpu winsys from the private AMD branch.
>>>
>>> The thread creates the buffer list, submits IBs, and cleans up
>>> the submission context, which can also destroy buffers.
>>>
>>> 3-5% reduction in CPU overhead is expected for apps submitting a lot
>>> of IBs per frame. This is most visible with DMA IBs.
>>> ---
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     |  26 ++-
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_bo.h     |   4 +
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 311 +++++++++++++++++---------
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_cs.h     |  52 +++--
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c |  61 +++++
>>>    src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |   9 +
>>>    6 files changed, 333 insertions(+), 130 deletions(-)
>>>
>>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>>> index 37a41c0..ec5fa6a 100644
>>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>>> @@ -43,8 +43,21 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
>>>    {
>>>       struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>>>       struct amdgpu_winsys *ws = bo->ws;
>>> +   int64_t abs_timeout;
>>>       int i;
>>>
>>> +   if (timeout == 0) {
>>> +      if (p_atomic_read(&bo->num_active_ioctls))
>>> +         return false;
>>> +
>>> +   } else {
>>> +      abs_timeout = os_time_get_absolute_timeout(timeout);
>>> +
>>> +      /* Wait if any ioctl is being submitted with this buffer. */
>>> +      if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
>>> +         return false;
>>> +   }
>>
>>
>> I'd suggest doing the cs_sync_flush here instead of below - there is less
>> action at a distance, and some additional code paths end up covered by a
>> flush as well.
>
> Unfortunately, amdgpu_bo_wait is exposed via the winsys interface and
> doesn't accept a CS. We could extend it to accept a CS or two, but
> that would mean adding most of what r600_buffer_map_sync_with_rings is
> doing. It would be a bigger cleanup and I think it should be done as a
> separate patch if we wanted to go down that road.

Okay, fair enough. Let's keep an eye out for whether some use cases get 
into busy waits, just in case.

The amdgpu_cs_sync_flush should be added to the other branch of the 
PIPE_TRANSFER_WRITE check in amdgpu_bo_map though, right?
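
Just to illustrate what I mean - an untested sketch, and I'm guessing at
the exact shape of the read branch around it:

             /* Mapping for read. */
             if (cs) {
                if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                             RADEON_USAGE_WRITE)) {
                   cs->flush_cs(cs->flush_data, 0, NULL);
                } else {
                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                   if (p_atomic_read(&bo->num_active_ioctls))
                      amdgpu_cs_sync_flush(rcs);
                }
             }
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_WRITE);

That way a read mapping also waits for the submission thread instead of
spinning in amdgpu_bo_wait.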

>
>>
>>
>>> +
>>>       if (bo->is_shared) {
>>>          /* We can't use user fences for shared buffers, because user fences
>>>           * are local to this process only. If we want to wait for all buffer
>>> @@ -61,7 +74,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
>>>       }
>>>
>>>       if (timeout == 0) {
>>> -      /* Timeout == 0 is quite simple. */
>>>          pipe_mutex_lock(ws->bo_fence_lock);
>>>          for (i = 0; i < RING_LAST; i++)
>>>             if (bo->fence[i]) {
>>> @@ -80,7 +92,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
>>>          struct pipe_fence_handle *fence[RING_LAST] = {};
>>>          bool fence_idle[RING_LAST] = {};
>>>          bool buffer_idle = true;
>>> -      int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
>>>
>>>          /* Take references to all fences, so that we can wait for them
>>>           * without the lock. */
>>> @@ -214,8 +225,15 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
>>>                               RADEON_USAGE_WRITE);
>>>             } else {
>>>                /* Mapping for write. */
>>> -            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo))
>>> -               cs->flush_cs(cs->flush_data, 0, NULL);
>>> +            if (cs) {
>>> +               if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
>>> +                  cs->flush_cs(cs->flush_data, 0, NULL);
>>> +               } else {
>>> +                  /* Try to avoid busy-waiting in radeon_bo_wait. */
>>> +                  if (p_atomic_read(&bo->num_active_ioctls))
>>> +                     amdgpu_cs_sync_flush(rcs);
>>> +               }
>>> +            }
>>>
>>>                amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
>>>                               RADEON_USAGE_READWRITE);
>>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>>> index 69ada10..a768771 100644
>>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>>> @@ -53,6 +53,10 @@ struct amdgpu_winsys_bo {
>>>       /* how many command streams is this bo referenced in? */
>>>       int num_cs_references;
>>>
>>> +   /* how many command streams, which are being emitted in a separate
>>> +    * thread, is this bo referenced in? */
>>> +   volatile int num_active_ioctls;
>>> +
>>>       /* whether buffer_get_handle or buffer_from_handle was called,
>>>        * it can only transition from false to true
>>>        */
>>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>>> index 03e45a9..419523b 100644
>>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
>>> @@ -50,6 +50,7 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
>>>       fence->fence.ip_type = ip_type;
>>>       fence->fence.ip_instance = ip_instance;
>>>       fence->fence.ring = ring;
>>> +   fence->submission_in_progress = true;
>>>       p_atomic_inc(&ctx->refcount);
>>>       return (struct pipe_fence_handle *)fence;
>>>    }
>>> @@ -62,6 +63,7 @@ static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
>>>
>>>       rfence->fence.fence = request->seq_no;
>>>       rfence->user_fence_cpu_address = user_fence_cpu_address;
>>> +   rfence->submission_in_progress = false;
>>>    }
>>>
>>>    static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
>>> @@ -69,6 +71,7 @@ static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
>>>       struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>>>
>>>       rfence->signalled = true;
>>> +   rfence->submission_in_progress = false;
>>>    }
>>>
>>>    bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
>>> @@ -88,6 +91,13 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
>>>       else
>>>          abs_timeout = os_time_get_absolute_timeout(timeout);
>>>
>>> +   /* The fence might not have a number assigned if its IB is being
>>> +    * submitted in the other thread right now. Wait until the submission
>>> +    * is done. */
>>> +   if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
>>> +                                       abs_timeout))
>>> +      return false;
>>> +
>>>       user_fence_cpu = rfence->user_fence_cpu_address;
>>>       if (user_fence_cpu && *user_fence_cpu >= rfence->fence.fence) {
>>>          rfence->signalled = true;
>>> @@ -257,7 +267,7 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
>>>       return true;
>>>    }
>>>
>>> -static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
>>> +static boolean amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
>>>                                          enum ring_type ring_type)
>>>    {
>>>       int i;
>>> @@ -308,10 +318,18 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
>>>       for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
>>>          cs->buffer_indices_hashlist[i] = -1;
>>>       }
>>> +
>>> +   cs->request.number_of_ibs = 1;
>>> +   cs->request.ibs = &cs->ib[IB_MAIN];
>>> +
>>> +   cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
>>> +   cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE |
>>> +                                     AMDGPU_IB_FLAG_PREAMBLE;
>>> +
>>>       return TRUE;
>>>    }
>>>
>>> -static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
>>> +static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
>>>    {
>>>       unsigned i;
>>>
>>> @@ -325,13 +343,14 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
>>>       cs->num_buffers = 0;
>>>       cs->used_gart = 0;
>>>       cs->used_vram = 0;
>>> +   amdgpu_fence_reference(&cs->fence, NULL);
>>>
>>>       for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
>>>          cs->buffer_indices_hashlist[i] = -1;
>>>       }
>>>    }
>>>
>>> -static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs)
>>> +static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
>>>    {
>>>       amdgpu_cs_context_cleanup(cs);
>>>       FREE(cs->flags);
>>> @@ -356,24 +375,35 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
>>>          return NULL;
>>>       }
>>>
>>> +   pipe_semaphore_init(&cs->flush_completed, 1);
>>> +
>>>       cs->ctx = ctx;
>>>       cs->flush_cs = flush;
>>>       cs->flush_data = flush_ctx;
>>>       cs->ring_type = ring_type;
>>>
>>> -   if (!amdgpu_init_cs_context(cs, ring_type)) {
>>> +   if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
>>>          FREE(cs);
>>>          return NULL;
>>>       }
>>>
>>> -   if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN)) {
>>> -      amdgpu_destroy_cs_context(cs);
>>> +   if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
>>> +      amdgpu_destroy_cs_context(&cs->csc1);
>>>          FREE(cs);
>>>          return NULL;
>>>       }
>>>
>>> -   cs->request.number_of_ibs = 1;
>>> -   cs->request.ibs = &cs->ib[IB_MAIN];
>>> +   /* Set the first submission context as current. */
>>> +   cs->csc = &cs->csc1;
>>> +   cs->cst = &cs->csc2;
>>> +
>>> +   if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->csc->ib[IB_MAIN],
>>> +                          IB_MAIN)) {
>>> +      amdgpu_destroy_cs_context(&cs->csc2);
>>> +      amdgpu_destroy_cs_context(&cs->csc1);
>>> +      FREE(cs);
>>> +      return NULL;
>>> +   }
>>>
>>>       p_atomic_inc(&ctx->ws->num_cs);
>>>       return &cs->main.base;
>>> @@ -389,12 +419,15 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
>>>       if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
>>>          return NULL;
>>>
>>> -   if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST))
>>> +   if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->csc->ib[IB_CONST],
>>> +                          IB_CONST))
>>>          return NULL;
>>>
>>> -   cs->request.number_of_ibs = 2;
>>> -   cs->request.ibs = &cs->ib[IB_CONST];
>>> -   cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
>>> +   cs->csc->request.number_of_ibs = 2;
>>> +   cs->csc->request.ibs = &cs->csc->ib[IB_CONST];
>>> +
>>> +   cs->cst->request.number_of_ibs = 2;
>>> +   cs->cst->request.ibs = &cs->cst->ib[IB_CONST];
>>>
>>>       return &cs->const_ib.base;
>>>    }
>>> @@ -412,19 +445,21 @@ amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
>>>          return NULL;
>>>
>>>       if (!amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
>>> -                                 &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
>>> +                          &cs->csc->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
>>>          return NULL;
>>>
>>> -   cs->request.number_of_ibs = 3;
>>> -   cs->request.ibs = &cs->ib[IB_CONST_PREAMBLE];
>>> -   cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | AMDGPU_IB_FLAG_PREAMBLE;
>>> +   cs->csc->request.number_of_ibs = 3;
>>> +   cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE];
>>> +
>>> +   cs->cst->request.number_of_ibs = 3;
>>> +   cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE];
>>>
>>>       return &cs->const_preamble_ib.base;
>>>    }
>>>
>>>    #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
>>>
>>> -int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
>>> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
>>>    {
>>>       unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
>>>       int i = cs->buffer_indices_hashlist[hash];
>>> @@ -452,13 +487,14 @@ int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
>>>       return -1;
>>>    }
>>>
>>> -static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs,
>>> +static unsigned amdgpu_add_buffer(struct amdgpu_cs *acs,
>>>                                     struct amdgpu_winsys_bo *bo,
>>>                                     enum radeon_bo_usage usage,
>>>                                     enum radeon_bo_domain domains,
>>>                                     unsigned priority,
>>>                                     enum radeon_bo_domain *added_domains)
>>>    {
>>> +   struct amdgpu_cs_context *cs = acs->csc;
>>>       struct amdgpu_cs_buffer *buffer;
>>>       unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
>>>       int i = -1;
>>> @@ -526,9 +562,9 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
>>>                                         priority, &added_domains);
>>>
>>>       if (added_domains & RADEON_DOMAIN_VRAM)
>>> -      cs->used_vram += bo->base.size;
>>> +      cs->csc->used_vram += bo->base.size;
>>>       else if (added_domains & RADEON_DOMAIN_GTT)
>>> -      cs->used_gart += bo->base.size;
>>> +      cs->csc->used_gart += bo->base.size;
>>>
>>>       return index;
>>>    }
>>> @@ -538,7 +574,7 @@ static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
>>>    {
>>>       struct amdgpu_cs *cs = amdgpu_cs(rcs);
>>>
>>> -   return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf);
>>> +   return amdgpu_lookup_buffer(cs->csc, (struct amdgpu_winsys_bo*)buf);
>>>    }
>>>
>>>    static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
>>> @@ -551,8 +587,8 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
>>>       struct amdgpu_cs *cs = amdgpu_cs(rcs);
>>>       struct amdgpu_winsys *ws = cs->ctx->ws;
>>>
>>> -   vram += cs->used_vram;
>>> -   gtt += cs->used_gart;
>>> +   vram += cs->csc->used_vram;
>>> +   gtt += cs->csc->used_gart;
>>>
>>>       /* Anything that goes above the VRAM size should go to GTT. */
>>>       if (vram > ws->info.vram_size)
>>> @@ -565,7 +601,7 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
>>>    static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
>>>                                              struct radeon_bo_list_item *list)
>>>    {
>>> -    struct amdgpu_cs *cs = amdgpu_cs(rcs);
>>> +    struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
>>>        int i;
>>>
>>>        if (list) {
>>> @@ -578,26 +614,18 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
>>>        return cs->num_buffers;
>>>    }
>>>
>>> -static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
>>> -                                    struct pipe_fence_handle **out_fence)
>>> -{
>>> -   struct amdgpu_winsys *ws = cs->ctx->ws;
>>> -   struct pipe_fence_handle *fence;
>>> -   int i, j, r;
>>> +DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
>>>
>>> -   /* Create a fence. */
>>> -   fence = amdgpu_fence_create(cs->ctx,
>>> -                               cs->request.ip_type,
>>> -                               cs->request.ip_instance,
>>> -                               cs->request.ring);
>>> -   if (out_fence)
>>> -      amdgpu_fence_reference(out_fence, fence);
>>> +/* Since the kernel driver doesn't synchronize execution between different
>>> + * rings automatically, we have to add fence dependencies manually.
>>> + */
>>> +static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
>>> +{
>>> +   struct amdgpu_cs_context *cs = acs->csc;
>>> +   int i, j;
>>>
>>>       cs->request.number_of_dependencies = 0;
>>>
>>> -   /* Since the kernel driver doesn't synchronize execution between different
>>> -    * rings automatically, we have to add fence dependencies manually. */
>>> -   pipe_mutex_lock(ws->bo_fence_lock);
>>>       for (i = 0; i < cs->num_buffers; i++) {
>>>          for (j = 0; j < RING_LAST; j++) {
>>>             struct amdgpu_cs_fence *dep;
>>> @@ -607,7 +635,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
>>>             if (!bo_fence)
>>>                continue;
>>>
>>> -         if (bo_fence->ctx == cs->ctx &&
>>> +         if (bo_fence->ctx == acs->ctx &&
>>>                 bo_fence->fence.ip_type == cs->request.ip_type &&
>>>                 bo_fence->fence.ip_instance == cs->request.ip_instance &&
>>>                 bo_fence->fence.ring == cs->request.ring)
>>> @@ -616,6 +644,10 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
>>>             if (amdgpu_fence_wait((void *)bo_fence, 0, false))
>>>                continue;
>>>
>>> +         if (bo_fence->submission_in_progress)
>>> +            os_wait_until_zero(&bo_fence->submission_in_progress,
>>> +                               PIPE_TIMEOUT_INFINITE);
>>> +
>>>             idx = cs->request.number_of_dependencies++;
>>>             if (idx >= cs->max_dependencies) {
>>>                unsigned size;
>>> @@ -629,14 +661,62 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
>>>             memcpy(dep, &bo_fence->fence, sizeof(*dep));
>>>          }
>>>       }
>>> +}
>>> +
>>> +void amdgpu_cs_submit_ib(struct amdgpu_cs *acs)
>>> +{
>>> +   struct amdgpu_winsys *ws = acs->ctx->ws;
>>> +   struct amdgpu_cs_context *cs = acs->cst;
>>> +   int i, r;
>>>
>>>       cs->request.fence_info.handle = NULL;
>>> -   if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) {
>>> -       cs->request.fence_info.handle = cs->ctx->user_fence_bo;
>>> -       cs->request.fence_info.offset = cs->ring_type;
>>> +   if (cs->request.ip_type != AMDGPU_HW_IP_UVD &&
>>> +       cs->request.ip_type != AMDGPU_HW_IP_VCE) {
>>> +       cs->request.fence_info.handle = acs->ctx->user_fence_bo;
>>> +       cs->request.fence_info.offset = acs->ring_type;
>>> +   }
>>> +
>>> +   /* Create the buffer list.
>>> +    * Use a buffer list containing all allocated buffers if requested.
>>> +    */
>>> +   if (debug_get_option_all_bos()) {
>>> +      struct amdgpu_winsys_bo *bo;
>>> +      amdgpu_bo_handle *handles;
>>> +      unsigned num = 0;
>>> +
>>> +      pipe_mutex_lock(ws->global_bo_list_lock);
>>> +
>>> +      handles = malloc(sizeof(handles[0]) * ws->num_buffers);
>>> +      if (!handles) {
>>> +         pipe_mutex_unlock(ws->global_bo_list_lock);
>>> +         amdgpu_cs_context_cleanup(cs);
>>> +         return;
>>> +      }
>>> +
>>> +      LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
>>> +         assert(num < ws->num_buffers);
>>> +         handles[num++] = bo->bo;
>>> +      }
>>> +
>>> +      r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
>>> +                                handles, NULL,
>>> +                                &cs->request.resources);
>>> +      free(handles);
>>> +      pipe_mutex_unlock(ws->global_bo_list_lock);
>>> +   } else {
>>> +      r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
>>> +                                cs->handles, cs->flags,
>>> +                                &cs->request.resources);
>>>       }
>>>
>>> -   r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1);
>>> +   if (r) {
>>> +      fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
>>> +      cs->request.resources = NULL;
>>> +      amdgpu_fence_signalled(cs->fence);
>>> +      goto cleanup;
>>> +   }
>>> +
>>> +   r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1);
>>>       if (r) {
>>>          if (r == -ENOMEM)
>>>             fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
>>> @@ -644,30 +724,43 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
>>>             fprintf(stderr, "amdgpu: The CS has been rejected, "
>>>                     "see dmesg for more information.\n");
>>>
>>> -      amdgpu_fence_signalled(fence);
>>> +      amdgpu_fence_signalled(cs->fence);
>>>       } else {
>>>          /* Success. */
>>>          uint64_t *user_fence = NULL;
>>> -      if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE)
>>> -         user_fence = cs->ctx->user_fence_cpu_address_base +
>>> +      if (cs->request.ip_type != AMDGPU_HW_IP_UVD &&
>>> +          cs->request.ip_type != AMDGPU_HW_IP_VCE)
>>> +         user_fence = acs->ctx->user_fence_cpu_address_base +
>>>                          cs->request.fence_info.offset;
>>> -      amdgpu_fence_submitted(fence, &cs->request, user_fence);
>>> -
>>> -      for (i = 0; i < cs->num_buffers; i++)
>>> -         amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->ring_type],
>>> -                                fence);
>>> +      amdgpu_fence_submitted(cs->fence, &cs->request, user_fence);
>>>       }
>>> -   pipe_mutex_unlock(ws->bo_fence_lock);
>>> -   amdgpu_fence_reference(&fence, NULL);
>>> +
>>> +   /* Cleanup. */
>>> +   if (cs->request.resources)
>>> +      amdgpu_bo_list_destroy(cs->request.resources);
>>> +
>>> +cleanup:
>>> +   for (i = 0; i < cs->num_buffers; i++)
>>> +      p_atomic_dec(&cs->buffers[i].bo->num_active_ioctls);
>>> +
>>> +   amdgpu_cs_context_cleanup(cs);
>>>    }
>>>
>>> -static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
>>> +/* Make sure the previous submission is completed. */
>>> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
>>>    {
>>> -   /* no-op */
>>> +   struct amdgpu_cs *cs = amdgpu_cs(rcs);
>>> +
>>> +   /* Wait for any pending ioctl of this CS to complete. */
>>> +   if (cs->ctx->ws->thread) {
>>> +      /* wait and set the semaphore to "busy" */
>>> +      pipe_semaphore_wait(&cs->flush_completed);
>>> +      /* set the semaphore to "idle" */
>>> +      pipe_semaphore_signal(&cs->flush_completed);
>>> +   }
>>>    }
>>>
>>>    DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
>>> -DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
>>>
>>>    static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
>>>                                unsigned flags,
>>> @@ -720,74 +813,69 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
>>>                               RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
>>>
>>>       /* If the CS is not empty or overflowed.... */
>>> -   if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && !debug_get_option_noop()) {
>>> -      int r;
>>> -
>>> -      /* Use a buffer list containing all allocated buffers if requested. */
>>> -      if (debug_get_option_all_bos()) {
>>> -         struct amdgpu_winsys_bo *bo;
>>> -         amdgpu_bo_handle *handles;
>>> -         unsigned num = 0;
>>> -
>>> -         pipe_mutex_lock(ws->global_bo_list_lock);
>>> -
>>> -         handles = malloc(sizeof(handles[0]) * ws->num_buffers);
>>> -         if (!handles) {
>>> -            pipe_mutex_unlock(ws->global_bo_list_lock);
>>> -            goto cleanup;
>>> -         }
>>> +   if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw &&
>>> +       !debug_get_option_noop()) {
>>> +      struct amdgpu_cs_context *cur = cs->csc;
>>> +      unsigned i, num_buffers = cur->num_buffers;
>>>
>>> -         LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
>>> -            assert(num < ws->num_buffers);
>>> -            handles[num++] = bo->bo;
>>> -         }
>>> -
>>> -         r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
>>> -                                   handles, NULL,
>>> -                                   &cs->request.resources);
>>> -         free(handles);
>>> -         pipe_mutex_unlock(ws->global_bo_list_lock);
>>> -      } else {
>>> -         r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
>>> -                                   cs->handles, cs->flags,
>>> -                                   &cs->request.resources);
>>> -      }
>>> -
>>> -      if (r) {
>>> -         fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
>>> -         cs->request.resources = NULL;
>>> -        goto cleanup;
>>> -      }
>>> -
>>> -      cs->ib[IB_MAIN].size = cs->main.base.cdw;
>>> +      /* Set IB sizes. */
>>> +      cur->ib[IB_MAIN].size = cs->main.base.cdw;
>>>          cs->main.used_ib_space += cs->main.base.cdw * 4;
>>>
>>>          if (cs->const_ib.ib_mapped) {
>>> -         cs->ib[IB_CONST].size = cs->const_ib.base.cdw;
>>> +         cur->ib[IB_CONST].size = cs->const_ib.base.cdw;
>>>             cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
>>>          }
>>>
>>>          if (cs->const_preamble_ib.ib_mapped) {
>>> -         cs->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
>>> +      cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
>>>             cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
>>>          }
>>>
>>> -      amdgpu_cs_do_submission(cs, fence);
>>> +      /* Create a fence. */
>>> +      amdgpu_fence_reference(&cur->fence, NULL);
>>> +      cur->fence = amdgpu_fence_create(cs->ctx,
>>> +                                           cur->request.ip_type,
>>> +                                           cur->request.ip_instance,
>>> +                                           cur->request.ring);
>>> +      if (fence)
>>> +         amdgpu_fence_reference(fence, cur->fence);
>>> +
>>> +      /* Prepare buffers. */
>>> +      pipe_mutex_lock(ws->bo_fence_lock);
>>> +      amdgpu_add_fence_dependencies(cs);
>>> +      for (i = 0; i < num_buffers; i++) {
>>> +         p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
>>> +         amdgpu_fence_reference(&cur->buffers[i].bo->fence[cs->ring_type],
>>> +                                cur->fence);
>>> +      }
>>> +      pipe_mutex_unlock(ws->bo_fence_lock);
>>>
>>> -      /* Cleanup. */
>>> -      if (cs->request.resources)
>>> -         amdgpu_bo_list_destroy(cs->request.resources);
>>> -   }
>>> +      amdgpu_cs_sync_flush(rcs);
>>>
>>> -cleanup:
>>> -   amdgpu_cs_context_cleanup(cs);
>>> +      /* Swap command streams. "cst" is going to be submitted. */
>>> +      cs->csc = cs->cst;
>>> +      cs->cst = cur;
>>> +
>>> +      /* Submit. */
>>> +      if (ws->thread && (flags & RADEON_FLUSH_ASYNC)) {
>>> +         /* Set the semaphore to "busy". */
>>> +         pipe_semaphore_wait(&cs->flush_completed);
>>> +         amdgpu_ws_queue_cs(ws, cs);
>>> +      } else {
>>> +         amdgpu_cs_submit_ib(cs);
>>> +      }
>>> +   } else {
>>> +      amdgpu_cs_context_cleanup(cs->csc);
>>> +   }
>>>
>>> -   amdgpu_get_new_ib(&ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN);
>>> +   amdgpu_get_new_ib(&ws->base, &cs->main, &cs->csc->ib[IB_MAIN], IB_MAIN);
>>>       if (cs->const_ib.ib_mapped)
>>> -      amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST);
>>> +      amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->csc->ib[IB_CONST],
>>> +                        IB_CONST);
>>>       if (cs->const_preamble_ib.ib_mapped)
>>>          amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
>>> -                                 &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE);
>>> +                        &cs->csc->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE);
>>>
>>>       ws->num_cs_flushes++;
>>>    }
>>> @@ -796,11 +884,14 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
>>>    {
>>>       struct amdgpu_cs *cs = amdgpu_cs(rcs);
>>>
>>> -   amdgpu_destroy_cs_context(cs);
>>> +   amdgpu_cs_sync_flush(rcs);
>>> +   pipe_semaphore_destroy(&cs->flush_completed);
>>>       p_atomic_dec(&cs->ctx->ws->num_cs);
>>>       pb_reference(&cs->main.big_ib_buffer, NULL);
>>>       pb_reference(&cs->const_ib.big_ib_buffer, NULL);
>>>       pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL);
>>> +   amdgpu_destroy_cs_context(&cs->csc1);
>>> +   amdgpu_destroy_cs_context(&cs->csc2);
>>>       FREE(cs);
>>>    }
>>>
>>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>>> index 4ed830b..1caec0a 100644
>>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
>>> @@ -66,18 +66,7 @@ enum {
>>>       IB_NUM
>>>    };
>>>
>>> -struct amdgpu_cs {
>>> -   struct amdgpu_ib main; /* must be first because this is inherited */
>>> -   struct amdgpu_ib const_ib; /* optional constant engine IB */
>>> -   struct amdgpu_ib const_preamble_ib;
>>> -   struct amdgpu_ctx *ctx;
>>> -
>>> -   /* Flush CS. */
>>> -   void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
>>> -   void *flush_data;
>>> -
>>> -   /* amdgpu_cs_submit parameters */
>>> -   enum ring_type              ring_type;
>>> +struct amdgpu_cs_context {
>>>       struct amdgpu_cs_request    request;
>>>       struct amdgpu_cs_ib_info    ib[IB_NUM];
>>>
>>> @@ -94,6 +83,32 @@ struct amdgpu_cs {
>>>       uint64_t                    used_gart;
>>>
>>>       unsigned                    max_dependencies;
>>> +
>>> +   struct pipe_fence_handle    *fence;
>>> +};
>>> +
>>> +struct amdgpu_cs {
>>> +   struct amdgpu_ib main; /* must be first because this is inherited */
>>> +   struct amdgpu_ib const_ib; /* optional constant engine IB */
>>> +   struct amdgpu_ib const_preamble_ib;
>>> +   struct amdgpu_ctx *ctx;
>>> +   enum ring_type ring_type;
>>> +
>>> +   /* We flip between these two CS. While one is being consumed
>>> +    * by the kernel in another thread, the other one is being filled
>>> +    * by the pipe driver. */
>>> +   struct amdgpu_cs_context csc1;
>>> +   struct amdgpu_cs_context csc2;
>>> +   /* The currently-used CS. */
>>> +   struct amdgpu_cs_context *csc;
>>> +   /* The CS being currently-owned by the other thread. */
>>> +   struct amdgpu_cs_context *cst;
>>> +
>>> +   /* Flush CS. */
>>> +   void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
>>> +   void *flush_data;
>>> +
>>> +   pipe_semaphore flush_completed;
>>>    };
>>>
>>>    struct amdgpu_fence {
>>> @@ -103,6 +118,9 @@ struct amdgpu_fence {
>>>       struct amdgpu_cs_fence fence;
>>>       uint64_t *user_fence_cpu_address;
>>>
>>> +   /* If the fence is unknown due to an IB still being submitted
>>> +    * in the other thread. */
>>> +   volatile int submission_in_progress; /* bool (int for atomicity) */
>>>       volatile int signalled;              /* bool (int for atomicity) */
>>>    };
>>>
>>> @@ -128,7 +146,7 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
>>>       *rdst = rsrc;
>>>    }
>>>
>>> -int amdgpu_lookup_buffer(struct amdgpu_cs *csc, struct amdgpu_winsys_bo *bo);
>>> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
>>>
>>>    static inline struct amdgpu_cs *
>>>    amdgpu_cs(struct radeon_winsys_cs *base)
>>> @@ -142,7 +160,7 @@ amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs,
>>>    {
>>>       int num_refs = bo->num_cs_references;
>>>       return num_refs == bo->ws->num_cs ||
>>> -         (num_refs && amdgpu_lookup_buffer(cs, bo) != -1);
>>> +         (num_refs && amdgpu_lookup_buffer(cs->csc, bo) != -1);
>>>    }
>>>
>>>    static inline boolean
>>> @@ -155,11 +173,11 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
>>>       if (!bo->num_cs_references)
>>>          return FALSE;
>>>
>>> -   index = amdgpu_lookup_buffer(cs, bo);
>>> +   index = amdgpu_lookup_buffer(cs->csc, bo);
>>>       if (index == -1)
>>>          return FALSE;
>>>
>>> -   return (cs->buffers[index].usage & usage) != 0;
>>> +   return (cs->csc->buffers[index].usage & usage) != 0;
>>>    }
>>>
>>>    static inline boolean
>>> @@ -170,6 +188,8 @@ amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
>>>
>>>    bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
>>>                           bool absolute);
>>> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs);
>>>    void amdgpu_cs_init_functions(struct amdgpu_winsys *ws);
>>> +void amdgpu_cs_submit_ib(struct amdgpu_cs *cs);
>>>
>>>    #endif
>>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>>> index e6db145..4bd3165 100644
>>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>>> @@ -308,6 +308,13 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
>>>    {
>>>       struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
>>>
>>> +   if (ws->thread) {
>>> +      ws->kill_thread = 1;
>>> +      pipe_semaphore_signal(&ws->cs_queued);
>>> +      pipe_thread_wait(ws->thread);
>>> +   }
>>> +   pipe_semaphore_destroy(&ws->cs_queued);
>>> +   pipe_mutex_destroy(ws->cs_queue_lock);
>>>       pipe_mutex_destroy(ws->bo_fence_lock);
>>>       pb_cache_deinit(&ws->bo_cache);
>>>       pipe_mutex_destroy(ws->global_bo_list_lock);
>>> @@ -392,6 +399,55 @@ static int compare_dev(void *key1, void *key2)
>>>       return key1 != key2;
>>>    }
>>>
>>> +void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs)
>>> +{
>>> +retry:
>>> +   pipe_mutex_lock(ws->cs_queue_lock);
>>> +   if (ws->num_enqueued_cs >= ARRAY_SIZE(ws->cs_queue)) {
>>> +      /* no room left for a flush */
>>> +      pipe_mutex_unlock(ws->cs_queue_lock);
>>> +      goto retry;
>>> +   }
>>
>>
>> This is a busy loop - it shouldn't happen in normal programs, but it's
>> still not very nice. How about adding a cs_queue_space semaphore that is
>> initialized with the array size? Alternatively, open-coding that
>> semaphore (mutex + condvar) would avoid one additional mutex, but I
>> don't feel strongly either way.
>
> OK, I'll add the semaphore.

Thanks.
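
For the record, roughly the shape I had in mind - an untested sketch
(cs_queue_space is a name I'm making up here, and the queue bookkeeping
is guessed from the patch):

    /* At winsys creation, next to the cs_queued semaphore: */
    pipe_semaphore_init(&ws->cs_queue_space, ARRAY_SIZE(ws->cs_queue));

    void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs)
    {
       /* Block instead of spinning while the queue is full. */
       pipe_semaphore_wait(&ws->cs_queue_space);

       pipe_mutex_lock(ws->cs_queue_lock);
       ws->cs_queue[ws->num_enqueued_cs++] = cs;
       pipe_mutex_unlock(ws->cs_queue_lock);

       pipe_semaphore_signal(&ws->cs_queued);
    }

The submission thread would then signal cs_queue_space each time it
takes a CS off the queue.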

Nicolai

> Marek
>

