[Mesa-dev] [PATCH] winsys/amdgpu: add back multithreaded command submission
Nicolai Hähnle
nhaehnle at gmail.com
Sat May 7 15:12:49 UTC 2016
Looks good to me, just two remarks below...
On 06.05.2016 13:31, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> Ported from the initial amdgpu winsys from the private AMD branch.
>
> The thread creates the buffer list, submits IBs, and cleans up
> the submission context, which can also destroy buffers.
>
> 3-5% reduction in CPU overhead is expected for apps submitting a lot
> of IBs per frame. This is most visible with DMA IBs.
> ---
> src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 26 ++-
> src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 4 +
> src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 311 +++++++++++++++++---------
> src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 52 +++--
> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 61 +++++
> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 9 +
> 6 files changed, 333 insertions(+), 130 deletions(-)
>
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 37a41c0..ec5fa6a 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -43,8 +43,21 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
> {
> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
> struct amdgpu_winsys *ws = bo->ws;
> + int64_t abs_timeout;
> int i;
>
> + if (timeout == 0) {
> + if (p_atomic_read(&bo->num_active_ioctls))
> + return false;
> +
> + } else {
> + abs_timeout = os_time_get_absolute_timeout(timeout);
> +
> + /* Wait if any ioctl is being submitted with this buffer. */
> + if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
> + return false;
> + }
I'd suggest doing the cs_sync_flush here instead of below - there is
less action at a distance, and some additional code paths end up being
covered by a flush as well.
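Roughly what I have in mind, assuming amdgpu_bo_wait can get at the
current CS (it can't with the signature in this patch, so treat this as
a sketch only):

   } else {
      abs_timeout = os_time_get_absolute_timeout(timeout);

      /* Wait for the other thread's pending submission first, so the
       * wait below doesn't spin on an IB that is still sitting in the
       * queue. "rcs" is the hypothetical CS pointer made reachable here. */
      if (rcs && p_atomic_read(&bo->num_active_ioctls))
         amdgpu_cs_sync_flush(rcs);

      /* Wait if any ioctl is being submitted with this buffer. */
      if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
         return false;
   }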
> +
> if (bo->is_shared) {
> /* We can't use user fences for shared buffers, because user fences
> * are local to this process only. If we want to wait for all buffer
> @@ -61,7 +74,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
> }
>
> if (timeout == 0) {
> - /* Timeout == 0 is quite simple. */
> pipe_mutex_lock(ws->bo_fence_lock);
> for (i = 0; i < RING_LAST; i++)
> if (bo->fence[i]) {
> @@ -80,7 +92,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
> struct pipe_fence_handle *fence[RING_LAST] = {};
> bool fence_idle[RING_LAST] = {};
> bool buffer_idle = true;
> - int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
>
> /* Take references to all fences, so that we can wait for them
> * without the lock. */
> @@ -214,8 +225,15 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
> RADEON_USAGE_WRITE);
> } else {
> /* Mapping for write. */
> - if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo))
> - cs->flush_cs(cs->flush_data, 0, NULL);
> + if (cs) {
> + if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
> + cs->flush_cs(cs->flush_data, 0, NULL);
> + } else {
> + /* Try to avoid busy-waiting in radeon_bo_wait. */
> + if (p_atomic_read(&bo->num_active_ioctls))
> + amdgpu_cs_sync_flush(rcs);
> + }
> + }
>
> amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_READWRITE);
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> index 69ada10..a768771 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> @@ -53,6 +53,10 @@ struct amdgpu_winsys_bo {
> /* how many command streams is this bo referenced in? */
> int num_cs_references;
>
> + /* how many command streams, which are being emitted in a separate
> + * thread, is this bo referenced in? */
> + volatile int num_active_ioctls;
> +
> /* whether buffer_get_handle or buffer_from_handle was called,
> * it can only transition from false to true
> */
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index 03e45a9..419523b 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -50,6 +50,7 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
> fence->fence.ip_type = ip_type;
> fence->fence.ip_instance = ip_instance;
> fence->fence.ring = ring;
> + fence->submission_in_progress = true;
> p_atomic_inc(&ctx->refcount);
> return (struct pipe_fence_handle *)fence;
> }
> @@ -62,6 +63,7 @@ static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
>
> rfence->fence.fence = request->seq_no;
> rfence->user_fence_cpu_address = user_fence_cpu_address;
> + rfence->submission_in_progress = false;
> }
>
> static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
> @@ -69,6 +71,7 @@ static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
> struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
>
> rfence->signalled = true;
> + rfence->submission_in_progress = false;
> }
>
> bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
> @@ -88,6 +91,13 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
> else
> abs_timeout = os_time_get_absolute_timeout(timeout);
>
> + /* The fence might not have a number assigned if its IB is being
> + * submitted in the other thread right now. Wait until the submission
> + * is done. */
> + if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
> + abs_timeout))
> + return false;
> +
> user_fence_cpu = rfence->user_fence_cpu_address;
> if (user_fence_cpu && *user_fence_cpu >= rfence->fence.fence) {
> rfence->signalled = true;
> @@ -257,7 +267,7 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
> return true;
> }
>
> -static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
> +static boolean amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
> enum ring_type ring_type)
> {
> int i;
> @@ -308,10 +318,18 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
> for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
> cs->buffer_indices_hashlist[i] = -1;
> }
> +
> + cs->request.number_of_ibs = 1;
> + cs->request.ibs = &cs->ib[IB_MAIN];
> +
> + cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
> + cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE |
> + AMDGPU_IB_FLAG_PREAMBLE;
> +
> return TRUE;
> }
>
> -static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
> +static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
> {
> unsigned i;
>
> @@ -325,13 +343,14 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
> cs->num_buffers = 0;
> cs->used_gart = 0;
> cs->used_vram = 0;
> + amdgpu_fence_reference(&cs->fence, NULL);
>
> for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
> cs->buffer_indices_hashlist[i] = -1;
> }
> }
>
> -static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs)
> +static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
> {
> amdgpu_cs_context_cleanup(cs);
> FREE(cs->flags);
> @@ -356,24 +375,35 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
> return NULL;
> }
>
> + pipe_semaphore_init(&cs->flush_completed, 1);
> +
> cs->ctx = ctx;
> cs->flush_cs = flush;
> cs->flush_data = flush_ctx;
> cs->ring_type = ring_type;
>
> - if (!amdgpu_init_cs_context(cs, ring_type)) {
> + if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
> FREE(cs);
> return NULL;
> }
>
> - if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN)) {
> - amdgpu_destroy_cs_context(cs);
> + if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
> + amdgpu_destroy_cs_context(&cs->csc1);
> FREE(cs);
> return NULL;
> }
>
> - cs->request.number_of_ibs = 1;
> - cs->request.ibs = &cs->ib[IB_MAIN];
> + /* Set the first submission context as current. */
> + cs->csc = &cs->csc1;
> + cs->cst = &cs->csc2;
> +
> + if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->csc->ib[IB_MAIN],
> + IB_MAIN)) {
> + amdgpu_destroy_cs_context(&cs->csc2);
> + amdgpu_destroy_cs_context(&cs->csc1);
> + FREE(cs);
> + return NULL;
> + }
>
> p_atomic_inc(&ctx->ws->num_cs);
> return &cs->main.base;
> @@ -389,12 +419,15 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
> if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
> return NULL;
>
> - if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST))
> + if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->csc->ib[IB_CONST],
> + IB_CONST))
> return NULL;
>
> - cs->request.number_of_ibs = 2;
> - cs->request.ibs = &cs->ib[IB_CONST];
> - cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
> + cs->csc->request.number_of_ibs = 2;
> + cs->csc->request.ibs = &cs->csc->ib[IB_CONST];
> +
> + cs->cst->request.number_of_ibs = 2;
> + cs->cst->request.ibs = &cs->cst->ib[IB_CONST];
>
> return &cs->const_ib.base;
> }
> @@ -412,19 +445,21 @@ amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
> return NULL;
>
> if (!amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
> - &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
> + &cs->csc->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
> return NULL;
>
> - cs->request.number_of_ibs = 3;
> - cs->request.ibs = &cs->ib[IB_CONST_PREAMBLE];
> - cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | AMDGPU_IB_FLAG_PREAMBLE;
> + cs->csc->request.number_of_ibs = 3;
> + cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE];
> +
> + cs->cst->request.number_of_ibs = 3;
> + cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE];
>
> return &cs->const_preamble_ib.base;
> }
>
> #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
>
> -int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
> {
> unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
> int i = cs->buffer_indices_hashlist[hash];
> @@ -452,13 +487,14 @@ int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
> return -1;
> }
>
> -static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs,
> +static unsigned amdgpu_add_buffer(struct amdgpu_cs *acs,
> struct amdgpu_winsys_bo *bo,
> enum radeon_bo_usage usage,
> enum radeon_bo_domain domains,
> unsigned priority,
> enum radeon_bo_domain *added_domains)
> {
> + struct amdgpu_cs_context *cs = acs->csc;
> struct amdgpu_cs_buffer *buffer;
> unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
> int i = -1;
> @@ -526,9 +562,9 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
> priority, &added_domains);
>
> if (added_domains & RADEON_DOMAIN_VRAM)
> - cs->used_vram += bo->base.size;
> + cs->csc->used_vram += bo->base.size;
> else if (added_domains & RADEON_DOMAIN_GTT)
> - cs->used_gart += bo->base.size;
> + cs->csc->used_gart += bo->base.size;
>
> return index;
> }
> @@ -538,7 +574,7 @@ static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
> {
> struct amdgpu_cs *cs = amdgpu_cs(rcs);
>
> - return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf);
> + return amdgpu_lookup_buffer(cs->csc, (struct amdgpu_winsys_bo*)buf);
> }
>
> static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
> @@ -551,8 +587,8 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
> struct amdgpu_cs *cs = amdgpu_cs(rcs);
> struct amdgpu_winsys *ws = cs->ctx->ws;
>
> - vram += cs->used_vram;
> - gtt += cs->used_gart;
> + vram += cs->csc->used_vram;
> + gtt += cs->csc->used_gart;
>
> /* Anything that goes above the VRAM size should go to GTT. */
> if (vram > ws->info.vram_size)
> @@ -565,7 +601,7 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
> static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
> struct radeon_bo_list_item *list)
> {
> - struct amdgpu_cs *cs = amdgpu_cs(rcs);
> + struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
> int i;
>
> if (list) {
> @@ -578,26 +614,18 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
> return cs->num_buffers;
> }
>
> -static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
> - struct pipe_fence_handle **out_fence)
> -{
> - struct amdgpu_winsys *ws = cs->ctx->ws;
> - struct pipe_fence_handle *fence;
> - int i, j, r;
> +DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
>
> - /* Create a fence. */
> - fence = amdgpu_fence_create(cs->ctx,
> - cs->request.ip_type,
> - cs->request.ip_instance,
> - cs->request.ring);
> - if (out_fence)
> - amdgpu_fence_reference(out_fence, fence);
> +/* Since the kernel driver doesn't synchronize execution between different
> + * rings automatically, we have to add fence dependencies manually.
> + */
> +static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
> +{
> + struct amdgpu_cs_context *cs = acs->csc;
> + int i, j;
>
> cs->request.number_of_dependencies = 0;
>
> - /* Since the kernel driver doesn't synchronize execution between different
> - * rings automatically, we have to add fence dependencies manually. */
> - pipe_mutex_lock(ws->bo_fence_lock);
> for (i = 0; i < cs->num_buffers; i++) {
> for (j = 0; j < RING_LAST; j++) {
> struct amdgpu_cs_fence *dep;
> @@ -607,7 +635,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
> if (!bo_fence)
> continue;
>
> - if (bo_fence->ctx == cs->ctx &&
> + if (bo_fence->ctx == acs->ctx &&
> bo_fence->fence.ip_type == cs->request.ip_type &&
> bo_fence->fence.ip_instance == cs->request.ip_instance &&
> bo_fence->fence.ring == cs->request.ring)
> @@ -616,6 +644,10 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
> if (amdgpu_fence_wait((void *)bo_fence, 0, false))
> continue;
>
> + if (bo_fence->submission_in_progress)
> + os_wait_until_zero(&bo_fence->submission_in_progress,
> + PIPE_TIMEOUT_INFINITE);
> +
> idx = cs->request.number_of_dependencies++;
> if (idx >= cs->max_dependencies) {
> unsigned size;
> @@ -629,14 +661,62 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
> memcpy(dep, &bo_fence->fence, sizeof(*dep));
> }
> }
> +}
> +
> +void amdgpu_cs_submit_ib(struct amdgpu_cs *acs)
> +{
> + struct amdgpu_winsys *ws = acs->ctx->ws;
> + struct amdgpu_cs_context *cs = acs->cst;
> + int i, r;
>
> cs->request.fence_info.handle = NULL;
> - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) {
> - cs->request.fence_info.handle = cs->ctx->user_fence_bo;
> - cs->request.fence_info.offset = cs->ring_type;
> + if (cs->request.ip_type != AMDGPU_HW_IP_UVD &&
> + cs->request.ip_type != AMDGPU_HW_IP_VCE) {
> + cs->request.fence_info.handle = acs->ctx->user_fence_bo;
> + cs->request.fence_info.offset = acs->ring_type;
> + }
> +
> + /* Create the buffer list.
> + * Use a buffer list containing all allocated buffers if requested.
> + */
> + if (debug_get_option_all_bos()) {
> + struct amdgpu_winsys_bo *bo;
> + amdgpu_bo_handle *handles;
> + unsigned num = 0;
> +
> + pipe_mutex_lock(ws->global_bo_list_lock);
> +
> + handles = malloc(sizeof(handles[0]) * ws->num_buffers);
> + if (!handles) {
> + pipe_mutex_unlock(ws->global_bo_list_lock);
> + amdgpu_cs_context_cleanup(cs);
> + return;
> + }
> +
> + LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
> + assert(num < ws->num_buffers);
> + handles[num++] = bo->bo;
> + }
> +
> + r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
> + handles, NULL,
> + &cs->request.resources);
> + free(handles);
> + pipe_mutex_unlock(ws->global_bo_list_lock);
> + } else {
> + r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
> + cs->handles, cs->flags,
> + &cs->request.resources);
> }
>
> - r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1);
> + if (r) {
> + fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
> + cs->request.resources = NULL;
> + amdgpu_fence_signalled(cs->fence);
> + goto cleanup;
> + }
> +
> + r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1);
> if (r) {
> if (r == -ENOMEM)
> fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
> @@ -644,30 +724,43 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
> fprintf(stderr, "amdgpu: The CS has been rejected, "
> "see dmesg for more information.\n");
>
> - amdgpu_fence_signalled(fence);
> + amdgpu_fence_signalled(cs->fence);
> } else {
> /* Success. */
> uint64_t *user_fence = NULL;
> - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE)
> - user_fence = cs->ctx->user_fence_cpu_address_base +
> + if (cs->request.ip_type != AMDGPU_HW_IP_UVD &&
> + cs->request.ip_type != AMDGPU_HW_IP_VCE)
> + user_fence = acs->ctx->user_fence_cpu_address_base +
> cs->request.fence_info.offset;
> - amdgpu_fence_submitted(fence, &cs->request, user_fence);
> -
> - for (i = 0; i < cs->num_buffers; i++)
> - amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->ring_type],
> - fence);
> + amdgpu_fence_submitted(cs->fence, &cs->request, user_fence);
> }
> - pipe_mutex_unlock(ws->bo_fence_lock);
> - amdgpu_fence_reference(&fence, NULL);
> +
> + /* Cleanup. */
> + if (cs->request.resources)
> + amdgpu_bo_list_destroy(cs->request.resources);
> +
> +cleanup:
> + for (i = 0; i < cs->num_buffers; i++)
> + p_atomic_dec(&cs->buffers[i].bo->num_active_ioctls);
> +
> + amdgpu_cs_context_cleanup(cs);
> }
>
> -static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
> +/* Make sure the previous submission is completed. */
> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
> {
> - /* no-op */
> + struct amdgpu_cs *cs = amdgpu_cs(rcs);
> +
> + /* Wait for any pending ioctl of this CS to complete. */
> + if (cs->ctx->ws->thread) {
> + /* wait and set the semaphore to "busy" */
> + pipe_semaphore_wait(&cs->flush_completed);
> + /* set the semaphore to "idle" */
> + pipe_semaphore_signal(&cs->flush_completed);
> + }
> }
>
> DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
> -DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
>
> static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
> unsigned flags,
> @@ -720,74 +813,69 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
> RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
>
> /* If the CS is not empty or overflowed.... */
> - if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && !debug_get_option_noop()) {
> - int r;
> -
> - /* Use a buffer list containing all allocated buffers if requested. */
> - if (debug_get_option_all_bos()) {
> - struct amdgpu_winsys_bo *bo;
> - amdgpu_bo_handle *handles;
> - unsigned num = 0;
> -
> - pipe_mutex_lock(ws->global_bo_list_lock);
> -
> - handles = malloc(sizeof(handles[0]) * ws->num_buffers);
> - if (!handles) {
> - pipe_mutex_unlock(ws->global_bo_list_lock);
> - goto cleanup;
> - }
> + if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw &&
> + !debug_get_option_noop()) {
> + struct amdgpu_cs_context *cur = cs->csc;
> + unsigned i, num_buffers = cur->num_buffers;
>
> - LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
> - assert(num < ws->num_buffers);
> - handles[num++] = bo->bo;
> - }
> -
> - r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
> - handles, NULL,
> - &cs->request.resources);
> - free(handles);
> - pipe_mutex_unlock(ws->global_bo_list_lock);
> - } else {
> - r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
> - cs->handles, cs->flags,
> - &cs->request.resources);
> - }
> -
> - if (r) {
> - fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
> - cs->request.resources = NULL;
> - goto cleanup;
> - }
> -
> - cs->ib[IB_MAIN].size = cs->main.base.cdw;
> + /* Set IB sizes. */
> + cur->ib[IB_MAIN].size = cs->main.base.cdw;
> cs->main.used_ib_space += cs->main.base.cdw * 4;
>
> if (cs->const_ib.ib_mapped) {
> - cs->ib[IB_CONST].size = cs->const_ib.base.cdw;
> + cur->ib[IB_CONST].size = cs->const_ib.base.cdw;
> cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
> }
>
> if (cs->const_preamble_ib.ib_mapped) {
> - cs->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
> + cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
> cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
> }
>
> - amdgpu_cs_do_submission(cs, fence);
> + /* Create a fence. */
> + amdgpu_fence_reference(&cur->fence, NULL);
> + cur->fence = amdgpu_fence_create(cs->ctx,
> + cur->request.ip_type,
> + cur->request.ip_instance,
> + cur->request.ring);
> + if (fence)
> + amdgpu_fence_reference(fence, cur->fence);
> +
> + /* Prepare buffers. */
> + pipe_mutex_lock(ws->bo_fence_lock);
> + amdgpu_add_fence_dependencies(cs);
> + for (i = 0; i < num_buffers; i++) {
> + p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
> + amdgpu_fence_reference(&cur->buffers[i].bo->fence[cs->ring_type],
> + cur->fence);
> + }
> + pipe_mutex_unlock(ws->bo_fence_lock);
>
> - /* Cleanup. */
> - if (cs->request.resources)
> - amdgpu_bo_list_destroy(cs->request.resources);
> - }
> + amdgpu_cs_sync_flush(rcs);
>
> -cleanup:
> - amdgpu_cs_context_cleanup(cs);
> + /* Swap command streams. "cst" is going to be submitted. */
> + cs->csc = cs->cst;
> + cs->cst = cur;
> +
> + /* Submit. */
> + if (ws->thread && (flags & RADEON_FLUSH_ASYNC)) {
> + /* Set the semaphore to "busy". */
> + pipe_semaphore_wait(&cs->flush_completed);
> + amdgpu_ws_queue_cs(ws, cs);
> + } else {
> + amdgpu_cs_submit_ib(cs);
> + }
> + } else {
> + amdgpu_cs_context_cleanup(cs->csc);
> + }
>
> - amdgpu_get_new_ib(&ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN);
> + amdgpu_get_new_ib(&ws->base, &cs->main, &cs->csc->ib[IB_MAIN], IB_MAIN);
> if (cs->const_ib.ib_mapped)
> - amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST);
> + amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->csc->ib[IB_CONST],
> + IB_CONST);
> if (cs->const_preamble_ib.ib_mapped)
> amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
> - &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE);
> + &cs->csc->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE);
>
> ws->num_cs_flushes++;
> }
> @@ -796,11 +884,14 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
> {
> struct amdgpu_cs *cs = amdgpu_cs(rcs);
>
> - amdgpu_destroy_cs_context(cs);
> + amdgpu_cs_sync_flush(rcs);
> + pipe_semaphore_destroy(&cs->flush_completed);
> p_atomic_dec(&cs->ctx->ws->num_cs);
> pb_reference(&cs->main.big_ib_buffer, NULL);
> pb_reference(&cs->const_ib.big_ib_buffer, NULL);
> pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL);
> + amdgpu_destroy_cs_context(&cs->csc1);
> + amdgpu_destroy_cs_context(&cs->csc2);
> FREE(cs);
> }
>
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> index 4ed830b..1caec0a 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> @@ -66,18 +66,7 @@ enum {
> IB_NUM
> };
>
> -struct amdgpu_cs {
> - struct amdgpu_ib main; /* must be first because this is inherited */
> - struct amdgpu_ib const_ib; /* optional constant engine IB */
> - struct amdgpu_ib const_preamble_ib;
> - struct amdgpu_ctx *ctx;
> -
> - /* Flush CS. */
> - void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
> - void *flush_data;
> -
> - /* amdgpu_cs_submit parameters */
> - enum ring_type ring_type;
> +struct amdgpu_cs_context {
> struct amdgpu_cs_request request;
> struct amdgpu_cs_ib_info ib[IB_NUM];
>
> @@ -94,6 +83,32 @@ struct amdgpu_cs {
> uint64_t used_gart;
>
> unsigned max_dependencies;
> +
> + struct pipe_fence_handle *fence;
> +};
> +
> +struct amdgpu_cs {
> + struct amdgpu_ib main; /* must be first because this is inherited */
> + struct amdgpu_ib const_ib; /* optional constant engine IB */
> + struct amdgpu_ib const_preamble_ib;
> + struct amdgpu_ctx *ctx;
> + enum ring_type ring_type;
> +
> + /* We flip between these two CS. While one is being consumed
> + * by the kernel in another thread, the other one is being filled
> + * by the pipe driver. */
> + struct amdgpu_cs_context csc1;
> + struct amdgpu_cs_context csc2;
> + /* The currently-used CS. */
> + struct amdgpu_cs_context *csc;
> + /* The CS being currently-owned by the other thread. */
> + struct amdgpu_cs_context *cst;
> +
> + /* Flush CS. */
> + void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
> + void *flush_data;
> +
> + pipe_semaphore flush_completed;
> };
>
> struct amdgpu_fence {
> @@ -103,6 +118,9 @@ struct amdgpu_fence {
> struct amdgpu_cs_fence fence;
> uint64_t *user_fence_cpu_address;
>
> + /* If the fence is unknown due to an IB still being submitted
> + * in the other thread. */
> + volatile int submission_in_progress; /* bool (int for atomicity) */
> volatile int signalled; /* bool (int for atomicity) */
> };
>
> @@ -128,7 +146,7 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
> *rdst = rsrc;
> }
>
> -int amdgpu_lookup_buffer(struct amdgpu_cs *csc, struct amdgpu_winsys_bo *bo);
> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
>
> static inline struct amdgpu_cs *
> amdgpu_cs(struct radeon_winsys_cs *base)
> @@ -142,7 +160,7 @@ amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs,
> {
> int num_refs = bo->num_cs_references;
> return num_refs == bo->ws->num_cs ||
> - (num_refs && amdgpu_lookup_buffer(cs, bo) != -1);
> + (num_refs && amdgpu_lookup_buffer(cs->csc, bo) != -1);
> }
>
> static inline boolean
> @@ -155,11 +173,11 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
> if (!bo->num_cs_references)
> return FALSE;
>
> - index = amdgpu_lookup_buffer(cs, bo);
> + index = amdgpu_lookup_buffer(cs->csc, bo);
> if (index == -1)
> return FALSE;
>
> - return (cs->buffers[index].usage & usage) != 0;
> + return (cs->csc->buffers[index].usage & usage) != 0;
> }
>
> static inline boolean
> @@ -170,6 +188,8 @@ amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
>
> bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
> bool absolute);
> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs);
> void amdgpu_cs_init_functions(struct amdgpu_winsys *ws);
> +void amdgpu_cs_submit_ib(struct amdgpu_cs *cs);
>
> #endif
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> index e6db145..4bd3165 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
> @@ -308,6 +308,13 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
> {
> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
>
> + if (ws->thread) {
> + ws->kill_thread = 1;
> + pipe_semaphore_signal(&ws->cs_queued);
> + pipe_thread_wait(ws->thread);
> + }
> + pipe_semaphore_destroy(&ws->cs_queued);
> + pipe_mutex_destroy(ws->cs_queue_lock);
> pipe_mutex_destroy(ws->bo_fence_lock);
> pb_cache_deinit(&ws->bo_cache);
> pipe_mutex_destroy(ws->global_bo_list_lock);
> @@ -392,6 +399,55 @@ static int compare_dev(void *key1, void *key2)
> return key1 != key2;
> }
>
> +void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs)
> +{
> +retry:
> + pipe_mutex_lock(ws->cs_queue_lock);
> + if (ws->num_enqueued_cs >= ARRAY_SIZE(ws->cs_queue)) {
> + /* no room left for a flush */
> + pipe_mutex_unlock(ws->cs_queue_lock);
> + goto retry;
> + }
This is a busy loop - it shouldn't be hit by normal programs, but it's
still not very nice. How about adding a cs_queue_space semaphore that is
initialized with the array size? Alternatively, open-coding that
semaphore (mutex + condvar) would avoid one additional mutex, but I
don't feel strongly either way.
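Untested sketch of the semaphore variant (cs_queue_space is just a name
I made up):

   /* at init, next to pipe_semaphore_init(&ws->cs_queued, 0): */
   pipe_semaphore_init(&ws->cs_queue_space, ARRAY_SIZE(ws->cs_queue));

   void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs)
   {
      /* blocks instead of spinning when the queue is full */
      pipe_semaphore_wait(&ws->cs_queue_space);

      pipe_mutex_lock(ws->cs_queue_lock);
      ws->cs_queue[ws->num_enqueued_cs++] = cs;
      pipe_mutex_unlock(ws->cs_queue_lock);

      pipe_semaphore_signal(&ws->cs_queued);
   }

   /* ...and in amdgpu_cs_thread_func, right after a CS is taken off
    * the queue: */
   pipe_semaphore_signal(&ws->cs_queue_space);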
With those changes, you can add my R-b.
Cheers,
Nicolai
> + ws->cs_queue[ws->num_enqueued_cs++] = cs;
> + pipe_mutex_unlock(ws->cs_queue_lock);
> + pipe_semaphore_signal(&ws->cs_queued);
> +}
> +
> +static PIPE_THREAD_ROUTINE(amdgpu_cs_thread_func, param)
> +{
> + struct amdgpu_winsys *ws = (struct amdgpu_winsys *)param;
> + struct amdgpu_cs *cs;
> + unsigned i;
> +
> + while (1) {
> + pipe_semaphore_wait(&ws->cs_queued);
> + if (ws->kill_thread)
> + break;
> +
> + pipe_mutex_lock(ws->cs_queue_lock);
> + cs = ws->cs_queue[0];
> + for (i = 1; i < ws->num_enqueued_cs; i++)
> + ws->cs_queue[i - 1] = ws->cs_queue[i];
> + ws->cs_queue[--ws->num_enqueued_cs] = NULL;
> + pipe_mutex_unlock(ws->cs_queue_lock);
> +
> + if (cs) {
> + amdgpu_cs_submit_ib(cs);
> + pipe_semaphore_signal(&cs->flush_completed);
> + }
> + }
> + pipe_mutex_lock(ws->cs_queue_lock);
> + for (i = 0; i < ws->num_enqueued_cs; i++) {
> + pipe_semaphore_signal(&ws->cs_queue[i]->flush_completed);
> + ws->cs_queue[i] = NULL;
> + }
> + pipe_mutex_unlock(ws->cs_queue_lock);
> + return 0;
> +}
> +
> +DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
> +static PIPE_THREAD_ROUTINE(amdgpu_cs_thread_func, param);
> +
> static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
> {
> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
> @@ -485,8 +541,13 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
>
> LIST_INITHEAD(&ws->global_bo_list);
> pipe_mutex_init(ws->global_bo_list_lock);
> + pipe_mutex_init(ws->cs_queue_lock);
> pipe_mutex_init(ws->bo_fence_lock);
>
> + pipe_semaphore_init(&ws->cs_queued, 0);
> + if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread())
> + ws->thread = pipe_thread_create(amdgpu_cs_thread_func, ws);
> +
> /* Create the screen at the end. The winsys must be initialized
> * completely.
> *
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
> index 91b9be4..803c616 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
> @@ -59,6 +59,14 @@ struct amdgpu_winsys {
>
> struct radeon_info info;
>
> + /* multithreaded IB submission */
> + pipe_mutex cs_queue_lock;
> + pipe_semaphore cs_queued;
> + pipe_thread thread;
> + int kill_thread;
> + int num_enqueued_cs;
> + struct amdgpu_cs *cs_queue[8];
> +
> struct amdgpu_gpu_info amdinfo;
> ADDR_HANDLE addrlib;
> uint32_t rev_id;
> @@ -76,6 +84,7 @@ amdgpu_winsys(struct radeon_winsys *base)
> return (struct amdgpu_winsys*)base;
> }
>
> +void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs);
> void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
> ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
>
>