[Mesa-dev] [PATCH 8/8] gallium/radeon: use unflushed fences for deferred flushes

Tue Aug 2 14:19:26 UTC 2016

On 02.08.2016 12:27, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> +23% Bioshock Infinite performance.
> ---
>  src/gallium/drivers/radeon/r600_pipe_common.c | 45 ++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 5e9d0b6..1a7bd7a 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -47,6 +47,12 @@ struct r600_multi_fence {
>  	struct pipe_reference reference;
>  	struct pipe_fence_handle *gfx;
>  	struct pipe_fence_handle *sdma;
> +
> +	/* If the context wasn't flushed at fence creation, this is non-NULL. */
> +	struct {
> +		struct r600_common_context *ctx;
> +		unsigned ib_index;
> +	} gfx_unflushed;
>  };
>
>  /*
> @@ -262,6 +268,7 @@ static void r600_flush_from_st(struct pipe_context *ctx,
>  	unsigned rflags = 0;
>  	struct pipe_fence_handle *gfx_fence = NULL;
>  	struct pipe_fence_handle *sdma_fence = NULL;
> +	bool deferred_fence = false;
>
>  	if (flags & PIPE_FLUSH_END_OF_FRAME)
>  		rflags |= RADEON_FLUSH_END_OF_FRAME;
> @@ -271,7 +278,22 @@ static void r600_flush_from_st(struct pipe_context *ctx,
>  	if (rctx->dma.cs) {
>  		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
>  	}
> -	rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
> +
> +	/* Instead of flushing, create a deferred fence. Constraints:
> +	 * - The state tracker must allow a deferred flush.
> +	 * - The state tracker must request a fence.
> +	 * - At most one API context can exist to ensure thread-safety.
> +	 *   (that's a weak contraint though, we must be careful about
> +	 *    how PIPE_FLUSH_DEFERRED is used)
> +	 */
> +	if (flags & PIPE_FLUSH_DEFERRED && fence &&
> +	    /* internal aux_context + 1 API context */
> +	    rctx->screen->num_contexts <= 2) {
> +		gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
> +		deferred_fence = true;
> +	} else {
> +		rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
> +	}
>
>  	/* Both engines can signal out of order, so we need to keep both fences. */
>  	if (gfx_fence || sdma_fence) {
> @@ -284,6 +306,11 @@ static void r600_flush_from_st(struct pipe_context *ctx,
>  		multi_fence->gfx = gfx_fence;
>  		multi_fence->sdma = sdma_fence;
>
> +		if (deferred_fence) {
> +			multi_fence->gfx_unflushed.ctx = rctx;
> +			multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
> +		}
> +
>  		screen->fence_reference(screen, fence, NULL);
>  		*fence = (struct pipe_fence_handle*)multi_fence;
>  	}
> @@ -962,6 +989,7 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
>  {
>  	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
>  	struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
> +	struct r600_common_context *rctx;
>  	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
>
>  	if (rfence->sdma) {
> @@ -978,6 +1006,21 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
>  	if (!rfence->gfx)
>  		return true;
>
> +	/* Flush the gfx IB if it hasn't been flushed yet. */
> +	rctx = rfence->gfx_unflushed.ctx;
> +	if (rctx && rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
> +		rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);

It seems like a good idea to clear rfence->gfx_unflushed.ctx here.

So, I was feeling a bit iffy about the multi-context handling here -- 
e.g., what if a second context was created after the fence was created 
but before it was waited on -- and I think we should go back and read 
the spec on this again :-)

In particular, I'm looking at section 4.1.2 of OpenGL 4.5 (Compatibility 
Profile), and it says: [...] if ClientWaitSync is called and all of the 
following are true:

- the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
- sync is unsignaled when ClientWaitSync is called,
- and the calls to ClientWaitSync and FenceSync were issued *from the 
same context*,

then the GL will behave as if the equivalent of Flush were inserted 
immediately after the creation of sync.

So I think the multi-context dance isn't needed. Instead, while most of 
this patch is fine as is, it seems like the proper solution is slightly 
different: have a fence_finish callback associated to pipe_context 
rather than pipe_screen, which optionally does the flush on unflushed 
fences, but *only if in the same context*. Then the state tracker would 
call that fence function instead.

At least that's how I'm reading that part of the spec...

Cheers,
Nicolai

> +
> +		if (!timeout)
> +			return false;
> +
> +		/* Recompute the timeout after all that. */
> +		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
> +			int64_t time = os_time_get_nano();
> +			timeout = abs_timeout > time ? abs_timeout - time : 0;
> +		}
> +	}
> +
>  	return rws->fence_wait(rws, rfence->gfx, timeout);
>  }
>
>