[Mesa-dev] [PATCH 3/3] gallium/radeon: use unflushed fences for deferred flushes (v2)

Tue Aug 9 08:54:29 UTC 2016

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

On 06.08.2016 17:32, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> +23% Bioshock Infinite performance.
>
> v2: - use the new fence_finish interface
>     - allow deferred fences with multiple contexts
>     - clear the ctx pointer after a deferred flush
> ---
>  src/gallium/drivers/radeon/r600_pipe_common.c | 44 ++++++++++++++++++++++++++-
>  1 file changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 119fdf5..1c56e6e 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -40,20 +40,26 @@
>  #include <sys/utsname.h>
>
>  #ifndef HAVE_LLVM
>  #define HAVE_LLVM 0
>  #endif
>
>  struct r600_multi_fence {
>  	struct pipe_reference reference;
>  	struct pipe_fence_handle *gfx;
>  	struct pipe_fence_handle *sdma;
> +
> +	/* If the context wasn't flushed at fence creation, ctx is non-NULL. */
> +	struct {
> +		struct r600_common_context *ctx;
> +		unsigned ib_index;
> +	} gfx_unflushed;
>  };
>
>  /*
>   * shader binary helpers.
>   */
>  void radeon_shader_binary_init(struct radeon_shader_binary *b)
>  {
>  	memset(b, 0, sizeof(*b));
>  }
>
> @@ -255,42 +261,59 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
>
>  static void r600_flush_from_st(struct pipe_context *ctx,
>  			       struct pipe_fence_handle **fence,
>  			       unsigned flags)
>  {
>  	struct pipe_screen *screen = ctx->screen;
>  	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
>  	unsigned rflags = 0;
>  	struct pipe_fence_handle *gfx_fence = NULL;
>  	struct pipe_fence_handle *sdma_fence = NULL;
> +	bool deferred_fence = false;
>
>  	if (flags & PIPE_FLUSH_END_OF_FRAME)
>  		rflags |= RADEON_FLUSH_END_OF_FRAME;
>  	if (flags & PIPE_FLUSH_DEFERRED)
>  		rflags |= RADEON_FLUSH_ASYNC;
>
>  	if (rctx->dma.cs) {
>  		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
>  	}
> -	rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
> +
> +	/* Instead of flushing, create a deferred fence. Constraints:
> +	 * - The state tracker must allow a deferred flush.
> +	 * - The state tracker must request a fence.
> +	 * Thread safety in fence_finish must be ensured by the state tracker.
> +	 */
> +	if (flags & PIPE_FLUSH_DEFERRED && fence) {
> +		gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
> +		deferred_fence = true;
> +	} else {
> +		rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
> +	}
>
>  	/* Both engines can signal out of order, so we need to keep both fences. */
>  	if (gfx_fence || sdma_fence) {
>  		struct r600_multi_fence *multi_fence =
>  			CALLOC_STRUCT(r600_multi_fence);
>  		if (!multi_fence)
>  			return;
>
>  		multi_fence->reference.count = 1;
>  		multi_fence->gfx = gfx_fence;
>  		multi_fence->sdma = sdma_fence;
>
> +		if (deferred_fence) {
> +			multi_fence->gfx_unflushed.ctx = rctx;
> +			multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
> +		}
> +
>  		screen->fence_reference(screen, fence, NULL);
>  		*fence = (struct pipe_fence_handle*)multi_fence;
>  	}
>  }
>
>  static void r600_flush_dma_ring(void *ctx, unsigned flags,
>  				struct pipe_fence_handle **fence)
>  {
>  	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
>  	struct radeon_winsys_cs *cs = rctx->dma.cs;
> @@ -953,36 +976,55 @@ static void r600_fence_reference(struct pipe_screen *screen,
>          *rdst = rsrc;
>  }
>
>  static boolean r600_fence_finish(struct pipe_screen *screen,
>  				 struct pipe_context *ctx,
>  				 struct pipe_fence_handle *fence,
>  				 uint64_t timeout)
>  {
>  	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
>  	struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
> +	struct r600_common_context *rctx =
> +		ctx ? (struct r600_common_context*)ctx : NULL;
>  	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
>
>  	if (rfence->sdma) {
>  		if (!rws->fence_wait(rws, rfence->sdma, timeout))
>  			return false;
>
>  		/* Recompute the timeout after waiting. */
>  		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
>  			int64_t time = os_time_get_nano();
>  			timeout = abs_timeout > time ? abs_timeout - time : 0;
>  		}
>  	}
>
>  	if (!rfence->gfx)
>  		return true;
>
> +	/* Flush the gfx IB if it hasn't been flushed yet. */
> +	if (rctx &&
> +	    rfence->gfx_unflushed.ctx == rctx &&
> +	    rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
> +		rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);
> +		rfence->gfx_unflushed.ctx = NULL;
> +
> +		if (!timeout)
> +			return false;
> +
> +		/* Recompute the remaining timeout after the flush. */
> +		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
> +			int64_t time = os_time_get_nano();
> +			timeout = abs_timeout > time ? abs_timeout - time : 0;
> +		}
> +	}
> +
>  	return rws->fence_wait(rws, rfence->gfx, timeout);
>  }
>
>  static void r600_query_memory_info(struct pipe_screen *screen,
>  				   struct pipe_memory_info *info)
>  {
>  	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
>  	struct radeon_winsys *ws = rscreen->ws;
>  	unsigned vram_usage, gtt_usage;
>
>