[Mesa-dev] [PATCH 4/4] radeonsi: use SDMA for uploading data through const_uploader

Thu Feb 21 00:06:40 UTC 2019

The _rev2_ version of this series (Mesa Patchwork) is

Tested-by: Dieter Nützel <Dieter at nuetzel-hh.de>

on Polaris 20

UH+UV are working flawlessly now.
No 'measurable' speed decrease. - GREAT!
Blender, FreeCAD, glmark2 all fine.

But I had to rebase part 4 (see attachment).

Dieter

Am 07.02.2019 02:22, schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> ---
>  src/gallium/drivers/radeonsi/si_buffer.c | 56 ++++++++++++++++++++++--
>  src/gallium/drivers/radeonsi/si_dma_cs.c | 19 ++++----
>  src/gallium/drivers/radeonsi/si_gfx_cs.c | 42 +++++++++++++++---
>  src/gallium/drivers/radeonsi/si_pipe.c   | 23 ++++++----
>  src/gallium/drivers/radeonsi/si_pipe.h   | 17 +++++++
>  5 files changed, 131 insertions(+), 26 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_buffer.c
> b/src/gallium/drivers/radeonsi/si_buffer.c
> index c01118ce96a..3f8db7cf4f0 100644
> --- a/src/gallium/drivers/radeonsi/si_buffer.c
> +++ b/src/gallium/drivers/radeonsi/si_buffer.c
> @@ -433,21 +433,29 @@ static void *si_buffer_transfer_map(struct
> pipe_context *ctx,
> 
>  		if (si_invalidate_buffer(sctx, buf)) {
>  			/* At this point, the buffer is always idle. */
>  			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
>  		} else {
>  			/* Fall back to a temporary buffer. */
>  			usage |= PIPE_TRANSFER_DISCARD_RANGE;
>  		}
>  	}
> 
> -	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
> +	if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
> +	    buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) 
> {
> +		usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
> +			   PIPE_TRANSFER_PERSISTENT);
> +		usage |= PIPE_TRANSFER_DISCARD_RANGE;
> +		force_discard_range = true;
> +	}
> +
> +	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
>  	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
>  			 PIPE_TRANSFER_PERSISTENT))) ||
>  	     (buf->flags & RADEON_FLAG_SPARSE))) {
>  		assert(usage & PIPE_TRANSFER_WRITE);
> 
>  		/* Check if mapping this buffer would cause waiting for the GPU.
>  		 */
>  		if (buf->flags & RADEON_FLAG_SPARSE ||
>  		    force_discard_range ||
>  		    si_rings_is_buffer_referenced(sctx, buf->buf, 
> RADEON_USAGE_READWRITE) ||
> @@ -514,32 +522,72 @@ static void *si_buffer_transfer_map(struct
> pipe_context *ctx,
>  	data += box->x;
> 
>  	return si_buffer_get_transfer(ctx, resource, usage, box,
>  					ptransfer, data, NULL, 0);
>  }
> 
>  static void si_buffer_do_flush_region(struct pipe_context *ctx,
>  				      struct pipe_transfer *transfer,
>  				      const struct pipe_box *box)
>  {
> +	struct si_context *sctx = (struct si_context*)ctx;
>  	struct si_transfer *stransfer = (struct si_transfer*)transfer;
>  	struct si_resource *buf = si_resource(transfer->resource);
> 
>  	if (stransfer->staging) {
>  		unsigned src_offset = stransfer->offset +
>  				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
>  				      (box->x - transfer->box.x);
> 
> +		if (buf->b.b.flags & 
> SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
> +			/* This should be true for all uploaders. */
> +			assert(transfer->box.x == 0);
> +
> +			/* Find a previous upload and extend its range. The last
> +			 * upload is likely to be at the end of the list.
> +			 */
> +			for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
> +				struct si_sdma_upload *up = &sctx->sdma_uploads[i];
> +
> +				if (up->dst != buf)
> +					continue;
> +
> +				assert(up->src == stransfer->staging);
> +				assert(box->x > up->dst_offset);
> +				up->size = box->x + box->width - up->dst_offset;
> +				return;
> +			}
> +
> +			/* Enlarge the array if it's full. */
> +			if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
> +				unsigned size;
> +
> +				sctx->max_sdma_uploads += 4;
> +				size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
> +				sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
> +			}
> +
> +			/* Add a new upload. */
> +			struct si_sdma_upload *up =
> +				&sctx->sdma_uploads[sctx->num_sdma_uploads++];
> +			up->dst = up->src = NULL;
> +			si_resource_reference(&up->dst, buf);
> +			si_resource_reference(&up->src, stransfer->staging);
> +			up->dst_offset = box->x;
> +			up->src_offset = src_offset;
> +			up->size = box->width;
> +			return;
> +		}
> +
>  		/* Copy the staging buffer into the original one. */
> -		si_copy_buffer((struct si_context*)ctx, transfer->resource,
> -			       &stransfer->staging->b.b, box->x, src_offset,
> -			       box->width);
> +		si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
> +			       box->x, src_offset, box->width);
>  	}
> 
>  	util_range_add(&buf->valid_buffer_range, box->x,
>  		       box->x + box->width);
>  }
> 
>  static void si_buffer_flush_region(struct pipe_context *ctx,
>  				   struct pipe_transfer *transfer,
>  				   const struct pipe_box *rel_box)
>  {
> diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c
> b/src/gallium/drivers/radeonsi/si_dma_cs.c
> index 2aafc1f09a0..bba1bd95826 100644
> --- a/src/gallium/drivers/radeonsi/si_dma_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
> @@ -133,21 +133,22 @@ void si_need_dma_space(struct si_context *ctx,
> unsigned num_dw,
>  	if (dst) {
>  		vram += dst->vram_usage;
>  		gtt += dst->gart_usage;
>  	}
>  	if (src) {
>  		vram += src->vram_usage;
>  		gtt += src->gart_usage;
>  	}
> 
>  	/* Flush the GFX IB if DMA depends on it. */
> -	if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
> +	if (!ctx->sdma_uploads_in_progress &&
> +	    radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
>  	    ((dst &&
>  	      ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
>  					  RADEON_USAGE_READWRITE)) ||
>  	     (src &&
>  	      ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
>  					  RADEON_USAGE_WRITE))))
>  		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, 
> NULL);
> 
>  	/* Flush if there's not enough space, or if the memory usage per IB
>  	 * is too large.
> @@ -155,45 +156,47 @@ void si_need_dma_space(struct si_context *ctx,
> unsigned num_dw,
>  	 * IBs using too little memory are limited by the IB submission 
> overhead.
>  	 * IBs using too much memory are limited by the kernel/TTM overhead.
>  	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
>  	 *
>  	 * This heuristic makes sure that DMA requests are executed
>  	 * very soon after the call is made and lowers memory usage.
>  	 * It improves texture upload performance by keeping the DMA
>  	 * engine busy while uploads are being submitted.
>  	 */
>  	num_dw++; /* for emit_wait_idle below */
> -	if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
> -	    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 
> 1024 ||
> -	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, 
> gtt)) {
> +	if (!ctx->sdma_uploads_in_progress &&
> +	    (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
> +	     ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 
> 1024 ||
> +	     !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, 
> gtt))) {
>  		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
>  		assert((num_dw + ctx->dma_cs->current.cdw) <= 
> ctx->dma_cs->current.max_dw);
>  	}
> 
>  	/* Wait for idle if either buffer has been used in the IB before to
>  	 * prevent read-after-write hazards.
>  	 */
>  	if ((dst &&
>  	     ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
>  					 RADEON_USAGE_READWRITE)) ||
>  	    (src &&
>  	     ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
>  					 RADEON_USAGE_WRITE)))
>  		si_dma_emit_wait_idle(ctx);
> 
> +	unsigned sync = ctx->sdma_uploads_in_progress ? 0 : 
> RADEON_USAGE_SYNCHRONIZED;
>  	if (dst) {
> -		radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
> -					  RADEON_USAGE_WRITE, 0);
> +		ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
> +				  dst->domains, 0);
>  	}
>  	if (src) {
> -		radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
> -					  RADEON_USAGE_READ, 0);
> +		ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
> +				  src->domains, 0);
>  	}
> 
>  	/* this function is called before all DMA calls, so increment this. 
> */
>  	ctx->num_dma_calls++;
>  }
> 
>  void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
>  		     struct pipe_fence_handle **fence)
>  {
>  	struct radeon_cmdbuf *cs = ctx->dma_cs;
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> index 3d64587fa2b..13d5b5a959a 100644
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> @@ -19,20 +19,21 @@
>   * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT 
> SHALL
>   * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
>   * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 
> OR
>   * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
> OR THE
>   * USE OR OTHER DEALINGS IN THE SOFTWARE.
>   */
> 
>  #include "si_pipe.h"
> 
>  #include "util/os_time.h"
> +#include "util/u_upload_mgr.h"
> 
>  /* initialize */
>  void si_need_gfx_cs_space(struct si_context *ctx)
>  {
>  	struct radeon_cmdbuf *cs = ctx->gfx_cs;
> 
>  	/* There is no need to flush the DMA IB here, because
>  	 * si_need_dma_space always flushes the GFX IB if there is
>  	 * a conflict, which means any unflushed DMA commands automatically
>  	 * precede the GFX IB (= they had no dependency on the GFX IB when
> @@ -57,20 +58,29 @@ void si_need_gfx_cs_space(struct si_context *ctx)
>  	 * and just flush if there is not enough space left.
>  	 *
>  	 * Also reserve space for stopping queries at the end of IB, because
>  	 * the number of active queries is mostly unlimited.
>  	 */
>  	unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
>  	if (!ctx->ws->cs_check_space(cs, need_dwords))
>  		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, 
> NULL);
>  }
> 
> +void si_unref_sdma_uploads(struct si_context *sctx)
> +{
> +	for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
> +		si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
> +		si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
> +	}
> +	sctx->num_sdma_uploads = 0;
> +}
> +
>  void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
>  		     struct pipe_fence_handle **fence)
>  {
>  	struct radeon_cmdbuf *cs = ctx->gfx_cs;
>  	struct radeon_winsys *ws = ctx->ws;
>  	unsigned wait_flags = 0;
> 
>  	if (ctx->gfx_flush_in_progress)
>  		return;
> 
> @@ -91,31 +101,51 @@ void si_flush_gfx_cs(struct si_context *ctx,
> unsigned flags,
>  	if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
>  	    (!wait_flags || !ctx->gfx_last_ib_is_busy))
>  		return;
> 
>  	if (si_check_device_reset(ctx))
>  		return;
> 
>  	if (ctx->screen->debug_flags & DBG(CHECK_VM))
>  		flags &= ~PIPE_FLUSH_ASYNC;
> 
> +	ctx->gfx_flush_in_progress = true;
> +
>  	/* If the state tracker is flushing the GFX IB, si_flush_from_st is
>  	 * responsible for flushing the DMA IB and merging the fences from 
> both.
> -	 * This code is only needed when the driver flushes the GFX IB
> -	 * internally, and it never asks for a fence handle.
> +	 * If the driver flushes the GFX IB internally, and it should never 
> ask
> +	 * for a fence handle.
>  	 */
> -	if (radeon_emitted(ctx->dma_cs, 0)) {
> -		assert(fence == NULL); /* internal flushes only */
> -		si_flush_dma_cs(ctx, flags, NULL);
> +	assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
> +
> +	/* Update the sdma_uploads list by flushing the uploader. */
> +	u_upload_unmap(ctx->b.const_uploader);
> +
> +	/* Execute SDMA uploads. */
> +	ctx->sdma_uploads_in_progress = true;
> +	for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
> +		struct si_sdma_upload *up = &ctx->sdma_uploads[i];
> +		struct pipe_box box;
> +
> +		assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
> +		       up->size % 4 == 0);
> +
> +		u_box_1d(up->src_offset, up->size, &box);
> +		ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
> +			      &up->src->b.b, 0, &box);
>  	}
> +	ctx->sdma_uploads_in_progress = false;
> +	si_unref_sdma_uploads(ctx);
> 
> -	ctx->gfx_flush_in_progress = true;
> +	/* Flush SDMA (preamble IB). */
> +	if (radeon_emitted(ctx->dma_cs, 0))
> +		si_flush_dma_cs(ctx, flags, NULL);
> 
>  	if (!LIST_IS_EMPTY(&ctx->active_queries))
>  		si_suspend_queries(ctx);
> 
>  	ctx->streamout.suspended = false;
>  	if (ctx->streamout.begin_emitted) {
>  		si_emit_streamout_end(ctx);
>  		ctx->streamout.suspended = true;
>  	}
> 
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
> b/src/gallium/drivers/radeonsi/si_pipe.c
> index c6f93e7b15e..c0ee2b1a6dc 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -257,20 +257,21 @@ static void si_destroy_context(struct
> pipe_context *context)
>  	si_saved_cs_reference(&sctx->current_saved_cs, NULL);
> 
>  	_mesa_hash_table_destroy(sctx->tex_handles, NULL);
>  	_mesa_hash_table_destroy(sctx->img_handles, NULL);
> 
>  	util_dynarray_fini(&sctx->resident_tex_handles);
>  	util_dynarray_fini(&sctx->resident_img_handles);
>  	util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
>  	util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
>  	util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
> +	si_unref_sdma_uploads(sctx);
>  	FREE(sctx);
>  }
> 
>  static enum pipe_reset_status si_get_reset_status(struct pipe_context 
> *ctx)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
> 
>  	if (sctx->screen->info.has_gpu_reset_status_query)
>  		return sctx->ws->ctx_query_reset_status(sctx->ctx);
> 
> @@ -436,43 +437,49 @@ static struct pipe_context
> *si_create_context(struct pipe_screen *screen,
>  				      SI_RESOURCE_FLAG_CLEAR, false);
>  	if (!sctx->allocator_zeroed_memory)
>  		goto fail;
> 
>  	sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
>  						    0, PIPE_USAGE_STREAM,
>  						    SI_RESOURCE_FLAG_READ_ONLY);
>  	if (!sctx->b.stream_uploader)
>  		goto fail;
> 
> -	sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
> -						   0, PIPE_USAGE_DEFAULT,
> -						   SI_RESOURCE_FLAG_32BIT |
> -						   (sscreen->cpdma_prefetch_writes_memory ?
> -							    0 : SI_RESOURCE_FLAG_READ_ONLY));
> -	if (!sctx->b.const_uploader)
> -		goto fail;
> -
>  	sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
>  						       0, PIPE_USAGE_STAGING, 0);
>  	if (!sctx->cached_gtt_allocator)
>  		goto fail;
> 
>  	sctx->ctx = sctx->ws->ctx_create(sctx->ws);
>  	if (!sctx->ctx)
>  		goto fail;
> 
>  	if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags &
> DBG(NO_ASYNC_DMA))) {
>  		sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
>  						   (void*)si_flush_dma_cs,
>  						   sctx, stop_exec_on_failure);
>  	}
> 
> +	bool use_sdma_upload = sscreen->info.has_dedicated_vram &&
> sctx->dma_cs && debug_get_bool_option("SDMA", true);
> +	sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
> +						 0, PIPE_USAGE_DEFAULT,
> +						 SI_RESOURCE_FLAG_32BIT |
> +						 (use_sdma_upload ?
> +							  SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
> +							  (sscreen->cpdma_prefetch_writes_memory ?
> +								   0 : SI_RESOURCE_FLAG_READ_ONLY)));
> +	if (!sctx->b.const_uploader)
> +		goto fail;
> +
> +	if (use_sdma_upload)
> +		u_upload_enable_flush_explicit(sctx->b.const_uploader);
> +
>  	si_init_buffer_functions(sctx);
>  	si_init_clear_functions(sctx);
>  	si_init_blit_functions(sctx);
>  	si_init_compute_functions(sctx);
>  	si_init_compute_blit_functions(sctx);
>  	si_init_debug_functions(sctx);
>  	si_init_msaa_functions(sctx);
>  	si_init_streamout_functions(sctx);
> 
>  	if (sscreen->info.has_hw_decode) {
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> index b01d5744752..b208bdeb848 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -103,20 +103,22 @@
>  #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
> 
>  #define SI_RESOURCE_FLAG_TRANSFER	(PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
>  #define SI_RESOURCE_FLAG_FLUSHED_DEPTH	(PIPE_RESOURCE_FLAG_DRV_PRIV << 
> 1)
>  #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING 
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
>  #define SI_RESOURCE_FLAG_DISABLE_DCC	(PIPE_RESOURCE_FLAG_DRV_PRIV << 
> 3)
>  #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
>  #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
>  #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
>  #define SI_RESOURCE_FLAG_CLEAR		(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
> +/* For const_uploader, upload data via GTT and copy to VRAM on
> context flush via SDMA. */
> +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
> 
>  enum si_clear_code
>  {
>  	DCC_CLEAR_COLOR_0000   = 0x00000000,
>  	DCC_CLEAR_COLOR_0001   = 0x40404040,
>  	DCC_CLEAR_COLOR_1110   = 0x80808080,
>  	DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,
>  	DCC_CLEAR_COLOR_REG    = 0x20202020,
>  	DCC_UNCOMPRESSED       = 0xFFFFFFFF,
>  };
> @@ -769,20 +771,28 @@ struct si_saved_cs {
>  	struct si_context	*ctx;
>  	struct radeon_saved_cs	gfx;
>  	struct si_resource	*trace_buf;
>  	unsigned		trace_id;
> 
>  	unsigned		gfx_last_dw;
>  	bool			flushed;
>  	int64_t			time_flush;
>  };
> 
> +struct si_sdma_upload {
> +	struct si_resource	*dst;
> +	struct si_resource	*src;
> +	unsigned		src_offset;
> +	unsigned		dst_offset;
> +	unsigned		size;
> +};
> +
>  struct si_context {
>  	struct pipe_context		b; /* base class */
> 
>  	enum radeon_family		family;
>  	enum chip_class			chip_class;
> 
>  	struct radeon_winsys		*ws;
>  	struct radeon_winsys_ctx	*ctx;
>  	struct radeon_cmdbuf		*gfx_cs;
>  	struct radeon_cmdbuf		*dma_cs;
> @@ -1074,20 +1084,26 @@ struct si_context {
>  	int				num_perfect_occlusion_queries;
>  	struct list_head		active_queries;
>  	unsigned			num_cs_dw_queries_suspend;
> 
>  	/* Render condition. */
>  	struct pipe_query		*render_cond;
>  	unsigned			render_cond_mode;
>  	bool				render_cond_invert;
>  	bool				render_cond_force_off; /* for u_blitter */
> 
> +	/* For uploading data via GTT and copy to VRAM on context flush via 
> SDMA. */
> +	bool				sdma_uploads_in_progress;
> +	struct si_sdma_upload		*sdma_uploads;
> +	unsigned			num_sdma_uploads;
> +	unsigned			max_sdma_uploads;
> +
>  	/* Statistics gathering for the DCC enablement heuristic. It can't be
>  	 * in si_texture because si_texture can be shared by multiple
>  	 * contexts. This is for back buffers only. We shouldn't get too many
>  	 * of those.
>  	 *
>  	 * X11 DRI3 rotates among a finite set of back buffers. They should
>  	 * all fit in this array. If they don't, separate DCC might never be
>  	 * enabled by DCC stat gathering.
>  	 */
>  	struct {
> @@ -1273,20 +1289,21 @@ struct pipe_fence_handle
> *si_create_fence(struct pipe_context *ctx,
>  					  struct tc_unflushed_batch_token *tc_token);
> 
>  /* si_get.c */
>  void si_init_screen_get_functions(struct si_screen *sscreen);
> 
>  /* si_gfx_cs.c */
>  void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
>  		     struct pipe_fence_handle **fence);
>  void si_begin_new_gfx_cs(struct si_context *ctx);
>  void si_need_gfx_cs_space(struct si_context *ctx);
> +void si_unref_sdma_uploads(struct si_context *sctx);
> 
>  /* si_gpu_load.c */
>  void si_gpu_load_kill_thread(struct si_screen *sscreen);
>  uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
>  unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
>  			uint64_t begin);
> 
>  /* si_compute.c */
>  void si_init_compute_functions(struct si_context *sctx);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 4-4-radeonsi-use-SDMA-for-uploading-data-through-const_uploader.patch
Type: text/x-diff
Size: 1108 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190221/ed97d60d/attachment-0001.patch>