[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Thu May 18 09:31:13 UTC 2017

On 17.05.2017 21:38, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> This decreases the size of CE RAM dumps to L2, or the size of descriptor
> uploads without CE.
> ---
>  src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
>  src/gallium/drivers/radeonsi/si_descriptors.c   | 85 ++++++++++++++++++++-----
>  src/gallium/drivers/radeonsi/si_state.h         | 18 +++++-
>  src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
>  4 files changed, 113 insertions(+), 24 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 22ef111..4c98066 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>  			return NULL;
>  		}
>  	}
>
>  	return program;
>  }
>
>  static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>  {
>  	struct si_context *sctx = (struct si_context*)ctx;
> -	sctx->cs_shader_state.program = (struct si_compute*)state;
> +	struct si_compute *program = (struct si_compute*)state;
> +
> +	sctx->cs_shader_state.program = program;
> +	if (!program)
> +		return;
> +
> +	/* Wait because we need active slot usage masks. */
> +	if (program->ir_type == PIPE_SHADER_IR_TGSI)
> +		util_queue_fence_wait(&program->ready);
> +
> +	si_set_active_descriptors(sctx,
> +				  SI_DESCS_FIRST_COMPUTE +
> +				  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
> +				  program->active_const_and_shader_buffers);
> +	si_set_active_descriptors(sctx,
> +				  SI_DESCS_FIRST_COMPUTE +
> +				  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
> +				  program->active_samplers_and_images);
>  }
>
>  static void si_set_global_binding(
>  	struct pipe_context *ctx, unsigned first, unsigned n,
>  	struct pipe_resource **resources,
>  	uint32_t **handles)
>  {
>  	unsigned i;
>  	struct si_context *sctx = (struct si_context*)ctx;
>  	struct si_compute *program = sctx->cs_shader_state.program;
> @@ -749,26 +766,23 @@ static void si_launch_grid(
>  	bool cs_regalloc_hang =
>  		(sctx->b.chip_class == SI ||
>  		 sctx->b.family == CHIP_BONAIRE ||
>  		 sctx->b.family == CHIP_KABINI) &&
>  		info->block[0] * info->block[1] * info->block[2] > 256;
>
>  	if (cs_regalloc_hang)
>  		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  				 SI_CONTEXT_CS_PARTIAL_FLUSH;
>
> -	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
> -		util_queue_fence_wait(&program->ready);
> -
> -		if (program->shader.compilation_failed)
> -			return;
> -	}
> +	if (program->ir_type == PIPE_SHADER_IR_TGSI &&
> +	    program->shader.compilation_failed)
> +		return;
>
>  	si_decompress_compute_textures(sctx);
>
>  	/* Add buffer sizes for memory checking in need_cs_space. */
>  	r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
>  	/* TODO: add the scratch buffer */
>
>  	if (info->indirect) {
>  		r600_context_add_resource_size(ctx, info->indirect);
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 38e4ae1..a2f40a8 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc,
>  	}
>  }
>
>  static void si_release_descriptors(struct si_descriptors *desc)
>  {
>  	r600_resource_reference(&desc->buffer, NULL);
>  	FREE(desc->list);
>  }
>
>  static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
> -			 unsigned *out_offset, struct r600_resource **out_buf) {
> +			 unsigned *out_offset, struct r600_resource **out_buf)
> +{
>  	uint64_t va;
>
>  	u_suballocator_alloc(sctx->ce_suballocator, size,
> -			     sctx->screen->b.info.tcc_cache_line_size,
> -			     out_offset, (struct pipe_resource**)out_buf);
> +			     si_optimal_tcc_alignment(sctx, size),
> +			     (unsigned*)out_offset,

The extra cast of out_offset is unnecessary: the parameter is already 
declared as "unsigned *out_offset" in si_ce_upload's signature.

> +			     (struct pipe_resource**)out_buf);
>  	if (!out_buf)
>  			return false;
>
>  	va = (*out_buf)->gpu_address + *out_offset;
>
>  	radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
>  	radeon_emit(sctx->ce_ib, ce_offset);
>  	radeon_emit(sctx->ce_ib, size / 4);
>  	radeon_emit(sctx->ce_ib, va);
>  	radeon_emit(sctx->ce_ib, va >> 32);
> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
>  	radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
>  	radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
>  	                CONTEXT_CONTROL_LOAD_CE_RAM(1));
>  	radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>  }
>
>  static bool si_upload_descriptors(struct si_context *sctx,
>  				  struct si_descriptors *desc,
>  				  struct r600_atom * atom)
>  {
> -	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
> +	unsigned slot_size = desc->element_dw_size * 4;
> +	unsigned first_slot_offset = desc->first_active_slot * slot_size;
> +	unsigned upload_size = desc->num_active_slots * slot_size;
> +
> +	if (!upload_size)
> +		return true;

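The early-out here means that desc->num_active_slots *does* control what 
is written to CE RAM, contrary to what its descriptive comment in 
si_state.h says ("It doesn't skip uploads to CE RAM"): when upload_size 
is 0, the function returns before the WRITE_CONST_RAM loop below has a 
chance to run. The early-out check needs to be moved further down, after 
that loop.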

Cheers,
Nicolai

>
>  	if (sctx->ce_ib && desc->uses_ce) {
>  		uint32_t const* list = (uint32_t const*)desc->list;
>
>  		while(desc->dirty_mask) {
>  			int begin, count;
>  			u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
>  						       &count);
>
>  			begin *= desc->element_dw_size;
>  			count *= desc->element_dw_size;
>
>  			radeon_emit(sctx->ce_ib,
>  			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
>  			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
>  			radeon_emit_array(sctx->ce_ib, list + begin, count);
>  		}
>
> -		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
> -		                           &desc->buffer_offset, &desc->buffer))
> +		if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
> +				  upload_size, (unsigned*)&desc->buffer_offset,
> +				  &desc->buffer))
>  			return false;
>  	} else {
> -		void *ptr;
> +		uint32_t *ptr;
>
> -		u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
> -			       sctx->screen->b.info.tcc_cache_line_size,
> -			       &desc->buffer_offset,
> -			       (struct pipe_resource**)&desc->buffer, &ptr);
> +		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
> +			       si_optimal_tcc_alignment(sctx, upload_size),
> +			       (unsigned*)&desc->buffer_offset,
> +			       (struct pipe_resource**)&desc->buffer,
> +			       (void**)&ptr);
>  		if (!desc->buffer)
>  			return false; /* skip the draw call */
>
> -		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
> -		desc->gpu_list = ptr;
> +		util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
> +					upload_size);
> +		desc->gpu_list = ptr - first_slot_offset / 4;
>
>  		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
>  	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
>  	}
> +
> +	/* The shader pointer should point to slot 0. */
> +	desc->buffer_offset -= first_slot_offset;
> +
>  	desc->dirty_mask = 0;
>
>  	if (atom)
>  		si_mark_atom_dirty(sctx, atom);
>
>  	return true;
>  }
>
>  static void
>  si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
> @@ -1023,21 +1037,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
>  	desc_list_byte_size = velems->desc_list_byte_size;
>  	first_vb_use_mask = velems->first_vb_use_mask;
>
>  	/* Vertex buffer descriptors are the only ones which are uploaded
>  	 * directly through a staging buffer and don't go through
>  	 * the fine-grained upload path.
>  	 */
>  	u_upload_alloc(sctx->b.b.const_uploader, 0,
>  		       desc_list_byte_size,
>  		       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
> -		       &desc->buffer_offset,
> +		       (unsigned*)&desc->buffer_offset,
>  		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
>  	if (!desc->buffer)
>  		return false;
>
>  	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>  			      desc->buffer, RADEON_USAGE_READ,
>  			      RADEON_PRIO_DESCRIPTORS);
>
>  	assert(count <= SI_MAX_ATTRIBS);
>
> @@ -1883,21 +1897,22 @@ void si_shader_change_notify(struct si_context *sctx)
>  	}
>  }
>
>  static void si_emit_shader_pointer(struct si_context *sctx,
>  				   struct si_descriptors *desc,
>  				   unsigned sh_base)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	uint64_t va;
>
> -	assert(desc->buffer);
> +	if (!desc->buffer)
> +		return; /* the pointer is not used by current shaders */
>
>  	va = desc->buffer->gpu_address +
>  	     desc->buffer_offset;
>
>  	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
>  	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
>  	radeon_emit(cs, va);
>  	radeon_emit(cs, va >> 32);
>  }
>
> @@ -2026,20 +2041,22 @@ void si_init_all_descriptors(struct si_context *sctx)
>  	}
>
>  	si_init_buffer_resources(&sctx->rw_buffers,
>  				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
>  				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
>  				 /* The second set of usage/priority is used by
>  				  * const buffers in RW buffer slots. */
>  				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
>  				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
>  				 &ce_offset);
> +	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
> +
>  	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
>  			    4, SI_NUM_VERTEX_BUFFERS, NULL);
>
>  	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
>  	sctx->total_ce_ram_allocated = ce_offset;
>
>  	if (sctx->b.chip_class >= GFX9)
>  		assert(ce_offset <= 4096);
>  	else
>  		assert(ce_offset <= 32768);
> @@ -2148,10 +2165,48 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
>  		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
>  	}
>  	si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
>  	si_vertex_buffers_begin_new_cs(sctx);
>
>  	for (i = 0; i < SI_NUM_DESCS; ++i)
>  		si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
>
>  	si_shader_userdata_begin_new_cs(sctx);
>  }
> +
> +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
> +			       uint64_t new_active_mask)
> +{
> +	struct si_descriptors *desc = &sctx->descriptors[desc_idx];
> +
> +	/* Ignore no-op updates and updates that disable all slots. */
> +	if (!new_active_mask ||
> +	    new_active_mask == u_bit_consecutive64(desc->first_active_slot,
> +						   desc->num_active_slots))
> +		return;
> +
> +	int first, count;
> +	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
> +	assert(new_active_mask == 0);
> +
> +	/* Upload/dump descriptors if slots are being enabled. */
> +	if (first < desc->first_active_slot ||
> +	    first + count > desc->first_active_slot + desc->num_active_slots)
> +		sctx->descriptors_dirty |= 1u << desc_idx;
> +
> +	desc->first_active_slot = first;
> +	desc->num_active_slots = count;
> +}
> +
> +void si_set_active_descriptors_for_shader(struct si_context *sctx,
> +					  struct si_shader_selector *sel)
> +{
> +	if (!sel)
> +		return;
> +
> +	si_set_active_descriptors(sctx,
> +		si_const_and_shader_buffer_descriptors_idx(sel->type),
> +		sel->active_const_and_shader_buffers);
> +	si_set_active_descriptors(sctx,
> +		si_sampler_and_image_descriptors_idx(sel->type),
> +		sel->active_samplers_and_images);
> +}
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index f2003a5..dfabaa3 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -35,20 +35,21 @@
>
>  #define SI_MAX_ATTRIBS			16
>  #define SI_NUM_VERTEX_BUFFERS		SI_MAX_ATTRIBS
>  #define SI_NUM_SAMPLERS			32 /* OpenGL textures units per shader */
>  #define SI_NUM_CONST_BUFFERS		16
>  #define SI_NUM_IMAGES			16
>  #define SI_NUM_SHADER_BUFFERS		16
>
>  struct si_screen;
>  struct si_shader;
> +struct si_shader_selector;
>
>  struct si_state_blend {
>  	struct si_pm4_state	pm4;
>  	uint32_t		cb_target_mask;
>  	bool			alpha_to_coverage;
>  	bool			alpha_to_one;
>  	bool			dual_src_blend;
>  	/* Set 0xf or 0x0 (4 bits) per render target if the following is
>  	 * true. ANDed with spi_shader_col_format.
>  	 */
> @@ -215,26 +216,34 @@ struct si_descriptors {
>  	uint32_t *list;
>  	/* The list in mapped GPU memory. */
>  	uint32_t *gpu_list;
>  	/* The size of one descriptor. */
>  	unsigned element_dw_size;
>  	/* The maximum number of descriptors. */
>  	unsigned num_elements;
>
>  	/* The buffer where the descriptors have been uploaded. */
>  	struct r600_resource *buffer;
> -	unsigned buffer_offset;
> +	int buffer_offset; /* can be negative if not using lower slots */
>
>  	/* Offset in CE RAM */
>  	unsigned ce_offset;
>
> -	/* elements of the list that are changed and need to be uploaded */
> +	/* Slots that are used by currently-bound shaders.
> +	 * With CE: It determines which slots are dumped to L2.
> +	 *          It doesn't skip uploads to CE RAM.
> +	 * Without CE: It determines which slots are uploaded.
> +	 */
> +	unsigned first_active_slot;
> +	unsigned num_active_slots;
> +
> +	/* Slots that have been changed and need to be uploaded. */
>  	uint64_t dirty_mask;
>
>  	/* Whether CE is used to upload this descriptor array. */
>  	bool uses_ce;
>
>  	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
>  	 * array will be stored. */
>  	unsigned shader_userdata_offset;
>  };
>
> @@ -308,20 +317,25 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx);
>  void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
>  			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
>  void si_update_all_texture_descriptors(struct si_context *sctx);
>  void si_shader_change_notify(struct si_context *sctx);
>  void si_update_compressed_colortex_masks(struct si_context *sctx);
>  void si_emit_graphics_shader_userdata(struct si_context *sctx,
>                                        struct r600_atom *atom);
>  void si_emit_compute_shader_userdata(struct si_context *sctx);
>  void si_set_rw_buffer(struct si_context *sctx,
>  		      uint slot, const struct pipe_constant_buffer *input);
> +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
> +			       uint64_t new_active_mask);
> +void si_set_active_descriptors_for_shader(struct si_context *sctx,
> +					  struct si_shader_selector *sel);
> +
>  /* si_state.c */
>  struct si_shader_selector;
>
>  void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
>  		  struct r600_atom **list_elem,
>  		  void (*emit_func)(struct si_context *ctx, struct r600_atom *state));
>  void si_init_state_functions(struct si_context *sctx);
>  void si_init_screen_state_functions(struct si_screen *sscreen);
>  void
>  si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 45d996b..8ac4309 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2144,20 +2144,21 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
>  	struct si_shader_selector *sel = state;
>
>  	if (sctx->vs_shader.cso == sel)
>  		return;
>
>  	sctx->vs_shader.cso = sel;
>  	sctx->vs_shader.current = sel ? sel->first_variant : NULL;
>  	sctx->do_update_shaders = true;
>  	si_mark_atom_dirty(sctx, &sctx->clip_regs);
>  	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> +	si_set_active_descriptors_for_shader(sctx, sel);
>  }
>
>  static void si_update_tess_uses_prim_id(struct si_context *sctx)
>  {
>  	sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
>  		(sctx->tes_shader.cso &&
>  		 sctx->tes_shader.cso->info.uses_primid) ||
>  		(sctx->tcs_shader.cso &&
>  		 sctx->tcs_shader.cso->info.uses_primid) ||
>  		(sctx->gs_shader.cso &&
> @@ -2181,38 +2182,41 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
>  	sctx->do_update_shaders = true;
>  	si_mark_atom_dirty(sctx, &sctx->clip_regs);
>  	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
>
>  	if (enable_changed) {
>  		si_shader_change_notify(sctx);
>  		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
>  			si_update_tess_uses_prim_id(sctx);
>  	}
>  	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> +	si_set_active_descriptors_for_shader(sctx, sel);
>  }
>
>  static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	struct si_shader_selector *sel = state;
>  	bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
>
>  	if (sctx->tcs_shader.cso == sel)
>  		return;
>
>  	sctx->tcs_shader.cso = sel;
>  	sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
>  	si_update_tess_uses_prim_id(sctx);
>  	sctx->do_update_shaders = true;
>
>  	if (enable_changed)
>  		sctx->last_tcs = NULL; /* invalidate derived tess state */
> +
> +	si_set_active_descriptors_for_shader(sctx, sel);
>  }
>
>  static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	struct si_shader_selector *sel = state;
>  	bool enable_changed = !!sctx->tes_shader.cso != !!sel;
>
>  	if (sctx->tes_shader.cso == sel)
>  		return;
> @@ -2223,37 +2227,39 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
>  	si_update_tess_uses_prim_id(sctx);
>  	sctx->do_update_shaders = true;
>  	si_mark_atom_dirty(sctx, &sctx->clip_regs);
>  	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
>
>  	if (enable_changed) {
>  		si_shader_change_notify(sctx);
>  		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
>  	}
>  	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> +	si_set_active_descriptors_for_shader(sctx, sel);
>  }
>
>  static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	struct si_shader_selector *sel = state;
>
>  	/* skip if supplied shader is one already in use */
>  	if (sctx->ps_shader.cso == sel)
>  		return;
>
>  	sctx->ps_shader.cso = sel;
>  	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
>  	sctx->do_update_shaders = true;
>  	if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
>  		si_update_tess_uses_prim_id(sctx);
>  	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
> +	si_set_active_descriptors_for_shader(sctx, sel);
>  }
>
>  static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
>  {
>  	if (shader->is_optimized) {
>  		util_queue_fence_wait(&shader->optimized_ready);
>  		util_queue_fence_destroy(&shader->optimized_ready);
>  	}
>
>  	if (shader->pm4) {
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.