[Mesa-dev] [PATCH] radeonsi: add flexible shader descriptor management and use it for sampler views

Christian König deathsimple at vodafone.de
Thu Aug 15 10:12:59 PDT 2013


Am 15.08.2013 19:01, schrieb Marek Olšák:
> (This should be applied before MSAA, which will need to be rebased.)
>
> It moves all sampler view descriptors to a buffer.
> It supports partial resource updates and it can also unbind resources
> (required for FMASK texturing).
>
> The buffer contains all sampler view descriptors for one shader stage,
> represented as an array. On top of that, there are N arrays in the buffer,
> which are used to emulate context registers as implemented by the previous
> ASICs (each array is a context).
>
> This uses the RCU synchronization approach to avoid read-after-write hazards
> as discussed in the thread:
> "radeonsi: add FMASK texture binding slots and resource setup"
>
> CP DMA is used to clear the descriptors at context initialization and to copy
> the descriptors from one context to the next.
>
> v2: - use PKT3_DMA_DATA on CIK (I'll test CIK later)
>      - turn the bool CP DMA parameters into self-explanatory flags
>      - add a nice simple API for packet emission to radeon_winsys.h
>      - use 256 contexts, 128 causes texture corruption in openarena
>
> DISCUSSION:
>    Maybe there is a synchronization issue and we don't actually need so many
>    contexts? We can always flush KCACHE at the "end" of the ring if needed.

Well, you definitely need to flush the texture cache when changing the 
descriptors, but I assume you already do so.

Going to dig up the documentation for it, but 256 indeed sounds a bit much.

Christian.

> ---
>   src/gallium/drivers/radeonsi/Makefile.sources  |   1 +
>   src/gallium/drivers/radeonsi/r600_blit.c       |  12 +-
>   src/gallium/drivers/radeonsi/r600_hw_context.c |  22 +-
>   src/gallium/drivers/radeonsi/radeonsi_pipe.c   |   7 +-
>   src/gallium/drivers/radeonsi/radeonsi_pipe.h   |  19 +-
>   src/gallium/drivers/radeonsi/si_descriptors.c  | 355 +++++++++++++++++++++++++
>   src/gallium/drivers/radeonsi/si_state.c        |  47 +---
>   src/gallium/drivers/radeonsi/si_state.h        |  56 ++++
>   src/gallium/drivers/radeonsi/si_state_draw.c   |  18 +-
>   src/gallium/drivers/radeonsi/sid.h             |  54 ++++
>   src/gallium/winsys/radeon/drm/radeon_winsys.h  |  12 +
>   11 files changed, 547 insertions(+), 56 deletions(-)
>   create mode 100644 src/gallium/drivers/radeonsi/si_descriptors.c
>
> diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
> index b3ffa72..68c8282 100644
> --- a/src/gallium/drivers/radeonsi/Makefile.sources
> +++ b/src/gallium/drivers/radeonsi/Makefile.sources
> @@ -10,6 +10,7 @@ C_SOURCES := \
>   	r600_translate.c \
>   	radeonsi_pm4.c \
>   	radeonsi_compute.c \
> +	si_descriptors.c \
>   	si_state.c \
>   	si_state_streamout.c \
>   	si_state_draw.c \
> diff --git a/src/gallium/drivers/radeonsi/r600_blit.c b/src/gallium/drivers/radeonsi/r600_blit.c
> index bab108e..bdd9bb4 100644
> --- a/src/gallium/drivers/radeonsi/r600_blit.c
> +++ b/src/gallium/drivers/radeonsi/r600_blit.c
> @@ -70,12 +70,12 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
>   
>   	if (op & R600_SAVE_TEXTURES) {
>   		util_blitter_save_fragment_sampler_states(
> -			rctx->blitter, rctx->ps_samplers.n_samplers,
> -			(void**)rctx->ps_samplers.samplers);
> +			rctx->blitter, rctx->samplers[PIPE_SHADER_FRAGMENT].n_samplers,
> +			(void**)rctx->samplers[PIPE_SHADER_FRAGMENT].samplers);
>   
> -		util_blitter_save_fragment_sampler_views(
> -			rctx->blitter, rctx->ps_samplers.n_views,
> -			(struct pipe_sampler_view**)rctx->ps_samplers.views);
> +		util_blitter_save_fragment_sampler_views(rctx->blitter,
> +			util_last_bit(rctx->samplers[PIPE_SHADER_FRAGMENT].views.desc.enabled_mask),
> +			rctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
>   	}
>   
>   	if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) {
> @@ -224,7 +224,7 @@ void si_flush_depth_textures(struct r600_context *rctx,
>   		struct pipe_sampler_view *view;
>   		struct r600_texture *tex;
>   
> -		view = &textures->views[i]->base;
> +		view = textures->views.views[i];
>   		if (!view) continue;
>   
>   		tex = (struct r600_texture *)view->texture;
> diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c
> index 25c972b..bc6ba0b 100644
> --- a/src/gallium/drivers/radeonsi/r600_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c
> @@ -114,9 +114,17 @@ err:
>   void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
>   			boolean count_draw_in)
>   {
> +	int i;
> +
>   	/* The number of dwords we already used in the CS so far. */
>   	num_dw += ctx->cs->cdw;
>   
> +	for (i = 0; i < SI_NUM_ATOMS(ctx); i++) {
> +		if (ctx->atoms.array[i]->dirty) {
> +			num_dw += ctx->atoms.array[i]->num_dw;
> +		}
> +	}
> +
>   	if (count_draw_in) {
>   		/* The number of dwords all the dirty states would take. */
>   		num_dw += ctx->pm4_dirty_cdwords;
> @@ -254,6 +262,15 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
>   	ctx->pm4_dirty_cdwords = 0;
>   	ctx->flags = 0;
>   
> +	/* Set all valid groups as dirty so they get re-emitted on
> +	 * the next draw command.
> +	 */
> +	si_pm4_reset_emitted(ctx);
> +
> +	/* The CS initialization should be emitted before everything else. */
> +	si_pm4_emit(ctx, ctx->queued.named.init);
> +	ctx->emitted.named.init = ctx->queued.named.init;
> +
>   #if 0
>   	if (streamout_suspended) {
>   		ctx->streamout_start = TRUE;
> @@ -266,10 +283,7 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
>   		r600_context_queries_resume(ctx);
>   	}
>   
> -	/* set all valid group as dirty so they get reemited on
> -	 * next draw command
> -	 */
> -	si_pm4_reset_emitted(ctx);
> +	si_all_descriptors_begin_new_cs(ctx);
>   }
>   
>   void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
> index b4a1ca9..9afc7f2 100644
> --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
> +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
> @@ -178,6 +178,8 @@ static void r600_destroy_context(struct pipe_context *context)
>   {
>   	struct r600_context *rctx = (struct r600_context *)context;
>   
> +	si_release_all_descriptors(rctx);
> +
>   	si_resource_reference(&rctx->border_color_table, NULL);
>   
>   	if (rctx->dummy_pixel_shader) {
> @@ -231,12 +233,15 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
>   		rctx->context.create_video_buffer = vl_video_buffer_create;
>   	}
>   
> +	rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL);
> +
> +	si_init_all_descriptors(rctx);
> +
>   	switch (rctx->chip_class) {
>   	case SI:
>   	case CIK:
>   		si_init_state_functions(rctx);
>   		LIST_INITHEAD(&rctx->active_query_list);
> -		rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL);
>   		rctx->max_db = 8;
>   		si_init_config(rctx);
>   		break;
> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
> index 6fbe653..674c630 100644
> --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h
> +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
> @@ -94,11 +94,8 @@ struct si_cs_shader_state {
>   	struct si_pipe_compute		*program;
>   };
>   
> -/* needed for blitter save */
> -#define NUM_TEX_UNITS 16
> -
>   struct r600_textures_info {
> -	struct si_pipe_sampler_view	*views[NUM_TEX_UNITS];
> +	struct si_sampler_views		views;
>   	struct si_pipe_sampler_state	*samplers[NUM_TEX_UNITS];
>   	unsigned			n_views;
>   	uint32_t			depth_texture_mask; /* which textures are depth */
> @@ -131,6 +128,9 @@ struct r600_constbuf_state
>   	uint32_t			dirty_mask;
>   };
>   
> +#define SI_NUM_ATOMS(rctx) (sizeof((rctx)->atoms)/sizeof((rctx)->atoms.array[0]))
> +#define SI_NUM_SHADERS (PIPE_SHADER_FRAGMENT+1)
> +
>   struct r600_context {
>   	struct pipe_context		context;
>   	struct blitter_context		*blitter;
> @@ -142,6 +142,14 @@ struct r600_context {
>   	void				*custom_dsa_flush_inplace;
>   	struct r600_screen		*screen;
>   	struct radeon_winsys		*ws;
> +
> +	union {
> +		struct {
> +			struct si_atom *sampler_views[SI_NUM_SHADERS];
> +		};
> +		struct si_atom *array[0];
> +	} atoms;
> +
>   	struct si_vertex_element	*vertex_elements;
>   	struct pipe_framebuffer_state	framebuffer;
>   	unsigned			pa_sc_line_stipple;
> @@ -161,8 +169,7 @@ struct r600_context {
>   	unsigned			sprite_coord_enable;
>   	unsigned			export_16bpc;
>   	struct r600_constbuf_state	constbuf_state[PIPE_SHADER_TYPES];
> -	struct r600_textures_info	vs_samplers;
> -	struct r600_textures_info	ps_samplers;
> +	struct r600_textures_info	samplers[SI_NUM_SHADERS];
>   	struct si_resource		*border_color_table;
>   	unsigned			border_color_offset;
>   
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> new file mode 100644
> index 0000000..f05c8f4
> --- /dev/null
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -0,0 +1,355 @@
> +/*
> + * Copyright 2013 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * on the rights to use, copy, modify, merge, publish, distribute, sub
> + * license, and/or sell copies of the Software, and to permit persons to whom
> + * the Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * Authors:
> + *      Marek Olšák <marek.olsak at amd.com>
> + */
> +
> +#include "radeonsi_pipe.h"
> +#include "radeonsi_resource.h"
> +#include "radeonsi_shader.h"
> +#include "r600_hw_context_priv.h"
> +
> +#include "util/u_memory.h"
> +
> +#define SI_NUM_CONTEXTS 256
> +
> +static const uint32_t null_desc[8]; /* zeros */
> +
> +/* Set this if you want the 3D engine to wait until CP DMA is done.
> + * It should be set on the last CP DMA packet. */
> +#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
> +
> +/* Set this if the source data was used as a destination in a previous CP DMA
> + * packet. It's for preventing a read-after-write (RAW) hazard between two
> + * CP DMA packets. */
> +#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
> +
> +/* Emit a CP DMA packet to do a copy from one buffer to another.
> + * The size must fit in bits [20:0].
> + */
> +static void si_emit_cp_dma_copy_buffer(struct r600_context *rctx,
> +				       uint64_t dst_va, uint64_t src_va,
> +				       unsigned size, unsigned flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
> +	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
> +
> +	assert(size);
> +	assert((size & ((1<<21)-1)) == size);
> +
> +	if (rctx->chip_class >= CIK) {
> +		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
> +		radeon_emit(cs, sync_flag);		/* CP_SYNC [31] */
> +		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
> +		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
> +		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
> +		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
> +		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +	} else {
> +		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
> +		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
> +		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
> +		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
> +		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
> +		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +	}
> +}
> +
> +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
> +static void si_emit_cp_dma_clear_buffer(struct r600_context *rctx,
> +					uint64_t dst_va, unsigned size,
> +					uint32_t clear_value, unsigned flags)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
> +	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
> +
> +	assert(size);
> +	assert((size & ((1<<21)-1)) == size);
> +
> +	if (rctx->chip_class >= CIK) {
> +		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
> +		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
> +		radeon_emit(cs, clear_value);		/* DATA [31:0] */
> +		radeon_emit(cs, 0);
> +		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
> +		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
> +		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +	} else {
> +		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
> +		radeon_emit(cs, clear_value);		/* DATA [31:0] */
> +		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
> +		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
> +		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
> +		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +	}
> +}
> +
> +static void si_init_descriptors(struct r600_context *rctx,
> +				struct si_descriptors *desc,
> +				unsigned shader_userdata_reg,
> +				unsigned element_dw_size,
> +				unsigned num_elements,
> +				void (*emit_func)(struct r600_context *ctx, struct si_atom *state))
> +{
> +	uint64_t va;
> +
> +	desc->atom.emit = emit_func;
> +	desc->shader_userdata_reg = shader_userdata_reg;
> +	desc->element_dw_size = element_dw_size;
> +	desc->num_elements = num_elements;
> +	desc->context_size = num_elements * element_dw_size * 4;
> +
> +	desc->buffer = (struct si_resource*)
> +		pipe_buffer_create(rctx->context.screen, PIPE_BIND_CUSTOM,
> +				   PIPE_USAGE_STATIC,
> +				   SI_NUM_CONTEXTS * desc->context_size);
> +
> +	r600_context_bo_reloc(rctx, desc->buffer, RADEON_USAGE_READWRITE);
> +	va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
> +
> +	/* We don't check for CS space here, because this should be called
> +	 * only once at context initialization. */
> +	si_emit_cp_dma_clear_buffer(rctx, va, desc->buffer->b.b.width0, 0,
> +				    R600_CP_DMA_SYNC);
> +}
> +
> +static void si_release_descriptors(struct si_descriptors *desc)
> +{
> +	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
> +}
> +
> +static void si_update_descriptors(struct si_descriptors *desc)
> +{
> +	if (desc->dirty_mask) {
> +		desc->atom.num_dw =
> +			7 + /* copy */
> +			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
> +			4; /* pointer update */
> +		desc->atom.dirty = true;
> +	} else {
> +		desc->atom.dirty = false;
> +	}
> +}
> +
> +static void si_emit_shader_pointer(struct r600_context *rctx,
> +				   struct si_descriptors *desc)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint64_t va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b) +
> +		      desc->current_context_id * desc->context_size;
> +
> +	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
> +	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
> +	radeon_emit(cs, va);
> +	radeon_emit(cs, va >> 32);
> +}
> +
> +static void si_emit_descriptors(struct r600_context *rctx,
> +				struct si_descriptors *desc,
> +				const uint32_t **descriptors)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint64_t va_base;
> +	int packet_start;
> +	int packet_size = 0;
> +	int last_index = desc->num_elements; /* point to a non-existing element */
> +	unsigned dirty_mask = desc->dirty_mask;
> +	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
> +
> +	assert(dirty_mask);
> +
> +	va_base = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
> +
> +	/* Copy the descriptors to a new context slot. */
> +	si_emit_cp_dma_copy_buffer(rctx,
> +				   va_base + new_context_id * desc->context_size,
> +				   va_base + desc->current_context_id * desc->context_size,
> +				   desc->context_size, R600_CP_DMA_SYNC);
> +
> +	va_base += new_context_id * desc->context_size;
> +
> +	/* Update the descriptors.
> +	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
> +	 *
> +	 * XXX When unbinding lots of resources, consider clearing the memory
> +	 *     with CP DMA instead of emitting zeros.
> +	 */
> +	while (dirty_mask) {
> +		int i = u_bit_scan(&dirty_mask);
> +
> +		assert(i < desc->num_elements);
> +
> +		if (last_index+1 == i && packet_size) {
> +			/* Append new data at the end of the last packet. */
> +			packet_size += desc->element_dw_size;
> +			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
> +		} else {
> +			/* Start a new packet. */
> +			uint64_t va = va_base + i * desc->element_dw_size * 4;
> +
> +			packet_start = cs->cdw;
> +			packet_size = 2 + desc->element_dw_size;
> +
> +			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
> +			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
> +					     PKT3_WRITE_DATA_WR_CONFIRM |
> +					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
> +			radeon_emit(cs, va & 0xFFFFFFFFUL);
> +			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
> +		}
> +
> +		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
> +
> +		last_index = i;
> +	}
> +
> +	desc->dirty_mask = 0;
> +	desc->current_context_id = new_context_id;
> +
> +	/* Now update the shader userdata pointer. */
> +	si_emit_shader_pointer(rctx, desc);
> +}
> +
> +static unsigned si_get_shader_user_data_base(unsigned shader)
> +{
> +	switch (shader) {
> +	case PIPE_SHADER_VERTEX:
> +		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
> +	case PIPE_SHADER_GEOMETRY:
> +		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
> +	case PIPE_SHADER_FRAGMENT:
> +		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
> +	default:
> +		assert(0);
> +		return 0;
> +	}
> +}
> +
> +/* SAMPLER VIEWS */
> +
> +static void si_emit_sampler_views(struct r600_context *rctx, struct si_atom *atom)
> +{
> +	struct si_sampler_views *views = (struct si_sampler_views*)atom;
> +
> +	si_emit_descriptors(rctx, &views->desc, views->desc_data);
> +}
> +
> +static void si_init_sampler_views(struct r600_context *rctx,
> +				  struct si_sampler_views *views,
> +				  unsigned shader)
> +{
> +	si_init_descriptors(rctx, &views->desc,
> +			    si_get_shader_user_data_base(shader) +
> +			    SI_SGPR_RESOURCE * 4,
> +			    8, 16, si_emit_sampler_views);
> +}
> +
> +static void si_release_sampler_views(struct si_sampler_views *views)
> +{
> +	int i;
> +
> +	for (i = 0; i < Elements(views->views); i++) {
> +		pipe_sampler_view_reference(&views->views[i], NULL);
> +	}
> +	si_release_descriptors(&views->desc);
> +}
> +
> +static void si_sampler_views_begin_new_cs(struct r600_context *rctx,
> +					  struct si_sampler_views *views)
> +{
> +	unsigned mask = views->desc.enabled_mask;
> +
> +	/* Add relocations to the CS. */
> +	while (mask) {
> +		int i = u_bit_scan(&mask);
> +		struct si_pipe_sampler_view *rview =
> +			(struct si_pipe_sampler_view*)views->views[i];
> +
> +		r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
> +	}
> +
> +	r600_context_bo_reloc(rctx, views->desc.buffer, RADEON_USAGE_READWRITE);
> +
> +	si_emit_shader_pointer(rctx, &views->desc);
> +}
> +
> +void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
> +			 unsigned slot, struct pipe_sampler_view *view,
> +			 unsigned *view_desc)
> +{
> +	struct si_sampler_views *views = &rctx->samplers[shader].views;
> +
> +	if (views->views[slot] == view)
> +		return;
> +
> +	if (view) {
> +		struct si_pipe_sampler_view *rview =
> +			(struct si_pipe_sampler_view*)view;
> +
> +		r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
> +
> +		pipe_sampler_view_reference(&views->views[slot], view);
> +		views->desc_data[slot] = view_desc;
> +		views->desc.enabled_mask |= 1 << slot;
> +	} else {
> +		pipe_sampler_view_reference(&views->views[slot], NULL);
> +		views->desc_data[slot] = null_desc;
> +		views->desc.enabled_mask &= ~(1 << slot);
> +	}
> +
> +	views->desc.dirty_mask |= 1 << slot;
> +	si_update_descriptors(&views->desc);
> +}
> +
> +/* INIT/DEINIT */
> +
> +void si_init_all_descriptors(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_init_sampler_views(rctx, &rctx->samplers[i].views, i);
> +
> +		rctx->atoms.sampler_views[i] = &rctx->samplers[i].views.desc.atom;
> +	}
> +}
> +
> +void si_release_all_descriptors(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_release_sampler_views(&rctx->samplers[i].views);
> +	}
> +}
> +
> +void si_all_descriptors_begin_new_cs(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_sampler_views_begin_new_cs(rctx, &rctx->samplers[i].views);
> +	}
> +}
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 1dd51a8..633cc7d 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -2489,26 +2489,17 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
>   }
>   
>   static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
> -						 unsigned count,
> -						 struct pipe_sampler_view **views,
> -						 struct r600_textures_info *samplers,
> -						 unsigned user_data_reg)
> +						 unsigned shader, unsigned count,
> +						 struct pipe_sampler_view **views)
>   {
> -	struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views;
> +	struct r600_textures_info *samplers = &rctx->samplers[shader];
> +	struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views;
>   	struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
> -	int i, j;
> -
> -	if (!count)
> -		goto out;
> +	int i;
>   
>   	si_pm4_inval_texture_cache(pm4);
>   
> -	si_pm4_sh_data_begin(pm4);
>   	for (i = 0; i < count; i++) {
> -		pipe_sampler_view_reference(
> -			(struct pipe_sampler_view **)&samplers->views[i],
> -			views[i]);
> -
>   		if (views[i]) {
>   			struct r600_texture *rtex =
>   				(struct r600_texture*)views[i]->texture;
> @@ -2519,25 +2510,17 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
>   				samplers->depth_texture_mask &= ~(1 << i);
>   			}
>   
> -			si_pm4_add_bo(pm4, resource[i]->resource, RADEON_USAGE_READ);
> +			si_set_sampler_view(rctx, shader, i, views[i], rviews[i]->state);
>   		} else {
>   			samplers->depth_texture_mask &= ~(1 << i);
> -		}
> -
> -		for (j = 0; j < Elements(resource[i]->state); ++j) {
> -			si_pm4_sh_data_add(pm4, resource[i] ? resource[i]->state[j] : 0);
> +			si_set_sampler_view(rctx, shader, i, NULL, NULL);
>   		}
>   	}
> -
> -	for (i = count; i < NUM_TEX_UNITS; i++) {
> -		if (samplers->views[i])
> -			pipe_sampler_view_reference((struct pipe_sampler_view **)&samplers->views[i], NULL);
> +	for (; i < samplers->n_views; i++) {
> +		si_set_sampler_view(rctx, shader, i, NULL, NULL);
>   	}
>   
> -	si_pm4_sh_data_end(pm4, user_data_reg, SI_SGPR_RESOURCE);
> -
> -out:
> -	rctx->ps_samplers.n_views = count;
> +	samplers->n_views = count;
>   	return pm4;
>   }
>   
> @@ -2547,8 +2530,7 @@ static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_set_sampler_views(rctx, count, views, &rctx->vs_samplers,
> -			    R_00B130_SPI_SHADER_USER_DATA_VS_0);
> +	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views);
>   	si_pm4_set_state(rctx, vs_sampler_views, pm4);
>   }
>   
> @@ -2558,8 +2540,7 @@ static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_set_sampler_views(rctx, count, views, &rctx->ps_samplers,
> -				  R_00B030_SPI_SHADER_USER_DATA_PS_0);
> +	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views);
>   	si_pm4_set_state(rctx, ps_sampler_views, pm4);
>   }
>   
> @@ -2642,7 +2623,7 @@ static void si_bind_vs_sampler_states(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->vs_samplers,
> +	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_VERTEX],
>   			      R_00B130_SPI_SHADER_USER_DATA_VS_0);
>   	si_pm4_set_state(rctx, vs_sampler, pm4);
>   }
> @@ -2652,7 +2633,7 @@ static void si_bind_ps_sampler_states(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->ps_samplers,
> +	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_FRAGMENT],
>   			      R_00B030_SPI_SHADER_USER_DATA_PS_0);
>   	si_pm4_set_state(rctx, ps_sampler, pm4);
>   }
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 7ce084e..610303b 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -29,6 +29,14 @@
>   
>   #include "radeonsi_pm4.h"
>   
> +/* This encapsulates a state or an operation which can be emitted into the GPU
> + * command stream. */
> +struct si_atom {
> +	void (*emit)(struct r600_context *ctx, struct si_atom *state);
> +	unsigned		num_dw;
> +	bool			dirty;
> +};
> +
>   struct si_state_blend {
>   	struct si_pm4_state	pm4;
>   	uint32_t		cb_target_mask;
> @@ -103,6 +111,46 @@ union si_state {
>   	struct si_pm4_state	*array[0];
>   };
>   
> +#define NUM_TEX_UNITS 16
> +
> +/* This represents resource descriptors in memory, such as buffer resources,
> + * image resources, and sampler states.
> + */
> +struct si_descriptors {
> +	struct si_atom atom;
> +
> +	/* The size of one resource descriptor. */
> +	unsigned element_dw_size;
> +	/* The maximum number of resource descriptors. */
> +	unsigned num_elements;
> +
> +	/* The buffer where resource descriptors are stored. */
> +	struct si_resource *buffer;
> +
> +	/* The i-th bit is set if that element is dirty (changed but not emitted). */
> +	unsigned dirty_mask;
> +	/* The i-th bit is set if that element is enabled (non-NULL resource). */
> +	unsigned enabled_mask;
> +
> +	/* We can't update descriptors directly because the GPU might be
> +	 * reading them at the same time, so we have to update them
> +	 * in a copy-on-write manner. Each such copy is called a context,
> +	 * which is just another array of descriptors in the same buffer. */
> +	unsigned current_context_id;
> +	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
> +	unsigned context_size;
> +
> +	/* The shader userdata register where the 64-bit pointer to the descriptor
> +	 * array will be stored. */
> +	unsigned shader_userdata_reg;
> +};
> +
> +struct si_sampler_views {
> +	struct si_descriptors		desc;
> +	struct pipe_sampler_view	*views[NUM_TEX_UNITS];
> +	const uint32_t			*desc_data[NUM_TEX_UNITS];
> +};
> +
>   #define si_pm4_block_idx(member) \
>   	(offsetof(union si_state, named.member) / sizeof(struct si_pm4_state *))
>   
> @@ -133,6 +181,14 @@ union si_state {
>   		} \
>   	} while(0)
>   
> +/* si_descriptors.c */
> +void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
> +			 unsigned slot, struct pipe_sampler_view *view,
> +			 unsigned *view_desc);
> +void si_init_all_descriptors(struct r600_context *rctx);
> +void si_release_all_descriptors(struct r600_context *rctx);
> +void si_all_descriptors_begin_new_cs(struct r600_context *rctx);
> +
>   /* si_state.c */
>   struct si_pipe_shader_selector;
>   
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 3363d46..8a8ffcd 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -410,11 +410,10 @@ static void si_update_derived_state(struct r600_context *rctx)
>   
>   	if (!rctx->blitter->running) {
>   		/* Flush depth textures which need to be flushed. */
> -		if (rctx->vs_samplers.depth_texture_mask) {
> -			si_flush_depth_textures(rctx, &rctx->vs_samplers);
> -		}
> -		if (rctx->ps_samplers.depth_texture_mask) {
> -			si_flush_depth_textures(rctx, &rctx->ps_samplers);
> +		for (int i = 0; i < SI_NUM_SHADERS; i++) {
> +			if (rctx->samplers[i].depth_texture_mask) {
> +				si_flush_depth_textures(rctx, &rctx->samplers[i]);
> +			}
>   		}
>   	}
>   
> @@ -649,7 +648,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   {
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct pipe_index_buffer ib = {};
> -	uint32_t cp_coher_cntl;
> +	uint32_t cp_coher_cntl, i;
>   
>   	if (!info->count && (info->indexed || !info->count_from_stream_output))
>   		return;
> @@ -702,6 +701,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   
>   	si_need_cs_space(rctx, 0, TRUE);
>   
> +	for (i = 0; i < SI_NUM_ATOMS(rctx); i++) {
> +		if (rctx->atoms.array[i]->dirty) {
> +			rctx->atoms.array[i]->emit(rctx, rctx->atoms.array[i]);
> +			rctx->atoms.array[i]->dirty = false;
> +		}
> +	}
> +
>   	si_pm4_emit_dirty(rctx);
>   	rctx->pm4_dirty_cdwords = 0;
>   
> diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
> index 208d3a8..57ce72e 100644
> --- a/src/gallium/drivers/radeonsi/sid.h
> +++ b/src/gallium/drivers/radeonsi/sid.h
> @@ -134,6 +134,60 @@
>   #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
>   #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
>   
> +#define PKT3_CP_DMA					0x41
> +/* 1. header
> + * 2. SRC_ADDR_LO [31:0] or DATA [31:0]
> + * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0]
> + * 4. DST_ADDR_LO [31:0]
> + * 5. DST_ADDR_HI [15:0]
> + * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
> + */
> +#define PKT3_CP_DMA_CP_SYNC       (1 << 31)
> +#define PKT3_CP_DMA_SRC_SEL(x)       ((x) << 29)
> +/* 0 - SRC_ADDR
> + * 1 - GDS (program SAS to 1 as well)
> + * 2 - DATA
> + */
> +#define PKT3_CP_DMA_DST_SEL(x)       ((x) << 20)
> +/* 0 - DST_ADDR
> + * 1 - GDS (program DAS to 1 as well)
> + */
> +/* COMMAND */
> +#define PKT3_CP_DMA_CMD_SRC_SWAP(x) ((x) << 23)
> +/* 0 - none
> + * 1 - 8 in 16
> + * 2 - 8 in 32
> + * 3 - 8 in 64
> + */
> +#define PKT3_CP_DMA_CMD_DST_SWAP(x) ((x) << 24)
> +/* 0 - none
> + * 1 - 8 in 16
> + * 2 - 8 in 32
> + * 3 - 8 in 64
> + */
> +#define PKT3_CP_DMA_CMD_SAS       (1 << 26)
> +/* 0 - memory
> + * 1 - register
> + */
> +#define PKT3_CP_DMA_CMD_DAS       (1 << 27)
> +/* 0 - memory
> + * 1 - register
> + */
> +#define PKT3_CP_DMA_CMD_SAIC      (1 << 28)
> +#define PKT3_CP_DMA_CMD_DAIC      (1 << 29)
> +#define PKT3_CP_DMA_CMD_RAW_WAIT  (1 << 30)
> +
> +#define PKT3_DMA_DATA					0x50 /* new for CIK */
> +/* 1. header
> + * 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0]
> + * 3. SRC_ADDR_LO [31:0] or DATA [31:0]
> + * 4. SRC_ADDR_HI [31:0]
> + * 5. DST_ADDR_LO [31:0]
> + * 6. DST_ADDR_HI [31:0]
> + * 7. COMMAND [29:22] | BYTE_COUNT [20:0]
> + */
> +
> +
>   #define R_0084FC_CP_STRMOUT_CNTL		                        0x0084FC
>   #define   S_0084FC_OFFSET_UPDATE_DONE(x)		              (((x) & 0x1) << 0)
>   #define R_0085F0_CP_COHER_CNTL                                          0x0085F0
> diff --git a/src/gallium/winsys/radeon/drm/radeon_winsys.h b/src/gallium/winsys/radeon/drm/radeon_winsys.h
> index a619d70..9c6589a 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_winsys.h
> +++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h
> @@ -501,4 +501,16 @@ struct radeon_winsys {
>                               enum radeon_value_id value);
>   };
>   
> +static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
> +{
> +    cs->buf[cs->cdw++] = value;
> +}
> +
> +static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs,
> +				     const uint32_t *values, unsigned count)
> +{
> +    memcpy(cs->buf+cs->cdw, values, count * 4);
> +    cs->cdw += count;
> +}
> +
>   #endif



More information about the mesa-dev mailing list