[Mesa-dev] [PATCH] radeonsi: add flexible shader descriptor management and use it for sampler views

Thu Aug 15 01:27:23 PDT 2013

Am 15.08.2013 05:25, schrieb Marek Olšák:
> (This should be applied before MSAA, which will need to be rebased.)
>
> It moves all sampler view descriptors to a buffer.
> It supports partial resource updates and it can also unbind resources
> (required for FMASK texturing).
>
> The buffer contains all sampler view descriptors for one shader stage,
> represented as an array. On top of that, there are N arrays in the buffer,
> which are used to emulate context registers as implemented by the previous
> ASICs (each array is a context).
>
> This uses the RCU synchronization approach to avoid read-after-write hazards
> as discussed in the thread:
> "radeonsi: add FMASK texture binding slots and resource setup"
>
> CP DMA is used to clear the descriptors at context initialization and to copy
> the descriptors from one context to the next.
>
> IMPORTANT:
>    128 resource contexts are needed, 64 doesn't work. If I set
>    SH_KCACHE_ACTION_ENA before every draw call, only 2 contexts are needed.
>    I don't have an explanation for this.
> ---

The idea itself looks really good to me, but we should probably also
move all the resources and samplers to the new model and then rip out
the code that stores them directly into the IB.

> +/* Emit a CP DMA packet to do a copy from one buffer to another.
> + * The size must fit in bits [20:0]. Notes:
> + *
> + * 1) Set sync to true if you want the 3D engine to wait until CP DMA is done.
> + *
> + * 2) Set raw_hazard_wait to true if the source data was used as a destination
> + *    in a previous CP DMA packet. It's for preventing a read-after-write hazard
> + *    between two CP DMA packets.
> + */
> +static void si_emit_cp_dma_copy_buffer(struct r600_context *rctx,
> +				       uint64_t dst_va, uint64_t src_va,
> +				       unsigned size,
> +				       bool sync, bool raw_hazard_wait)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint32_t sync_flag = sync ? PKT3_CP_DMA_CP_SYNC : 0;
> +	uint32_t raw_wait = raw_hazard_wait ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
> +
> +	assert(size);
> +	assert((size & ((1<<21)-1)) == size);
> +
> +	cs->buf[cs->cdw++] = PKT3(PKT3_CP_DMA, 4, 0);
> +	cs->buf[cs->cdw++] = src_va;			/* SRC_ADDR_LO [31:0] */
> +	cs->buf[cs->cdw++] = sync_flag | ((src_va >> 32) & 0xff); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
> +	cs->buf[cs->cdw++] = dst_va;			/* DST_ADDR_LO [31:0] */
> +	cs->buf[cs->cdw++] = (dst_va >> 32) & 0xff;	/* DST_ADDR_HI [7:0] */
> +	cs->buf[cs->cdw++] = size | raw_wait;		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +}
> +
> +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
> +static void si_emit_cp_dma_clear_buffer(struct r600_context *rctx,
> +					uint64_t dst_va, unsigned size,
> +					uint32_t clear_value,
> +					bool sync, bool raw_hazard_wait)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint32_t sync_flag = sync ? PKT3_CP_DMA_CP_SYNC : 0;
> +	uint32_t raw_wait = raw_hazard_wait ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
> +
> +	assert(size);
> +	assert((size & ((1<<21)-1)) == size);
> +
> +	cs->buf[cs->cdw++] = PKT3(PKT3_CP_DMA, 4, 0);
> +	cs->buf[cs->cdw++] = clear_value;		/* DATA [31:0] */
> +	cs->buf[cs->cdw++] = sync_flag | PKT3_CP_DMA_SRC_SEL(2); /* CP_SYNC [31] | SRC_SEL[30:29] */
> +	cs->buf[cs->cdw++] = dst_va;			/* DST_ADDR_LO [31:0] */
> +	cs->buf[cs->cdw++] = (dst_va >> 32) & 0xff;	/* DST_ADDR_HI [7:0] */
> +	cs->buf[cs->cdw++] = size | raw_wait;		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
> +}

Can we use some kind of macro or inline function instead of
"cs->buf[cs->cdw++]"? That should help if we need to port that over to
a different CS mechanism.

And IIRC the CP DMA is identical on all chipset generations (maybe
excluding early R6xx, but I'm not 100% sure of that), so it might be a
good idea to start sharing code again by putting this under
"src/gallium/drivers/radeon/radeon_cp_dma.c". That's not necessary now;
I mean it more as a general idea. What do you think?

Christian.

> +
> +static void si_init_descriptors(struct r600_context *rctx,
> +				struct si_descriptors *desc,
> +				unsigned shader_userdata_reg,
> +				unsigned element_dw_size,
> +				unsigned num_elements,
> +				void (*emit_func)(struct r600_context *ctx, struct si_atom *state))
> +{
> +	uint64_t va;
> +
> +	desc->atom.emit = emit_func;
> +	desc->shader_userdata_reg = shader_userdata_reg;
> +	desc->element_dw_size = element_dw_size;
> +	desc->num_elements = num_elements;
> +	desc->context_size = num_elements * element_dw_size * 4;
> +
> +	desc->buffer = (struct si_resource*)
> +		pipe_buffer_create(rctx->context.screen, PIPE_BIND_CUSTOM,
> +				   PIPE_USAGE_STATIC,
> +				   SI_NUM_CONTEXTS * desc->context_size);
> +
> +	r600_context_bo_reloc(rctx, desc->buffer, RADEON_USAGE_READWRITE);
> +	va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
> +
> +	/* We don't check for CS space here, because this should be called
> +	 * only once at context initialization. */
> +	si_emit_cp_dma_clear_buffer(rctx, va, desc->buffer->b.b.width0, 0,
> +				    true, false);
> +}
> +
> +static void si_release_descriptors(struct si_descriptors *desc)
> +{
> +	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
> +}
> +
> +static void si_update_descriptors(struct si_descriptors *desc)
> +{
> +	if (desc->dirty_mask) {
> +		desc->atom.num_dw =
> +			6 + /* copy */
> +			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
> +			4; /* pointer update */
> +		desc->atom.dirty = true;
> +	} else {
> +		desc->atom.dirty = false;
> +	}
> +}
> +
> +static void si_emit_shader_pointer(struct r600_context *rctx,
> +				   struct si_descriptors *desc)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint64_t va = r600_resource_va(rctx->context.screen, &desc->buffer->b.b) +
> +		      desc->current_context_id * desc->context_size;
> +
> +	cs->buf[cs->cdw++] = PKT3(PKT3_SET_SH_REG, 2, 0);
> +	cs->buf[cs->cdw++] = (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2;
> +	cs->buf[cs->cdw++] = va;
> +	cs->buf[cs->cdw++] = va >> 32;
> +}
> +
> +static void si_emit_descriptors(struct r600_context *rctx,
> +				struct si_descriptors *desc,
> +				const uint32_t **descriptors)
> +{
> +	struct radeon_winsys_cs *cs = rctx->cs;
> +	uint64_t va_base;
> +	int packet_start;
> +	int packet_size = 0;
> +	int last_index = desc->num_elements; /* point to a non-existing element */
> +	unsigned dirty_mask = desc->dirty_mask;
> +	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
> +
> +	assert(dirty_mask);
> +
> +	va_base = r600_resource_va(rctx->context.screen, &desc->buffer->b.b);
> +
> +	/* Copy the descriptors to a new context slot. */
> +	si_emit_cp_dma_copy_buffer(rctx,
> +				   va_base + new_context_id * desc->context_size,
> +				   va_base + desc->current_context_id * desc->context_size,
> +				   desc->context_size, true, false);
> +
> +	va_base += new_context_id * desc->context_size;
> +
> +	/* Update the descriptors.
> +	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
> +	 *
> +	 * XXX When unbinding lots of resources, consider clearing the memory
> +	 *     with CP DMA instead of emitting zeros.
> +	 */
> +	while (dirty_mask) {
> +		int i = u_bit_scan(&dirty_mask);
> +
> +		assert(i < desc->num_elements);
> +
> +		if (last_index+1 == i && packet_size) {
> +			/* Append new data at the end of the last packet. */
> +			packet_size += desc->element_dw_size;
> +			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
> +		} else {
> +			/* Start a new packet. */
> +			uint64_t va = va_base + i * desc->element_dw_size * 4;
> +
> +			packet_start = cs->cdw;
> +			packet_size = 2 + desc->element_dw_size;
> +
> +			cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
> +			cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
> +					     PKT3_WRITE_DATA_WR_CONFIRM |
> +					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME);
> +			cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;
> +			cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL;
> +		}
> +
> +		memcpy(cs->buf+cs->cdw, descriptors[i], desc->element_dw_size * 4);
> +		cs->cdw += desc->element_dw_size;
> +
> +		last_index = i;
> +	}
> +
> +	desc->dirty_mask = 0;
> +	desc->current_context_id = new_context_id;
> +
> +	/* Now update the shader userdata pointer. */
> +	si_emit_shader_pointer(rctx, desc);
> +}
> +
> +static unsigned si_get_shader_user_data_base(unsigned shader)
> +{
> +	switch (shader) {
> +	case PIPE_SHADER_VERTEX:
> +		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
> +	case PIPE_SHADER_GEOMETRY:
> +		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
> +	case PIPE_SHADER_FRAGMENT:
> +		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
> +	default:
> +		assert(0);
> +		return 0;
> +	}
> +}
> +
> +/* SAMPLER VIEWS */
> +
> +static void si_emit_sampler_views(struct r600_context *rctx, struct si_atom *atom)
> +{
> +	struct si_sampler_views *views = (struct si_sampler_views*)atom;
> +
> +	si_emit_descriptors(rctx, &views->desc, views->desc_data);
> +}
> +
> +static void si_init_sampler_views(struct r600_context *rctx,
> +				  struct si_sampler_views *views,
> +				  unsigned shader)
> +{
> +	si_init_descriptors(rctx, &views->desc,
> +			    si_get_shader_user_data_base(shader) +
> +			    SI_SGPR_RESOURCE * 4,
> +			    8, 16, si_emit_sampler_views);
> +}
> +
> +static void si_release_sampler_views(struct si_sampler_views *views)
> +{
> +	int i;
> +
> +	for (i = 0; i < Elements(views->views); i++) {
> +		pipe_sampler_view_reference(&views->views[i], NULL);
> +	}
> +	si_release_descriptors(&views->desc);
> +}
> +
> +static void si_sampler_views_begin_new_cs(struct r600_context *rctx,
> +					  struct si_sampler_views *views)
> +{
> +	unsigned mask = views->desc.enabled_mask;
> +
> +	/* Add relocations to the CS. */
> +	while (mask) {
> +		int i = u_bit_scan(&mask);
> +		struct si_pipe_sampler_view *rview =
> +			(struct si_pipe_sampler_view*)views->views[i];
> +
> +		r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
> +	}
> +
> +	r600_context_bo_reloc(rctx, views->desc.buffer, RADEON_USAGE_READWRITE);
> +
> +	si_emit_shader_pointer(rctx, &views->desc);
> +}
> +
> +void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
> +			 unsigned slot, struct pipe_sampler_view *view,
> +			 unsigned *view_desc)
> +{
> +	struct si_sampler_views *views = &rctx->samplers[shader].views;
> +
> +	if (views->views[slot] == view)
> +		return;
> +
> +	if (view) {
> +		struct si_pipe_sampler_view *rview =
> +			(struct si_pipe_sampler_view*)view;
> +
> +		r600_context_bo_reloc(rctx, rview->resource, RADEON_USAGE_READ);
> +
> +		pipe_sampler_view_reference(&views->views[slot], view);
> +		views->desc_data[slot] = view_desc;
> +		views->desc.enabled_mask |= 1 << slot;
> +	} else {
> +		pipe_sampler_view_reference(&views->views[slot], NULL);
> +		views->desc_data[slot] = null_desc;
> +		views->desc.enabled_mask &= ~(1 << slot);
> +	}
> +
> +	views->desc.dirty_mask |= 1 << slot;
> +	si_update_descriptors(&views->desc);
> +}
> +
> +/* INIT/DEINIT */
> +
> +void si_init_all_descriptors(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_init_sampler_views(rctx, &rctx->samplers[i].views, i);
> +
> +		rctx->atoms.sampler_views[i] = &rctx->samplers[i].views.desc.atom;
> +	}
> +}
> +
> +void si_release_all_descriptors(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_release_sampler_views(&rctx->samplers[i].views);
> +	}
> +}
> +
> +void si_all_descriptors_begin_new_cs(struct r600_context *rctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < SI_NUM_SHADERS; i++) {
> +		si_sampler_views_begin_new_cs(rctx, &rctx->samplers[i].views);
> +	}
> +}
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 1dd51a8..633cc7d 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -2489,26 +2489,17 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
>   }
>   
>   static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
> -						 unsigned count,
> -						 struct pipe_sampler_view **views,
> -						 struct r600_textures_info *samplers,
> -						 unsigned user_data_reg)
> +						 unsigned shader, unsigned count,
> +						 struct pipe_sampler_view **views)
>   {
> -	struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views;
> +	struct r600_textures_info *samplers = &rctx->samplers[shader];
> +	struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views;
>   	struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
> -	int i, j;
> -
> -	if (!count)
> -		goto out;
> +	int i;
>   
>   	si_pm4_inval_texture_cache(pm4);
>   
> -	si_pm4_sh_data_begin(pm4);
>   	for (i = 0; i < count; i++) {
> -		pipe_sampler_view_reference(
> -			(struct pipe_sampler_view **)&samplers->views[i],
> -			views[i]);
> -
>   		if (views[i]) {
>   			struct r600_texture *rtex =
>   				(struct r600_texture*)views[i]->texture;
> @@ -2519,25 +2510,17 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
>   				samplers->depth_texture_mask &= ~(1 << i);
>   			}
>   
> -			si_pm4_add_bo(pm4, resource[i]->resource, RADEON_USAGE_READ);
> +			si_set_sampler_view(rctx, shader, i, views[i], rviews[i]->state);
>   		} else {
>   			samplers->depth_texture_mask &= ~(1 << i);
> -		}
> -
> -		for (j = 0; j < Elements(resource[i]->state); ++j) {
> -			si_pm4_sh_data_add(pm4, resource[i] ? resource[i]->state[j] : 0);
> +			si_set_sampler_view(rctx, shader, i, NULL, NULL);
>   		}
>   	}
> -
> -	for (i = count; i < NUM_TEX_UNITS; i++) {
> -		if (samplers->views[i])
> -			pipe_sampler_view_reference((struct pipe_sampler_view **)&samplers->views[i], NULL);
> +	for (; i < samplers->n_views; i++) {
> +		si_set_sampler_view(rctx, shader, i, NULL, NULL);
>   	}
>   
> -	si_pm4_sh_data_end(pm4, user_data_reg, SI_SGPR_RESOURCE);
> -
> -out:
> -	rctx->ps_samplers.n_views = count;
> +	samplers->n_views = count;
>   	return pm4;
>   }
>   
> @@ -2547,8 +2530,7 @@ static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_set_sampler_views(rctx, count, views, &rctx->vs_samplers,
> -			    R_00B130_SPI_SHADER_USER_DATA_VS_0);
> +	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views);
>   	si_pm4_set_state(rctx, vs_sampler_views, pm4);
>   }
>   
> @@ -2558,8 +2540,7 @@ static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_set_sampler_views(rctx, count, views, &rctx->ps_samplers,
> -				  R_00B030_SPI_SHADER_USER_DATA_PS_0);
> +	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views);
>   	si_pm4_set_state(rctx, ps_sampler_views, pm4);
>   }
>   
> @@ -2642,7 +2623,7 @@ static void si_bind_vs_sampler_states(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->vs_samplers,
> +	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_VERTEX],
>   			      R_00B130_SPI_SHADER_USER_DATA_VS_0);
>   	si_pm4_set_state(rctx, vs_sampler, pm4);
>   }
> @@ -2652,7 +2633,7 @@ static void si_bind_ps_sampler_states(struct pipe_context *ctx, unsigned count,
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct si_pm4_state *pm4;
>   
> -	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->ps_samplers,
> +	pm4 = si_bind_sampler_states(rctx, count, states, &rctx->samplers[PIPE_SHADER_FRAGMENT],
>   			      R_00B030_SPI_SHADER_USER_DATA_PS_0);
>   	si_pm4_set_state(rctx, ps_sampler, pm4);
>   }
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 7ce084e..610303b 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -29,6 +29,14 @@
>   
>   #include "radeonsi_pm4.h"
>   
> +/* This encapsulates a state or an operation which can emitted into the GPU
> + * command stream. */
> +struct si_atom {
> +	void (*emit)(struct r600_context *ctx, struct si_atom *state);
> +	unsigned		num_dw;
> +	bool			dirty;
> +};
> +
>   struct si_state_blend {
>   	struct si_pm4_state	pm4;
>   	uint32_t		cb_target_mask;
> @@ -103,6 +111,46 @@ union si_state {
>   	struct si_pm4_state	*array[0];
>   };
>   
> +#define NUM_TEX_UNITS 16
> +
> +/* This represents resource descriptors in memory, such as buffer resources,
> + * image resources, and sampler states.
> + */
> +struct si_descriptors {
> +	struct si_atom atom;
> +
> +	/* The size of one resource descriptor. */
> +	unsigned element_dw_size;
> +	/* The maximum number of resource descriptors. */
> +	unsigned num_elements;
> +
> +	/* The buffer where resource descriptors are stored. */
> +	struct si_resource *buffer;
> +
> +	/* The i-th bit is set if that element is dirty (changed but not emitted). */
> +	unsigned dirty_mask;
> +	/* The i-th bit is set if that element is enabled (non-NULL resource). */
> +	unsigned enabled_mask;
> +
> +	/* We can't update descriptors directly because the GPU might be
> +	 * reading them at the same time, so we have to update them
> +	 * in a copy-on-write manner. Each such copy is called a context,
> +	 * which is just another array descriptors in the same buffer. */
> +	unsigned current_context_id;
> +	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
> +	unsigned context_size;
> +
> +	/* The shader userdata register where the 64-bit pointer to the descriptor
> +	 * array will be stored. */
> +	unsigned shader_userdata_reg;
> +};
> +
> +struct si_sampler_views {
> +	struct si_descriptors		desc;
> +	struct pipe_sampler_view	*views[NUM_TEX_UNITS];
> +	const uint32_t			*desc_data[NUM_TEX_UNITS];
> +};
> +
>   #define si_pm4_block_idx(member) \
>   	(offsetof(union si_state, named.member) / sizeof(struct si_pm4_state *))
>   
> @@ -133,6 +181,14 @@ union si_state {
>   		} \
>   	} while(0)
>   
> +/* si_descriptors.c */
> +void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
> +			 unsigned slot, struct pipe_sampler_view *view,
> +			 unsigned *view_desc);
> +void si_init_all_descriptors(struct r600_context *rctx);
> +void si_release_all_descriptors(struct r600_context *rctx);
> +void si_all_descriptors_begin_new_cs(struct r600_context *rctx);
> +
>   /* si_state.c */
>   struct si_pipe_shader_selector;
>   
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 3363d46..8a8ffcd 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -410,11 +410,10 @@ static void si_update_derived_state(struct r600_context *rctx)
>   
>   	if (!rctx->blitter->running) {
>   		/* Flush depth textures which need to be flushed. */
> -		if (rctx->vs_samplers.depth_texture_mask) {
> -			si_flush_depth_textures(rctx, &rctx->vs_samplers);
> -		}
> -		if (rctx->ps_samplers.depth_texture_mask) {
> -			si_flush_depth_textures(rctx, &rctx->ps_samplers);
> +		for (int i = 0; i < SI_NUM_SHADERS; i++) {
> +			if (rctx->samplers[i].depth_texture_mask) {
> +				si_flush_depth_textures(rctx, &rctx->samplers[i]);
> +			}
>   		}
>   	}
>   
> @@ -649,7 +648,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   {
>   	struct r600_context *rctx = (struct r600_context *)ctx;
>   	struct pipe_index_buffer ib = {};
> -	uint32_t cp_coher_cntl;
> +	uint32_t cp_coher_cntl, i;
>   
>   	if (!info->count && (info->indexed || !info->count_from_stream_output))
>   		return;
> @@ -702,6 +701,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   
>   	si_need_cs_space(rctx, 0, TRUE);
>   
> +	for (i = 0; i < SI_NUM_ATOMS(rctx); i++) {
> +		if (rctx->atoms.array[i]->dirty) {
> +			rctx->atoms.array[i]->emit(rctx, rctx->atoms.array[i]);
> +			rctx->atoms.array[i]->dirty = false;
> +		}
> +	}
> +
>   	si_pm4_emit_dirty(rctx);
>   	rctx->pm4_dirty_cdwords = 0;
>   
> diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
> index 208d3a8..3ec110a 100644
> --- a/src/gallium/drivers/radeonsi/sid.h
> +++ b/src/gallium/drivers/radeonsi/sid.h
> @@ -134,6 +134,49 @@
>   #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
>   #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
>   
> +#define PKT3_CP_DMA					0x41
> +/* 1. header
> + * 2. SRC_ADDR_LO [31:0] or DATA [31:0]
> + * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [7:0]
> + * 4. DST_ADDR_LO [31:0]
> + * 5. DST_ADDR_HI [7:0]
> + * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
> + */
> +#define PKT3_CP_DMA_CP_SYNC       (1 << 31)
> +#define PKT3_CP_DMA_SRC_SEL(x)       ((x) << 29)
> +/* 0 - SRC_ADDR
> + * 1 - GDS (program SAS to 1 as well)
> + * 2 - DATA
> + */
> +#define PKT3_CP_DMA_DST_SEL(x)       ((x) << 20)
> +/* 0 - DST_ADDR
> + * 1 - GDS (program DAS to 1 as well)
> + */
> +/* COMMAND */
> +#define PKT3_CP_DMA_CMD_SRC_SWAP(x) ((x) << 23)
> +/* 0 - none
> + * 1 - 8 in 16
> + * 2 - 8 in 32
> + * 3 - 8 in 64
> + */
> +#define PKT3_CP_DMA_CMD_DST_SWAP(x) ((x) << 24)
> +/* 0 - none
> + * 1 - 8 in 16
> + * 2 - 8 in 32
> + * 3 - 8 in 64
> + */
> +#define PKT3_CP_DMA_CMD_SAS       (1 << 26)
> +/* 0 - memory
> + * 1 - register
> + */
> +#define PKT3_CP_DMA_CMD_DAS       (1 << 27)
> +/* 0 - memory
> + * 1 - register
> + */
> +#define PKT3_CP_DMA_CMD_SAIC      (1 << 28)
> +#define PKT3_CP_DMA_CMD_DAIC      (1 << 29)
> +#define PKT3_CP_DMA_CMD_RAW_WAIT  (1 << 30)
> +
>   #define R_0084FC_CP_STRMOUT_CNTL		                        0x0084FC
>   #define   S_0084FC_OFFSET_UPDATE_DONE(x)		              (((x) & 0x1) << 0)
>   #define R_0085F0_CP_COHER_CNTL                                          0x0085F0