[Mesa-dev] [PATCH 4/6] radeonsi: move vertex buffer descriptors from IB to memory

Marek Olšák maraeo at gmail.com
Sat Jul 12 09:35:35 PDT 2014


From: Marek Olšák <marek.olsak at amd.com>

This removes the intermediate storage (pm4 state) and generates descriptors
directly in a staging buffer.

It also reduces the number of flushes, because the descriptors no longer
take CS space.
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 131 ++++++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_pipe.h        |   4 +-
 src/gallium/drivers/radeonsi/si_pm4.c         |  31 ------
 src/gallium/drivers/radeonsi/si_pm4.h         |   4 -
 src/gallium/drivers/radeonsi/si_state.c       |   1 +
 src/gallium/drivers/radeonsi/si_state.h       |   4 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  64 +------------
 7 files changed, 133 insertions(+), 106 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a4e14a1..18cc947 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -166,11 +166,13 @@ static void si_update_descriptors(struct si_context *sctx,
 }
 
 static void si_emit_shader_pointer(struct si_context *sctx,
-				   struct si_descriptors *desc)
+				   struct r600_atom *atom)
 {
+	struct si_descriptors *desc = (struct si_descriptors*)atom;
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	uint64_t va = r600_resource_va(sctx->b.b.screen, &desc->buffer->b.b) +
-		      desc->current_context_id * desc->context_size;
+		      desc->current_context_id * desc->context_size +
+		      desc->buffer_offset;
 
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
 	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
@@ -255,7 +257,7 @@ static void si_emit_descriptors(struct si_context *sctx,
 	desc->current_context_id = new_context_id;
 
 	/* Now update the shader userdata pointer. */
-	si_emit_shader_pointer(sctx, desc);
+	si_emit_shader_pointer(sctx, &desc->atom);
 }
 
 static unsigned si_get_shader_user_data_base(unsigned shader)
@@ -332,7 +334,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 
-	si_emit_shader_pointer(sctx, &views->desc);
+	si_emit_shader_pointer(sctx, &views->desc.atom);
 }
 
 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -434,7 +436,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-	si_emit_shader_pointer(sctx, &states->desc);
+	si_emit_shader_pointer(sctx, &states->desc.atom);
 }
 
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -535,9 +537,119 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
 
-	si_emit_shader_pointer(sctx, &buffers->desc);
+	si_emit_shader_pointer(sctx, &buffers->desc.atom);
 }
 
+/* VERTEX BUFFERS */
+
+/* Add relocations on the gfx ring for every bound vertex buffer and for the
+ * descriptor buffer, then emit the shader pointer to the descriptors. */
+static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
+{
+	struct si_descriptors *descs = &sctx->vertex_buffers;
+	int num_elems = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
+	int e;
+
+	for (e = 0; e < num_elems; e++) {
+		int slot = sctx->vertex_elements->elements[e].vertex_buffer_index;
+
+		/* Only slots that are in range and have a buffer bound. */
+		if (slot < sctx->nr_vertex_buffers && sctx->vertex_buffer[slot].buffer) {
+			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+					      (struct r600_resource*)sctx->vertex_buffer[slot].buffer,
+					      RADEON_USAGE_READ,
+					      RADEON_PRIO_SHADER_BUFFER_RO);
+		}
+	}
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, descs->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	si_emit_shader_pointer(sctx, &descs->atom);
+}
+
+/**
+ * Write one 16-byte descriptor per bound vertex element into a freshly
+ * allocated staging range and mark the vertex buffer atom dirty. Elements
+ * whose buffer slot is out of range or unbound get an all-zero descriptor.
+ * Does nothing when no vertex elements state is bound or it has no elements.
+ */
+void si_update_vertex_buffers(struct si_context *sctx)
+{
+	struct pipe_context *ctx = &sctx->b.b;
+	struct si_descriptors *desc = &sctx->vertex_buffers;
+	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
+	/* Test the state pointer before reading the element count through it. */
+	unsigned i, count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
+	uint32_t *ptr;
+
+	if (!count)
+		return;
+
+	assert(count <= SI_NUM_VERTEX_BUFFERS);
+	assert(desc->current_context_id == 0);
+
+	/* Vertex buffer descriptors are the only ones which are uploaded
+	 * directly through a staging buffer and don't go through
+	 * the fine-grained upload path.
+	 */
+	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+			      desc->buffer, RADEON_USAGE_READ,
+			      RADEON_PRIO_SHADER_DATA);
+
+	for (i = 0; i < count; i++) {
+		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
+		struct pipe_vertex_buffer *vb;
+		uint32_t *dw = &ptr[i*4]; /* the 4 dwords of descriptor i */
+		unsigned offset;
+		uint64_t va;
+
+		/* Out-of-range or empty slot: emit a null descriptor. */
+		if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers ||
+		    !sctx->vertex_buffer[ve->vertex_buffer_index].buffer) {
+			memset(dw, 0, 16);
+			continue;
+		}
+
+		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
+		offset = vb->buffer_offset + ve->src_offset;
+		va = r600_resource_va(ctx->screen, vb->buffer) + offset;
+
+		/* Fill in T# buffer resource description */
+		dw[0] = va & 0xFFFFFFFF;
+		dw[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
+			S_008F04_STRIDE(vb->stride);
+		if (vb->stride)
+			/* Round up by rounding down and adding 1 */
+			dw[2] = (vb->buffer->width0 - offset -
+				 sctx->vertex_elements->format_size[i]) /
+				vb->stride + 1;
+		else
+			dw[2] = vb->buffer->width0 - offset;
+		dw[3] = sctx->vertex_elements->rsrc_word3[i];
+
+		/* Add each distinct vertex buffer to the CS only once. */
+		if (!bound[ve->vertex_buffer_index]) {
+			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+					      (struct r600_resource*)vb->buffer,
+					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
+			bound[ve->vertex_buffer_index] = true;
+		}
+	}
+
+	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
+	desc->atom.dirty = true;
+
+	/* Don't flush the const cache. It would have a very negative effect
+	 * on performance (confirmed by testing). New descriptors are always
+	 * uploaded to a fresh new buffer, so I don't think flushing the const
+	 * cache is needed. */
+	sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
+}
+
+
 /* CONSTANT BUFFERS */
 
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
@@ -1098,6 +1210,11 @@ void si_init_all_descriptors(struct si_context *sctx)
 		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
 	}
 
+	si_init_descriptors(sctx, &sctx->vertex_buffers,
+			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
+			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
+			    si_emit_shader_pointer);
+	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1117,6 +1234,7 @@ void si_release_all_descriptors(struct si_context *sctx)
 		si_release_sampler_views(&sctx->samplers[i].views);
 		si_release_descriptors(&sctx->samplers[i].states.desc);
 	}
+	si_release_descriptors(&sctx->vertex_buffers);
 }
 
 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@@ -1129,4 +1247,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
 		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
 	}
+	si_vertex_buffers_begin_new_cs(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 94c6c5c..85bef5d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -101,6 +101,7 @@ struct si_context {
 	union {
 		struct {
 			/* The order matters. */
+			struct r600_atom *vertex_buffers;
 			struct r600_atom *const_buffers[SI_NUM_SHADERS];
 			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
 			struct r600_atom *sampler_views[SI_NUM_SHADERS];
@@ -128,9 +129,10 @@ struct si_context {
 	struct si_cs_shader_state	cs_shader_state;
 	/* shader information */
 	unsigned			sprite_coord_enable;
+	struct si_descriptors		vertex_buffers;
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
-	struct si_textures_info	samplers[SI_NUM_SHADERS];
+	struct si_textures_info		samplers[SI_NUM_SHADERS];
 	struct r600_resource		*border_color_table;
 	unsigned			border_color_offset;
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index 082da85..705b226 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -103,37 +103,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
 	state->bo_priority[idx] = priority;
 }
 
-void si_pm4_sh_data_begin(struct si_pm4_state *state)
-{
-	si_pm4_cmd_begin(state, PKT3_NOP);
-}
-
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw)
-{
-	si_pm4_cmd_add(state, dw);
-}
-
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx)
-{
-	unsigned offs = state->last_pm4 + 1;
-	unsigned reg = base + idx * 4;
-
-	/* Bail if no data was added */
-	if (state->ndw == offs) {
-		state->ndw--;
-		return;
-	}
-
-	si_pm4_cmd_end(state, false);
-
-	si_pm4_cmd_begin(state, PKT3_SET_SH_REG_OFFSET);
-	si_pm4_cmd_add(state, (reg - SI_SH_REG_OFFSET) >> 2);
-	state->relocs[state->nrelocs++] = state->ndw;
-	si_pm4_cmd_add(state, offs << 2);
-	si_pm4_cmd_add(state, 0);
-	si_pm4_cmd_end(state, false);
-}
-
 void si_pm4_inval_shader_cache(struct si_pm4_state *state)
 {
 	state->cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index a719586..0702bd4 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -76,10 +76,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
 		   enum radeon_bo_usage usage,
 		   enum radeon_bo_priority priority);
 
-void si_pm4_sh_data_begin(struct si_pm4_state *state);
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw);
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx);
-
 void si_pm4_inval_shader_cache(struct si_pm4_state *state);
 void si_pm4_inval_texture_cache(struct si_pm4_state *state);
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index dc14ab1..dd79cf2 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2818,6 +2818,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 				   S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
 				   S_008F0C_NUM_FORMAT(num_format) |
 				   S_008F0C_DATA_FORMAT(data_format);
+		v->format_size[i] = desc->block.bits / 8;
 	}
 	memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 53a98ed..df4e88b 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -72,6 +72,7 @@ struct si_vertex_element
 {
 	unsigned			count;
 	uint32_t			rsrc_word3[PIPE_MAX_ATTRIBS];
+	uint32_t			format_size[PIPE_MAX_ATTRIBS];
 	struct pipe_vertex_element	elements[PIPE_MAX_ATTRIBS];
 };
 
@@ -97,7 +98,6 @@ union si_state {
 		struct si_pm4_state		*vs;
 		struct si_pm4_state		*ps;
 		struct si_pm4_state		*spi;
-		struct si_pm4_state		*vertex_buffers;
 		struct si_pm4_state		*draw_info;
 		struct si_pm4_state		*draw;
 	} named;
@@ -147,6 +147,7 @@ struct si_descriptors {
 
 	/* The buffer where resource descriptors are stored. */
 	struct r600_resource *buffer;
+	unsigned buffer_offset;
 
 	/* The i-th bit is set if that element is dirty (changed but not emitted). */
 	unsigned dirty_mask;
@@ -221,6 +222,7 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 				unsigned start, unsigned count, void **states);
+void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_constant_buffer *input,
 			unsigned stride, unsigned num_records,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 01564eb..c02494d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -664,68 +664,6 @@ static void si_update_derived_state(struct si_context *sctx)
 	}
 }
 
-static void si_vertex_buffer_update(struct si_context *sctx)
-{
-	struct pipe_context *ctx = &sctx->b.b;
-	struct si_pm4_state *pm4 = si_pm4_alloc_state(sctx);
-	bool bound[PIPE_MAX_ATTRIBS] = {};
-	unsigned i, count;
-	uint64_t va;
-
-	sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
-
-	count = sctx->vertex_elements->count;
-	assert(count <= 256 / 4);
-
-	si_pm4_sh_data_begin(pm4);
-	for (i = 0 ; i < count; i++) {
-		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
-		struct pipe_vertex_buffer *vb;
-		struct r600_resource *rbuffer;
-		unsigned offset;
-
-		if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers)
-			continue;
-
-		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
-		rbuffer = (struct r600_resource*)vb->buffer;
-		if (rbuffer == NULL)
-			continue;
-
-		offset = 0;
-		offset += vb->buffer_offset;
-		offset += ve->src_offset;
-
-		va = r600_resource_va(ctx->screen, (void*)rbuffer);
-		va += offset;
-
-		/* Fill in T# buffer resource description */
-		si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
-		si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
-					 S_008F04_STRIDE(vb->stride)));
-		if (vb->stride)
-			/* Round up by rounding down and adding 1 */
-			si_pm4_sh_data_add(pm4,
-					   (vb->buffer->width0 - offset -
-					    util_format_get_blocksize(ve->src_format)) /
-					   vb->stride + 1);
-		else
-			si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
-		si_pm4_sh_data_add(pm4, sctx->vertex_elements->rsrc_word3[i]);
-
-		if (!bound[ve->vertex_buffer_index]) {
-			si_pm4_add_bo(pm4, rbuffer, RADEON_USAGE_READ,
-				      RADEON_PRIO_SHADER_BUFFER_RO);
-			bound[ve->vertex_buffer_index] = true;
-		}
-	}
-	si_pm4_sh_data_end(pm4, sctx->gs_shader ?
-			   R_00B330_SPI_SHADER_USER_DATA_ES_0 :
-			   R_00B130_SPI_SHADER_USER_DATA_VS_0,
-			   SI_SGPR_VERTEX_BUFFER);
-	si_pm4_set_state(sctx, vertex_buffers, pm4);
-}
-
 static void si_state_draw(struct si_context *sctx,
 			  const struct pipe_draw_info *info,
 			  const struct pipe_index_buffer *ib)
@@ -907,7 +845,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		return;
 
 	si_update_derived_state(sctx);
-	si_vertex_buffer_update(sctx);
+	si_update_vertex_buffers(sctx);
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
-- 
1.9.1



More information about the mesa-dev mailing list