[Mesa-dev] [PATCH 4/6] radeonsi: move vertex buffer descriptors from IB to memory
Marek Olšák
maraeo at gmail.com
Sat Jul 12 09:35:35 PDT 2014
From: Marek Olšák <marek.olsak at amd.com>
This removes the intermediate storage (pm4 state) and generates descriptors
directly in a staging buffer.
It also reduces the number of flushes, because the descriptors no longer
take CS space.
---
src/gallium/drivers/radeonsi/si_descriptors.c | 131 ++++++++++++++++++++++++--
src/gallium/drivers/radeonsi/si_pipe.h | 4 +-
src/gallium/drivers/radeonsi/si_pm4.c | 31 ------
src/gallium/drivers/radeonsi/si_pm4.h | 4 -
src/gallium/drivers/radeonsi/si_state.c | 1 +
src/gallium/drivers/radeonsi/si_state.h | 4 +-
src/gallium/drivers/radeonsi/si_state_draw.c | 64 +------------
7 files changed, 133 insertions(+), 106 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a4e14a1..18cc947 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -166,11 +166,13 @@ static void si_update_descriptors(struct si_context *sctx,
}
static void si_emit_shader_pointer(struct si_context *sctx,
- struct si_descriptors *desc)
+ struct r600_atom *atom)
{
+ struct si_descriptors *desc = (struct si_descriptors*)atom;
struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
uint64_t va = r600_resource_va(sctx->b.b.screen, &desc->buffer->b.b) +
- desc->current_context_id * desc->context_size;
+ desc->current_context_id * desc->context_size +
+ desc->buffer_offset;
radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
@@ -255,7 +257,7 @@ static void si_emit_descriptors(struct si_context *sctx,
desc->current_context_id = new_context_id;
/* Now update the shader userdata pointer. */
- si_emit_shader_pointer(sctx, desc);
+ si_emit_shader_pointer(sctx, &desc->atom);
}
static unsigned si_get_shader_user_data_base(unsigned shader)
@@ -332,7 +334,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
- si_emit_shader_pointer(sctx, &views->desc);
+ si_emit_shader_pointer(sctx, &views->desc.atom);
}
static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -434,7 +436,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
{
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
- si_emit_shader_pointer(sctx, &states->desc);
+ si_emit_shader_pointer(sctx, &states->desc.atom);
}
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -535,9 +537,119 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
buffers->desc.buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_SHADER_DATA);
- si_emit_shader_pointer(sctx, &buffers->desc);
+ si_emit_shader_pointer(sctx, &buffers->desc.atom);
}
+/* VERTEX BUFFERS */
+
+/* Called when a new command stream is started: adds a relocation for each
+ * vertex buffer referenced by the bound vertex elements, adds one for the
+ * descriptor buffer itself, and then emits the shader pointer again.
+ */
+static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
+{
+	struct si_descriptors *descs = &sctx->vertex_buffers;
+	int num_elems = 0;
+	int e;
+
+	/* No vertex elements state bound means there is nothing to walk. */
+	if (sctx->vertex_elements)
+		num_elems = sctx->vertex_elements->count;
+
+	for (e = 0; e < num_elems; e++) {
+		int index = sctx->vertex_elements->elements[e].vertex_buffer_index;
+		struct r600_resource *res;
+
+		/* Skip elements pointing past the bound range or at an empty slot. */
+		if (index >= sctx->nr_vertex_buffers)
+			continue;
+
+		res = (struct r600_resource*)sctx->vertex_buffer[index].buffer;
+		if (!res)
+			continue;
+
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, res,
+				      RADEON_USAGE_READ,
+				      RADEON_PRIO_SHADER_BUFFER_RO);
+	}
+
+	/* The buffer holding the descriptors must be referenced as well. */
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, descs->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	si_emit_shader_pointer(sctx, &descs->atom);
+}
+
+/* Upload fresh buffer resource descriptors for the bound vertex elements.
+ *
+ * One 4-dword (16-byte) descriptor is written per vertex element into space
+ * obtained from u_upload_alloc. Elements whose vertex buffer index is out of
+ * range, or whose buffer is NULL, get an all-zero descriptor. Every vertex
+ * buffer that is referenced is added to the gfx CS once, the descriptor atom
+ * is marked dirty, and a texture cache invalidation is requested.
+ *
+ * Returns without doing anything when no vertex elements state is bound or
+ * when it contains no elements.
+ */
+void si_update_vertex_buffers(struct si_context *sctx)
+{
+	struct pipe_context *ctx = &sctx->b.b;
+	struct si_descriptors *desc = &sctx->vertex_buffers;
+	bool bound[SI_NUM_VERTEX_BUFFERS] = {0};
+	unsigned i, count;
+	uint64_t va;
+	uint32_t *ptr;
+
+	/* Test the pointer before reading the element count through it. */
+	if (!sctx->vertex_elements)
+		return;
+
+	count = sctx->vertex_elements->count;
+	if (!count)
+		return;
+
+	/* Validate the count before it is used to size the allocation. */
+	assert(count <= SI_NUM_VERTEX_BUFFERS);
+	assert(desc->current_context_id == 0);
+
+	/* Vertex buffer descriptors are the only ones which are uploaded
+	 * directly through a staging buffer and don't go through
+	 * the fine-grained upload path.
+	 */
+	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+			      desc->buffer, RADEON_USAGE_READ,
+			      RADEON_PRIO_SHADER_DATA);
+
+	for (i = 0; i < count; i++) {
+		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
+		struct pipe_vertex_buffer *vb;
+		struct r600_resource *rbuffer;
+		unsigned offset;
+		/* Destination of this element's 4-dword descriptor. Named
+		 * differently from "desc" so that it does not shadow the
+		 * descriptor state used after the loop.
+		 */
+		uint32_t *out = &ptr[i*4];
+
+		if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers) {
+			memset(out, 0, 16);
+			continue;
+		}
+
+		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
+		rbuffer = (struct r600_resource*)vb->buffer;
+		if (rbuffer == NULL) {
+			memset(out, 0, 16);
+			continue;
+		}
+
+		offset = vb->buffer_offset + ve->src_offset;
+
+		va = r600_resource_va(ctx->screen, (void*)rbuffer);
+		va += offset;
+
+		/* Fill in T# buffer resource description */
+		out[0] = va & 0xFFFFFFFF;
+		out[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
+			 S_008F04_STRIDE(vb->stride);
+		if (vb->stride)
+			/* Round up by rounding down and adding 1 */
+			out[2] = (vb->buffer->width0 - offset -
+				  sctx->vertex_elements->format_size[i]) /
+				 vb->stride + 1;
+		else
+			out[2] = vb->buffer->width0 - offset;
+
+		out[3] = sctx->vertex_elements->rsrc_word3[i];
+
+		/* Add each vertex buffer to the CS only once, even when
+		 * several elements read from it.
+		 */
+		if (!bound[ve->vertex_buffer_index]) {
+			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+					      (struct r600_resource*)vb->buffer,
+					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
+			bound[ve->vertex_buffer_index] = true;
+		}
+	}
+
+	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
+	desc->atom.dirty = true;
+
+	/* Don't flush the const cache. It would have a very negative effect
+	 * on performance (confirmed by testing). New descriptors are always
+	 * uploaded to a fresh new buffer, so I don't think flushing the const
+	 * cache is needed. */
+	sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
+}
+
+
/* CONSTANT BUFFERS */
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
@@ -1098,6 +1210,11 @@ void si_init_all_descriptors(struct si_context *sctx)
sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
}
+ si_init_descriptors(sctx, &sctx->vertex_buffers,
+ si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
+ SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
+ si_emit_shader_pointer);
+ sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
/* Set pipe_context functions. */
sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1117,6 +1234,7 @@ void si_release_all_descriptors(struct si_context *sctx)
si_release_sampler_views(&sctx->samplers[i].views);
si_release_descriptors(&sctx->samplers[i].states.desc);
}
+ si_release_descriptors(&sctx->vertex_buffers);
}
void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@@ -1129,4 +1247,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
}
+ si_vertex_buffers_begin_new_cs(sctx);
}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 94c6c5c..85bef5d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -101,6 +101,7 @@ struct si_context {
union {
struct {
/* The order matters. */
+ struct r600_atom *vertex_buffers;
struct r600_atom *const_buffers[SI_NUM_SHADERS];
struct r600_atom *rw_buffers[SI_NUM_SHADERS];
struct r600_atom *sampler_views[SI_NUM_SHADERS];
@@ -128,9 +129,10 @@ struct si_context {
struct si_cs_shader_state cs_shader_state;
/* shader information */
unsigned sprite_coord_enable;
+ struct si_descriptors vertex_buffers;
struct si_buffer_resources const_buffers[SI_NUM_SHADERS];
struct si_buffer_resources rw_buffers[SI_NUM_SHADERS];
- struct si_textures_info samplers[SI_NUM_SHADERS];
+ struct si_textures_info samplers[SI_NUM_SHADERS];
struct r600_resource *border_color_table;
unsigned border_color_offset;
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index 082da85..705b226 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -103,37 +103,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
state->bo_priority[idx] = priority;
}
-void si_pm4_sh_data_begin(struct si_pm4_state *state)
-{
- si_pm4_cmd_begin(state, PKT3_NOP);
-}
-
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw)
-{
- si_pm4_cmd_add(state, dw);
-}
-
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx)
-{
- unsigned offs = state->last_pm4 + 1;
- unsigned reg = base + idx * 4;
-
- /* Bail if no data was added */
- if (state->ndw == offs) {
- state->ndw--;
- return;
- }
-
- si_pm4_cmd_end(state, false);
-
- si_pm4_cmd_begin(state, PKT3_SET_SH_REG_OFFSET);
- si_pm4_cmd_add(state, (reg - SI_SH_REG_OFFSET) >> 2);
- state->relocs[state->nrelocs++] = state->ndw;
- si_pm4_cmd_add(state, offs << 2);
- si_pm4_cmd_add(state, 0);
- si_pm4_cmd_end(state, false);
-}
-
void si_pm4_inval_shader_cache(struct si_pm4_state *state)
{
state->cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index a719586..0702bd4 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -76,10 +76,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
enum radeon_bo_usage usage,
enum radeon_bo_priority priority);
-void si_pm4_sh_data_begin(struct si_pm4_state *state);
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw);
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx);
-
void si_pm4_inval_shader_cache(struct si_pm4_state *state);
void si_pm4_inval_texture_cache(struct si_pm4_state *state);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index dc14ab1..dd79cf2 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2818,6 +2818,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
+ v->format_size[i] = desc->block.bits / 8;
}
memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 53a98ed..df4e88b 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -72,6 +72,7 @@ struct si_vertex_element
{
unsigned count;
uint32_t rsrc_word3[PIPE_MAX_ATTRIBS];
+ uint32_t format_size[PIPE_MAX_ATTRIBS];
struct pipe_vertex_element elements[PIPE_MAX_ATTRIBS];
};
@@ -97,7 +98,6 @@ union si_state {
struct si_pm4_state *vs;
struct si_pm4_state *ps;
struct si_pm4_state *spi;
- struct si_pm4_state *vertex_buffers;
struct si_pm4_state *draw_info;
struct si_pm4_state *draw;
} named;
@@ -147,6 +147,7 @@ struct si_descriptors {
/* The buffer where resource descriptors are stored. */
struct r600_resource *buffer;
+ unsigned buffer_offset;
/* The i-th bit is set if that element is dirty (changed but not emitted). */
unsigned dirty_mask;
@@ -221,6 +222,7 @@ struct si_buffer_resources {
/* si_descriptors.c */
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
unsigned start, unsigned count, void **states);
+void si_update_vertex_buffers(struct si_context *sctx);
void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
struct pipe_constant_buffer *input,
unsigned stride, unsigned num_records,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 01564eb..c02494d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -664,68 +664,6 @@ static void si_update_derived_state(struct si_context *sctx)
}
}
-static void si_vertex_buffer_update(struct si_context *sctx)
-{
- struct pipe_context *ctx = &sctx->b.b;
- struct si_pm4_state *pm4 = si_pm4_alloc_state(sctx);
- bool bound[PIPE_MAX_ATTRIBS] = {};
- unsigned i, count;
- uint64_t va;
-
- sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
-
- count = sctx->vertex_elements->count;
- assert(count <= 256 / 4);
-
- si_pm4_sh_data_begin(pm4);
- for (i = 0 ; i < count; i++) {
- struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
- struct pipe_vertex_buffer *vb;
- struct r600_resource *rbuffer;
- unsigned offset;
-
- if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers)
- continue;
-
- vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
- rbuffer = (struct r600_resource*)vb->buffer;
- if (rbuffer == NULL)
- continue;
-
- offset = 0;
- offset += vb->buffer_offset;
- offset += ve->src_offset;
-
- va = r600_resource_va(ctx->screen, (void*)rbuffer);
- va += offset;
-
- /* Fill in T# buffer resource description */
- si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
- si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(vb->stride)));
- if (vb->stride)
- /* Round up by rounding down and adding 1 */
- si_pm4_sh_data_add(pm4,
- (vb->buffer->width0 - offset -
- util_format_get_blocksize(ve->src_format)) /
- vb->stride + 1);
- else
- si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
- si_pm4_sh_data_add(pm4, sctx->vertex_elements->rsrc_word3[i]);
-
- if (!bound[ve->vertex_buffer_index]) {
- si_pm4_add_bo(pm4, rbuffer, RADEON_USAGE_READ,
- RADEON_PRIO_SHADER_BUFFER_RO);
- bound[ve->vertex_buffer_index] = true;
- }
- }
- si_pm4_sh_data_end(pm4, sctx->gs_shader ?
- R_00B330_SPI_SHADER_USER_DATA_ES_0 :
- R_00B130_SPI_SHADER_USER_DATA_VS_0,
- SI_SGPR_VERTEX_BUFFER);
- si_pm4_set_state(sctx, vertex_buffers, pm4);
-}
-
static void si_state_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_index_buffer *ib)
@@ -907,7 +845,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
return;
si_update_derived_state(sctx);
- si_vertex_buffer_update(sctx);
+ si_update_vertex_buffers(sctx);
if (info->indexed) {
/* Initialize the index buffer struct. */
--
1.9.1
More information about the mesa-dev
mailing list