[Mesa-dev] [PATCH 10/10] radeonsi/gfx9: use CE RAM optimally
Nicolai Hähnle
nhaehnle at gmail.com
Thu May 18 09:38:44 UTC 2017
On 17.05.2017 21:38, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> On GFX9 with only 4K CE RAM, define the range of slots that will be
> allocated in CE RAM. All other slots will be uploaded directly. This will
> switch dynamically according to which slots are used by current shaders.
>
> GFX9 CE usage should now be similar to VI instead of being often disabled.
>
> Tested on VI by taking the GFX9 CE allocation codepath and setting
> num_ce_slots = 2 everywhere to get frequent switches between both modes.
> CE is still disabled on GFX9.
> ---
> src/gallium/drivers/radeonsi/si_descriptors.c | 163 ++++++++++++++++++++------
> src/gallium/drivers/radeonsi/si_state.h | 7 ++
> 2 files changed, 134 insertions(+), 36 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index af174d5..5753ae0 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -88,40 +88,45 @@ static uint32_t null_texture_descriptor[8] = {
>
> static uint32_t null_image_descriptor[8] = {
> 0,
> 0,
> 0,
> S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
> /* the rest must contain zeros, which is also used by the buffer
> * descriptor */
> };
>
> -static void si_init_descriptors(struct si_descriptors *desc,
> +static void si_init_descriptors(struct si_context *sctx,
> + struct si_descriptors *desc,
> unsigned shader_userdata_index,
> unsigned element_dw_size,
> unsigned num_elements,
> + unsigned first_ce_slot,
> + unsigned num_ce_slots,
> unsigned *ce_offset)
> {
> assert(num_elements <= sizeof(desc->dirty_mask)*8);
>
> desc->list = CALLOC(num_elements, element_dw_size * 4);
> desc->element_dw_size = element_dw_size;
> desc->num_elements = num_elements;
> + desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
> + desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
> desc->dirty_mask = u_bit_consecutive64(0, num_elements);
> desc->shader_userdata_offset = shader_userdata_index * 4;
>
> - if (ce_offset) {
> + if (desc->num_ce_slots) {
> desc->uses_ce = true;
> desc->ce_offset = *ce_offset;
>
> /* make sure that ce_offset stays 32 byte aligned */
> - *ce_offset += element_dw_size * num_elements * 4;
> + *ce_offset += element_dw_size * desc->num_ce_slots * 4;
> }
> }
>
> static void si_release_descriptors(struct si_descriptors *desc)
> {
> r600_resource_reference(&desc->buffer, NULL);
> FREE(desc->list);
> }
>
> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
> @@ -195,38 +200,43 @@ static bool si_upload_descriptors(struct si_context *sctx,
> struct si_descriptors *desc,
> struct r600_atom * atom)
> {
> unsigned slot_size = desc->element_dw_size * 4;
> unsigned first_slot_offset = desc->first_active_slot * slot_size;
> unsigned upload_size = desc->num_active_slots * slot_size;
>
> if (!upload_size)
> return true;
>
> - if (sctx->ce_ib && desc->uses_ce) {
> - uint32_t const* list = (uint32_t const*)desc->list;
> + if (desc->uses_ce) {
> + const uint32_t *list = desc->list +
> + desc->first_ce_slot * desc->element_dw_size;
> + uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
> + u_bit_consecutive64(0, desc->num_ce_slots);
>
> - while(desc->dirty_mask) {
> +
> + while (mask) {
> int begin, count;
> - u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
> - &count);
> + u_bit_scan_consecutive_range64(&mask, &begin, &count);
>
> begin *= desc->element_dw_size;
> count *= desc->element_dw_size;
>
> radeon_emit(sctx->ce_ib,
> PKT3(PKT3_WRITE_CONST_RAM, count, 0));
> radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
> radeon_emit_array(sctx->ce_ib, list + begin, count);
> }
>
> - if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
> + if (!si_ce_upload(sctx,
> + desc->ce_offset +
> + (first_slot_offset - desc->first_ce_slot * slot_size),
> upload_size, (unsigned*)&desc->buffer_offset,
> &desc->buffer))
> return false;
> } else {
> uint32_t *ptr;
>
> u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
> si_optimal_tcc_alignment(sctx, upload_size),
> (unsigned*)&desc->buffer_offset,
> (struct pipe_resource**)&desc->buffer,
> @@ -910,38 +920,41 @@ static void si_bind_sampler_states(struct pipe_context *ctx,
> continue;
>
> memcpy(desc->list + desc_slot * 16 + 12, sstates[i]->val, 4*4);
> desc->dirty_mask |= 1ull << desc_slot;
> sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
> }
> }
>
> /* BUFFER RESOURCES */
>
> -static void si_init_buffer_resources(struct si_buffer_resources *buffers,
> +static void si_init_buffer_resources(struct si_context *sctx,
> + struct si_buffer_resources *buffers,
> struct si_descriptors *descs,
> unsigned num_buffers,
> + unsigned first_ce_slot,
> + unsigned num_ce_slots,
> unsigned shader_userdata_index,
> enum radeon_bo_usage shader_usage,
> enum radeon_bo_usage shader_usage_constbuf,
> enum radeon_bo_priority priority,
> enum radeon_bo_priority priority_constbuf,
> unsigned *ce_offset)
> {
> buffers->shader_usage = shader_usage;
> buffers->shader_usage_constbuf = shader_usage_constbuf;
> buffers->priority = priority;
> buffers->priority_constbuf = priority_constbuf;
> buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
>
> - si_init_descriptors(descs, shader_userdata_index, 4,
> - num_buffers, ce_offset);
> + si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
> + first_ce_slot, num_ce_slots, ce_offset);
> }
>
> static void si_release_buffer_resources(struct si_buffer_resources *buffers,
> struct si_descriptors *descs)
> {
> int i;
>
> for (i = 0; i < descs->num_elements; i++) {
> pipe_resource_reference(&buffers->buffers[i], NULL);
> }
> @@ -1983,82 +1996,140 @@ void si_emit_compute_shader_userdata(struct si_context *sctx)
> while (mask) {
> unsigned i = u_bit_scan(&mask);
>
> si_emit_shader_pointer(sctx, descs + i, base);
> }
> sctx->shader_pointers_dirty &= ~compute_mask;
> }
>
> /* INIT/DEINIT/UPLOAD */
>
> +/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
> + * to make CE RAM as useful as possible, this defines limits
> + * for the number of slots that can be in CE RAM on GFX9. If a shader
> + * is using more, descriptors will be uploaded to memory directly and
> + * CE won't be used.
> + *
> + * These numbers are based on shader-db.
> + */
> +static unsigned gfx9_max_samplers[SI_NUM_SHADERS] = {
> + [PIPE_SHADER_VERTEX] = 0,
> + [PIPE_SHADER_TESS_CTRL] = 0,
> + [PIPE_SHADER_TESS_EVAL] = 1,
> + [PIPE_SHADER_GEOMETRY] = 0,
> + [PIPE_SHADER_FRAGMENT] = 24,
> + [PIPE_SHADER_COMPUTE] = 16,
> +};
> +static unsigned gfx9_max_images[SI_NUM_SHADERS] = {
> + /* these must be even due to slot alignment */
> + [PIPE_SHADER_VERTEX] = 0,
> + [PIPE_SHADER_TESS_CTRL] = 0,
> + [PIPE_SHADER_TESS_EVAL] = 0,
> + [PIPE_SHADER_GEOMETRY] = 0,
> + [PIPE_SHADER_FRAGMENT] = 2,
> + [PIPE_SHADER_COMPUTE] = 8,
> +};
> +static unsigned gfx9_max_const_buffers[SI_NUM_SHADERS] = {
> + [PIPE_SHADER_VERTEX] = 9,
> + [PIPE_SHADER_TESS_CTRL] = 3,
> + [PIPE_SHADER_TESS_EVAL] = 5,
> + [PIPE_SHADER_GEOMETRY] = 0,
> + [PIPE_SHADER_FRAGMENT] = 8,
> + [PIPE_SHADER_COMPUTE] = 6,
> +};
> +static unsigned gfx9_max_shader_buffers[SI_NUM_SHADERS] = {
> + [PIPE_SHADER_VERTEX] = 0,
> + [PIPE_SHADER_TESS_CTRL] = 0,
> + [PIPE_SHADER_TESS_EVAL] = 0,
> + [PIPE_SHADER_GEOMETRY] = 0,
> + [PIPE_SHADER_FRAGMENT] = 12,
> + [PIPE_SHADER_COMPUTE] = 13,
> +};
Please rename these to gfx9_max_ce_*
Apart from that:
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> +
> void si_init_all_descriptors(struct si_context *sctx)
> {
> int i;
> unsigned ce_offset = 0;
>
> STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
> STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
>
> for (i = 0; i < SI_NUM_SHADERS; i++) {
> - bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
> - i == PIPE_SHADER_TESS_CTRL;
> - bool gfx9_gs = sctx->b.chip_class == GFX9 &&
> - i == PIPE_SHADER_GEOMETRY;
> - /* GFX9 has only 4KB of CE, while previous chips had 32KB.
> - * Rarely used descriptors don't use CE RAM.
> - */
> - bool big_ce = sctx->b.chip_class <= VI;
> - bool const_and_shaderbufs_use_ce = big_ce ||
> - i == PIPE_SHADER_VERTEX ||
> - i == PIPE_SHADER_FRAGMENT;
> - bool samplers_and_images_use_ce = big_ce ||
> - i == PIPE_SHADER_FRAGMENT;
> -
> - si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
> + bool gfx9_tcs = false;
> + bool gfx9_gs = false;
> + unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
> + unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
> +
> + unsigned first_sampler_ce_slot = 0;
> + unsigned num_sampler_ce_slots = num_sampler_slots;
> +
> + unsigned first_buffer_ce_slot = 0;
> + unsigned num_buffer_ce_slots = num_buffer_slots;
> +
> + /* Adjust CE slot ranges based on GFX9 CE RAM limits. */
> + if (sctx->b.chip_class >= GFX9) {
> + gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
> + gfx9_gs = i == PIPE_SHADER_GEOMETRY;
> +
> + first_sampler_ce_slot =
> + si_get_image_slot(gfx9_max_images[i] - 1) / 2;
> + num_sampler_ce_slots = gfx9_max_images[i] / 2 +
> + gfx9_max_samplers[i];
> +
> + first_buffer_ce_slot =
> + si_get_shaderbuf_slot(gfx9_max_shader_buffers[i] - 1);
> + num_buffer_ce_slots = gfx9_max_shader_buffers[i] +
> + gfx9_max_const_buffers[i];
> + }
> +
> + si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
> si_const_and_shader_buffer_descriptors(sctx, i),
> - SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
> + num_buffer_slots,
> + first_buffer_ce_slot, num_buffer_ce_slots,
> gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
> gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
> SI_SGPR_CONST_AND_SHADER_BUFFERS,
> RADEON_USAGE_READWRITE,
> RADEON_USAGE_READ,
> RADEON_PRIO_SHADER_RW_BUFFER,
> RADEON_PRIO_CONST_BUFFER,
> - const_and_shaderbufs_use_ce ? &ce_offset : NULL);
> + &ce_offset);
>
> struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
> - si_init_descriptors(desc,
> + si_init_descriptors(sctx, desc,
> gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
> gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
> SI_SGPR_SAMPLERS_AND_IMAGES,
> - 16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS,
> - samplers_and_images_use_ce ? &ce_offset : NULL);
> + 16, num_sampler_slots,
> + first_sampler_ce_slot, num_sampler_ce_slots,
> + &ce_offset);
>
> int j;
> for (j = 0; j < SI_NUM_IMAGES; j++)
> memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
> for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
> memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
> }
>
> - si_init_buffer_resources(&sctx->rw_buffers,
> + si_init_buffer_resources(sctx, &sctx->rw_buffers,
> &sctx->descriptors[SI_DESCS_RW_BUFFERS],
> - SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
> + SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
> + SI_SGPR_RW_BUFFERS,
> /* The second set of usage/priority is used by
> * const buffers in RW buffer slots. */
> RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
> RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
> &ce_offset);
> sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
>
> - si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
> - 4, SI_NUM_VERTEX_BUFFERS, NULL);
> + si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
> + 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
>
> sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
> sctx->total_ce_ram_allocated = ce_offset;
>
> if (sctx->b.chip_class >= GFX9)
> assert(ce_offset <= 4096);
> else
> assert(ce_offset <= 32768);
>
> /* Set pipe_context functions. */
> @@ -2186,20 +2257,40 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
>
> int first, count;
> u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
> assert(new_active_mask == 0);
>
> /* Upload/dump descriptors if slots are being enabled. */
> if (first < desc->first_active_slot ||
> first + count > desc->first_active_slot + desc->num_active_slots)
> sctx->descriptors_dirty |= 1u << desc_idx;
>
> + /* Enable or disable CE for this descriptor array. */
> + bool used_ce = desc->uses_ce;
> + desc->uses_ce = desc->first_ce_slot <= first &&
> + desc->first_ce_slot + desc->num_ce_slots >= first + count;
> +
> + if (desc->uses_ce != used_ce) {
> + /* Upload or dump descriptors if we're disabling or enabling CE,
> + * respectively. */
> + sctx->descriptors_dirty |= 1u << desc_idx;
> +
> + /* If we're enabling CE, re-upload all descriptors to CE RAM.
> + * When CE was disabled, uploads to CE RAM stopped.
> + */
> + if (desc->uses_ce) {
> + desc->dirty_mask |=
> + u_bit_consecutive64(desc->first_ce_slot,
> + desc->num_ce_slots);
> + }
> + }
> +
> desc->first_active_slot = first;
> desc->num_active_slots = count;
> }
>
> void si_set_active_descriptors_for_shader(struct si_context *sctx,
> struct si_shader_selector *sel)
> {
> if (!sel)
> return;
>
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index dfabaa3..275f830 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -221,20 +221,27 @@ struct si_descriptors {
> /* The maximum number of descriptors. */
> unsigned num_elements;
>
> /* The buffer where the descriptors have been uploaded. */
> struct r600_resource *buffer;
> int buffer_offset; /* can be negative if not using lower slots */
>
> /* Offset in CE RAM */
> unsigned ce_offset;
>
> + /* Slots allocated in CE RAM. If we get active slots outside of this
> + * range, direct uploads to memory will be used instead. This basically
> + * governs switching between onchip (CE) and offchip (upload) modes.
> + */
> + unsigned first_ce_slot;
> + unsigned num_ce_slots;
> +
> /* Slots that are used by currently-bound shaders.
> * With CE: It determines which slots are dumped to L2.
> * It doesn't skip uploads to CE RAM.
> * Without CE: It determines which slots are uploaded.
> */
> unsigned first_active_slot;
> unsigned num_active_slots;
>
> /* Slots that have been changed and need to be uploaded. */
> uint64_t dirty_mask;
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list