[Mesa-dev] [PATCH 08/25] radv: add tessellation ring allocation support.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Fri Mar 31 06:59:55 UTC 2017
On Thu, Mar 30, 2017 at 10:00 AM, Dave Airlie <airlied at gmail.com> wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> This patch adds support for the offchip rings for storing
> tessellation factors and attribute data.
>
> It includes the register setup for the TF ring
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/vulkan/radv_cmd_buffer.c | 6 ++
> src/amd/vulkan/radv_device.c | 210 ++++++++++++++++++++++++++++++++++++---
> src/amd/vulkan/radv_private.h | 4 +
> 3 files changed, 207 insertions(+), 13 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index dbd74de..70f6fad 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -221,6 +221,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
> cmd_buffer->compute_scratch_size_needed = 0;
> cmd_buffer->esgs_ring_size_needed = 0;
> cmd_buffer->gsvs_ring_size_needed = 0;
> + cmd_buffer->tess_rings_needed = false;
>
> if (cmd_buffer->upload.upload_bo)
> cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> @@ -1896,6 +1897,9 @@ void radv_CmdBindPipeline(
> if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
> cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
>
> + if (radv_pipeline_has_tess(pipeline))
> + cmd_buffer->tess_rings_needed = true;
> +
> if (radv_pipeline_has_gs(pipeline)) {
> struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
> AC_UD_SCRATCH_RING_OFFSETS);
> @@ -2063,6 +2067,8 @@ void radv_CmdExecuteCommands(
> primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
> if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
> primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
> + if (secondary->tess_rings_needed)
> + primary->tess_rings_needed = true;
>
> if (secondary->ring_offsets_idx != -1) {
> if (primary->ring_offsets_idx == -1)
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index fe531e1..b75d76b 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -845,6 +845,10 @@ radv_queue_finish(struct radv_queue *queue)
> queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
> if (queue->gsvs_ring_bo)
> queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
> + if (queue->tess_factor_ring_bo)
> + queue->device->ws->buffer_destroy(queue->tess_factor_ring_bo);
> + if (queue->tess_offchip_ring_bo)
> + queue->device->ws->buffer_destroy(queue->tess_offchip_ring_bo);
> if (queue->compute_scratch_bo)
> queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
> }
> @@ -1182,20 +1186,29 @@ static void radv_dump_trace(struct radv_device *device,
> }
>
> static void
> -fill_geom_rings(struct radv_queue *queue,
> - uint32_t *map,
> - uint32_t esgs_ring_size,
> - struct radeon_winsys_bo *esgs_ring_bo,
> - uint32_t gsvs_ring_size,
> - struct radeon_winsys_bo *gsvs_ring_bo)
> +fill_geom_tess_rings(struct radv_queue *queue,
> + uint32_t *map,
> + uint32_t esgs_ring_size,
> + struct radeon_winsys_bo *esgs_ring_bo,
> + uint32_t gsvs_ring_size,
> + struct radeon_winsys_bo *gsvs_ring_bo,
> + uint32_t tess_factor_ring_size,
> + struct radeon_winsys_bo *tess_factor_ring_bo,
> + uint32_t tess_offchip_ring_size,
> + struct radeon_winsys_bo *tess_offchip_ring_bo)
> {
> uint64_t esgs_va = 0, gsvs_va = 0;
> + uint64_t tess_factor_va = 0, tess_offchip_va = 0;
> uint32_t *desc = &map[4];
>
> if (esgs_ring_bo)
> esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
> if (gsvs_ring_bo)
> gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
> + if (tess_factor_ring_bo)
> + tess_factor_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
> + if (tess_offchip_ring_bo)
> + tess_offchip_va = queue->device->ws->buffer_get_va(tess_offchip_ring_bo);
>
> /* stride 0, num records - size, add tid, swizzle, elsize4,
> index stride 64 */
> @@ -1270,6 +1283,88 @@ fill_geom_rings(struct radv_queue *queue,
> S_008F0C_ELEMENT_SIZE(1) |
> S_008F0C_INDEX_STRIDE(1) |
> S_008F0C_ADD_TID_ENABLE(true);
> + desc += 4;
> +
> + desc[0] = tess_factor_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_factor_va >> 32) |
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(false);
> + desc[2] = tess_factor_ring_size;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(0) |
> + S_008F0C_INDEX_STRIDE(0) |
> + S_008F0C_ADD_TID_ENABLE(false);
> + desc += 4;
> +
> + desc[0] = tess_offchip_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32) |
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(false);
> + desc[2] = tess_offchip_ring_size;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(0) |
> + S_008F0C_INDEX_STRIDE(0) |
> + S_008F0C_ADD_TID_ENABLE(false);
> +}
> +
> +static unsigned
> +radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buffers_p)
> +{
> + bool double_offchip_buffers = device->physical_device->rad_info.chip_class >= CIK &&
> + device->physical_device->rad_info.family != CHIP_CARRIZO &&
> + device->physical_device->rad_info.family != CHIP_STONEY;
> + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
> + unsigned max_offchip_buffers = max_offchip_buffers_per_se *
> + device->physical_device->rad_info.max_se;
> + unsigned offchip_granularity;
> + unsigned hs_offchip_param;
> + switch (device->tess_offchip_block_dw_size) {
> + default:
> + assert(0);
> + /* fall through */
> + case 8192:
> + offchip_granularity = V_03093C_X_8K_DWORDS;
> + break;
> + case 4096:
> + offchip_granularity = V_03093C_X_4K_DWORDS;
> + break;
> + }
> +
> + switch (device->physical_device->rad_info.chip_class) {
> + case SI:
> + max_offchip_buffers = MIN2(max_offchip_buffers, 126);
> + break;
> + case CIK:
> + max_offchip_buffers = MIN2(max_offchip_buffers, 508);
> + break;
> + case VI:
> + default:
> + max_offchip_buffers = MIN2(max_offchip_buffers, 512);
> + break;
> + }
> +
> + *max_offchip_buffers_p = max_offchip_buffers;
> + if (device->physical_device->rad_info.chip_class >= CIK) {
> + if (device->physical_device->rad_info.chip_class >= VI)
> + --max_offchip_buffers;
> + hs_offchip_param =
> + S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
> + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
> + } else {
> + hs_offchip_param =
> + S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
> + }
> + return hs_offchip_param;
> }
>
> static VkResult
> @@ -1278,6 +1373,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
> uint32_t compute_scratch_size,
> uint32_t esgs_ring_size,
> uint32_t gsvs_ring_size,
> + bool needs_tess_rings,
> struct radeon_winsys_cs **initial_preamble_cs,
> struct radeon_winsys_cs **continue_preamble_cs)
> {
> @@ -1286,12 +1382,32 @@ radv_get_preamble_cs(struct radv_queue *queue,
> struct radeon_winsys_bo *compute_scratch_bo = NULL;
> struct radeon_winsys_bo *esgs_ring_bo = NULL;
> struct radeon_winsys_bo *gsvs_ring_bo = NULL;
> + struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
> + struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
> struct radeon_winsys_cs *dest_cs[2] = {0};
> + bool add_tess_rings = false;
> + unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
> + unsigned max_offchip_buffers;
> + unsigned hs_offchip_param = 0;
> + if (!queue->has_tess_rings) {
> + if (needs_tess_rings)
> + add_tess_rings = true;
> + }
> +
> + if (add_tess_rings) {
> + tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
> + hs_offchip_param = radv_get_hs_offchip_param(queue->device,
> + &max_offchip_buffers);
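This only sets hs_offchip_param (and the tess ring sizes) when the tess
rings are newly added. We also need them when the tess rings already
exist but the preamble is re-emitted for other reasons (e.g. a larger
scratch or ESGS/GSVS ring); otherwise the re-emitted preamble writes 0
to VGT_HS_OFFCHIP_PARAM and VGT_TF_RING_SIZE.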
> + tess_offchip_ring_size = max_offchip_buffers *
> + queue->device->tess_offchip_block_dw_size * 4;
> +
> + }
>
> if (scratch_size <= queue->scratch_size &&
> compute_scratch_size <= queue->compute_scratch_size &&
> esgs_ring_size <= queue->esgs_ring_size &&
> gsvs_ring_size <= queue->gsvs_ring_size &&
> + !add_tess_rings &&
> queue->initial_preamble_cs) {
> *initial_preamble_cs = queue->initial_preamble_cs;
> *continue_preamble_cs = queue->continue_preamble_cs;
> @@ -1349,12 +1465,35 @@ radv_get_preamble_cs(struct radv_queue *queue,
> gsvs_ring_size = queue->gsvs_ring_size;
> }
>
> + if (add_tess_rings) {
> + tess_factor_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
> + tess_factor_ring_size,
> + 256,
> + RADEON_DOMAIN_VRAM,
> + RADEON_FLAG_NO_CPU_ACCESS);
> + if (!tess_factor_ring_bo)
> + goto fail;
> + tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
> + tess_offchip_ring_size,
> + 256,
> + RADEON_DOMAIN_VRAM,
> + RADEON_FLAG_NO_CPU_ACCESS);
> + if (!tess_offchip_ring_bo)
> + goto fail;
> + } else {
> + tess_factor_ring_bo = queue->tess_factor_ring_bo;
> + tess_offchip_ring_bo = queue->tess_offchip_ring_bo;
> + }
> +
> if (scratch_bo != queue->scratch_bo ||
> esgs_ring_bo != queue->esgs_ring_bo ||
> - gsvs_ring_bo != queue->gsvs_ring_bo) {
> + gsvs_ring_bo != queue->gsvs_ring_bo ||
> + tess_factor_ring_bo != queue->tess_factor_ring_bo ||
> + tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
> uint32_t size = 0;
> - if (gsvs_ring_bo || esgs_ring_bo)
> - size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
> + if (gsvs_ring_bo || esgs_ring_bo ||
> + tess_factor_ring_bo || tess_offchip_ring_bo)
> + size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
> else if (scratch_bo)
> size = 8; /* 2 dword */
>
> @@ -1386,6 +1525,12 @@ radv_get_preamble_cs(struct radv_queue *queue,
> if (gsvs_ring_bo)
> queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
>
> + if (tess_factor_ring_bo)
> + queue->device->ws->cs_add_buffer(cs, tess_factor_ring_bo, 8);
> +
> + if (tess_offchip_ring_bo)
> + queue->device->ws->cs_add_buffer(cs, tess_offchip_ring_bo, 8);
> +
> if (descriptor_bo)
> queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
>
> @@ -1400,18 +1545,24 @@ radv_get_preamble_cs(struct radv_queue *queue,
> map[1] = rsrc1;
> }
>
> - if (esgs_ring_bo || gsvs_ring_bo)
> - fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
> + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo)
> + fill_geom_tess_rings(queue, map,
> + esgs_ring_size, esgs_ring_bo,
> + gsvs_ring_size, gsvs_ring_bo,
> + tess_factor_ring_size, tess_factor_ring_bo,
> + tess_offchip_ring_size, tess_offchip_ring_bo);
>
> queue->device->ws->buffer_unmap(descriptor_bo);
> }
>
> - if (esgs_ring_bo || gsvs_ring_bo) {
> + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) {
> radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
> radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
> + }
>
> + if (esgs_ring_bo || gsvs_ring_bo) {
> if (queue->device->physical_device->rad_info.chip_class >= CIK) {
> radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
> radeon_emit(cs, esgs_ring_size >> 8);
> @@ -1423,6 +1574,24 @@ radv_get_preamble_cs(struct radv_queue *queue,
> }
> }
>
> + if (tess_factor_ring_bo) {
> + uint64_t tf_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
> + if (queue->device->physical_device->rad_info.chip_class >= CIK) {
> + radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
> + S_030938_SIZE(tess_factor_ring_size / 4));
> + radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
> + tf_va >> 8);
> + radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
> + } else {
> + radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
> + S_008988_SIZE(tess_factor_ring_size / 4));
> + radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
> + tf_va >> 8);
> + radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
> + hs_offchip_param);
> + }
> + }
> +
> if (descriptor_bo) {
> uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
> R_00B130_SPI_SHADER_USER_DATA_VS_0,
> @@ -1504,6 +1673,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
> queue->gsvs_ring_size = gsvs_ring_size;
> }
>
> + if (tess_factor_ring_bo != queue->tess_factor_ring_bo) {
> + queue->tess_factor_ring_bo = tess_factor_ring_bo;
> + }
> +
> + if (tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
> + queue->tess_offchip_ring_bo = tess_offchip_ring_bo;
> + queue->has_tess_rings = true;
> + }
> +
> if (descriptor_bo != queue->descriptor_bo) {
> if (queue->descriptor_bo)
> queue->device->ws->buffer_destroy(queue->descriptor_bo);
> @@ -1530,6 +1708,10 @@ fail:
> queue->device->ws->buffer_destroy(esgs_ring_bo);
> if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
> queue->device->ws->buffer_destroy(gsvs_ring_bo);
> + if (tess_factor_ring_bo && tess_factor_ring_bo != queue->tess_factor_ring_bo)
> + queue->device->ws->buffer_destroy(tess_factor_ring_bo);
> + if (tess_offchip_ring_bo && tess_offchip_ring_bo != queue->tess_offchip_ring_bo)
> + queue->device->ws->buffer_destroy(tess_offchip_ring_bo);
> return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> }
>
> @@ -1551,6 +1733,7 @@ VkResult radv_QueueSubmit(
> struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
> VkResult result;
> bool fence_emitted = false;
> + bool tess_rings_needed = false;
>
> /* Do this first so failing to allocate scratch buffers can't result in
> * partially executed submissions. */
> @@ -1564,11 +1747,12 @@ VkResult radv_QueueSubmit(
> cmd_buffer->compute_scratch_size_needed);
> esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
> gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
> + tess_rings_needed |= cmd_buffer->tess_rings_needed;
> }
> }
>
> result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
> - esgs_ring_size, gsvs_ring_size,
> + esgs_ring_size, gsvs_ring_size, tess_rings_needed,
> &initial_preamble_cs, &continue_preamble_cs);
> if (result != VK_SUCCESS)
> return result;
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 35ff1c7..30d48cb 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -459,12 +459,15 @@ struct radv_queue {
> uint32_t compute_scratch_size;
> uint32_t esgs_ring_size;
> uint32_t gsvs_ring_size;
> + bool has_tess_rings;
>
> struct radeon_winsys_bo *scratch_bo;
> struct radeon_winsys_bo *descriptor_bo;
> struct radeon_winsys_bo *compute_scratch_bo;
> struct radeon_winsys_bo *esgs_ring_bo;
> struct radeon_winsys_bo *gsvs_ring_bo;
> + struct radeon_winsys_bo *tess_factor_ring_bo;
> + struct radeon_winsys_bo *tess_offchip_ring_bo;
> struct radeon_winsys_cs *initial_preamble_cs;
> struct radeon_winsys_cs *continue_preamble_cs;
> };
> @@ -744,6 +747,7 @@ struct radv_cmd_buffer {
> uint32_t compute_scratch_size_needed;
> uint32_t esgs_ring_size_needed;
> uint32_t gsvs_ring_size_needed;
> + bool tess_rings_needed;
>
> int ring_offsets_idx; /* just used for verification */
> };
> --
> 2.9.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev mailing list