[Mesa-dev] [PATCH 25/29] radv: emit geometry ring size and pointers via preamble
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Mon Jan 30 21:08:25 UTC 2017
On Mon, Jan 30, 2017, at 07:02, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> This uses the scratch infrastructure to handle the esgs
> and gsvs rings.
>
> (this replaces the old code that did this with patching).
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/vulkan/radv_cmd_buffer.c | 28 ++++++
> src/amd/vulkan/radv_device.c | 199
> ++++++++++++++++++++++++++++++++++++---
> src/amd/vulkan/radv_private.h | 8 ++
> 3 files changed, 224 insertions(+), 11 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c
> b/src/amd/vulkan/radv_cmd_buffer.c
> index c351636..b1b125f 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1458,6 +1458,8 @@ static void radv_reset_cmd_buffer(struct
> radv_cmd_buffer *cmd_buffer)
> cmd_buffer->upload.offset = 0;
>
> cmd_buffer->record_fail = false;
> +
> + cmd_buffer->ring_offsets_idx = -1;
> }
>
> VkResult radv_ResetCommandBuffer(
> @@ -1644,6 +1646,7 @@ VkResult radv_EndCommandBuffer(
>
> if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
> si_emit_cache_flush(cmd_buffer);
> +
> if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
> cmd_buffer->record_fail)
> return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> @@ -1730,6 +1733,20 @@ void radv_CmdBindPipeline(
> radv_dynamic_state_copy(&cmd_buffer->state.dynamic,
> &pipeline->dynamic_state,
> pipeline->dynamic_state_mask);
> +
> + if (pipeline->graphics.esgs_ring_size >
> cmd_buffer->esgs_ring_size_needed)
> + cmd_buffer->esgs_ring_size_needed =
> pipeline->graphics.esgs_ring_size;
> + if (pipeline->graphics.gsvs_ring_size >
> cmd_buffer->gsvs_ring_size_needed)
> + cmd_buffer->gsvs_ring_size_needed =
> pipeline->graphics.gsvs_ring_size;
> +
> + if (radv_pipeline_has_gs(pipeline)) {
> + struct ac_userdata_info *loc =
> radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
> +
> AC_UD_SCRATCH_RING_OFFSETS);
> + if (cmd_buffer->ring_offsets_idx == -1)
> + cmd_buffer->ring_offsets_idx =
> loc->sgpr_idx;
> + else if (loc->sgpr_idx != -1)
> + assert(loc->sgpr_idx !=
> cmd_buffer->ring_offsets_idx);
> + }
> break;
> default:
> assert(!"invalid bind point");
> @@ -1882,6 +1899,17 @@ void radv_CmdExecuteCommands(
> primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
> secondary->compute_scratch_size_needed);
>
> + if (secondary->esgs_ring_size_needed >
> primary->esgs_ring_size_needed)
> + primary->esgs_ring_size_needed =
> secondary->esgs_ring_size_needed;
> + if (secondary->gsvs_ring_size_needed >
> primary->gsvs_ring_size_needed)
> + primary->gsvs_ring_size_needed =
> secondary->gsvs_ring_size_needed;
> +
> + if (secondary->ring_offsets_idx != -1) {
> + if (primary->ring_offsets_idx == -1)
> + primary->ring_offsets_idx =
> secondary->ring_offsets_idx;
> + else
> + assert(secondary->ring_offsets_idx ==
> primary->ring_offsets_idx);
> + }
> primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
> }
>
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index af16c89..fd4e7f5 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -760,6 +760,10 @@ radv_queue_finish(struct radv_queue *queue)
> queue->device->ws->buffer_destroy(queue->descriptor_bo);
> if (queue->scratch_bo)
> queue->device->ws->buffer_destroy(queue->scratch_bo);
> + if (queue->esgs_ring_bo)
> + queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
> + if (queue->gsvs_ring_bo)
> + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
> if (queue->compute_scratch_bo)
> queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
> }
> @@ -1042,24 +1046,118 @@ static void radv_dump_trace(struct radv_device
> *device,
> fclose(f);
> }
>
> +static void
> +fill_geom_rings(struct radv_queue *queue,
> + uint32_t *map,
> + uint32_t esgs_ring_size,
> + struct radeon_winsys_bo *esgs_ring_bo,
> + uint32_t gsvs_ring_size,
> + struct radeon_winsys_bo *gsvs_ring_bo)
> +{
> + uint64_t esgs_va, gsvs_va;
> + esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
> + gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
> + uint32_t *desc = &map[4];
> +
> + /* stride 0, num records - size, add tid, swizzle, elsize4,
> + index stride 64 */
> + desc[0] = esgs_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(true);
> + desc[2] = esgs_ring_size;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(1) |
> + S_008F0C_INDEX_STRIDE(3) |
> + S_008F0C_ADD_TID_ENABLE(true);
> +
> + desc += 4;
> + /* GS entry for ES->GS ring */
> + /* stride 0, num records - size, elsize0,
> + index stride 0 */
> + desc[0] = esgs_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)|
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(false);
> + desc[2] = esgs_ring_size;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(0) |
> + S_008F0C_INDEX_STRIDE(0) |
> + S_008F0C_ADD_TID_ENABLE(false);
> +
> + desc += 4;
> + /* VS entry for GS->VS ring */
> + /* stride 0, num records - size, elsize0,
> + index stride 0 */
> + desc[0] = gsvs_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(false);
> + desc[2] = gsvs_ring_size;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(0) |
> + S_008F0C_INDEX_STRIDE(0) |
> + S_008F0C_ADD_TID_ENABLE(false);
> + desc += 4;
> +
> + /* stride gsvs_itemsize, num records 64
> + elsize 4, index stride 16 */
> + /* shader will patch stride and desc[2] */
> + desc[0] = gsvs_va;
> + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
> + S_008F04_STRIDE(0) |
> + S_008F04_SWIZZLE_ENABLE(true);
> + desc[2] = 0;
> + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> + S_008F0C_ELEMENT_SIZE(1) |
> + S_008F0C_INDEX_STRIDE(1) |
> + S_008F0C_ADD_TID_ENABLE(true);
> +}
> +
> static VkResult
> radv_get_preamble_cs(struct radv_queue *queue,
> uint32_t scratch_size,
> uint32_t compute_scratch_size,
> + uint32_t esgs_ring_size,
> + uint32_t gsvs_ring_size,
> struct radeon_winsys_cs **preamble_cs)
> {
> struct radeon_winsys_bo *scratch_bo = NULL;
> struct radeon_winsys_bo *descriptor_bo = NULL;
> struct radeon_winsys_bo *compute_scratch_bo = NULL;
> + struct radeon_winsys_bo *esgs_ring_bo = NULL;
> + struct radeon_winsys_bo *gsvs_ring_bo = NULL;
> struct radeon_winsys_cs *cs = NULL;
>
> - if (!scratch_size && !compute_scratch_size) {
> + if (!scratch_size && !compute_scratch_size && !esgs_ring_size &&
> !gsvs_ring_size) {
> *preamble_cs = NULL;
> return VK_SUCCESS;
> }
>
> if (scratch_size <= queue->scratch_size &&
> - compute_scratch_size <= queue->compute_scratch_size) {
> + compute_scratch_size <= queue->compute_scratch_size &&
> + esgs_ring_size <= queue->esgs_ring_size &&
> + gsvs_ring_size <= queue->gsvs_ring_size) {
> *preamble_cs = queue->preamble_cs;
> return VK_SUCCESS;
> }
> @@ -1087,9 +1185,40 @@ radv_get_preamble_cs(struct radv_queue *queue,
> } else
> compute_scratch_bo = queue->compute_scratch_bo;
>
> - if (scratch_bo != queue->scratch_bo) {
> + if (esgs_ring_size > queue->esgs_ring_size) {
> + esgs_ring_bo =
> queue->device->ws->buffer_create(queue->device->ws,
> +
> esgs_ring_size,
> + 4096,
> +
> RADEON_DOMAIN_VRAM,
> +
> RADEON_FLAG_NO_CPU_ACCESS);
> + if (!esgs_ring_bo)
> + goto fail;
> + } else
> + esgs_ring_bo = queue->esgs_ring_bo;
> +
> +
> + if (gsvs_ring_size > queue->gsvs_ring_size) {
> + gsvs_ring_bo =
> queue->device->ws->buffer_create(queue->device->ws,
> +
> gsvs_ring_size,
> + 4096,
> +
> RADEON_DOMAIN_VRAM,
> +
> RADEON_FLAG_NO_CPU_ACCESS);
> + if (!gsvs_ring_bo)
> + goto fail;
> + } else
> + gsvs_ring_bo = queue->gsvs_ring_bo;
> +
> + if (scratch_bo != queue->scratch_bo ||
> + esgs_ring_bo != queue->esgs_ring_bo ||
> + gsvs_ring_bo != queue->gsvs_ring_bo) {
> + uint32_t size = 0;
> + if (gsvs_ring_bo || esgs_ring_bo)
> + size = 80; /* 2 dword + 2 padding + 4 dword * 4
> */
> + else if (scratch_bo)
> + size = 8; /* 2 dword */
> +
> descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
> - 8,
> + size,
> 4096,
> RADEON_DOMAIN_VRAM,
> RADEON_FLAG_CPU_ACCESS);
> @@ -1107,22 +1236,49 @@ radv_get_preamble_cs(struct radv_queue *queue,
> if (scratch_bo)
> queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
>
> + if (esgs_ring_bo)
> + queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
> +
> + if (gsvs_ring_bo)
> + queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
> +
> if (descriptor_bo)
> queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
>
> if (descriptor_bo != queue->descriptor_bo) {
> - uint64_t scratch_va =
> queue->device->ws->buffer_get_va(scratch_bo);
> - uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >>
> 32) |
> - S_008F04_SWIZZLE_ENABLE(1);
> -
> uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
>
> - map[0] = scratch_va;
> - map[1] = rsrc1;
> + if (scratch_bo) {
> + uint64_t scratch_va =
> queue->device->ws->buffer_get_va(scratch_bo);
> + uint32_t rsrc1 =
> S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> + S_008F04_SWIZZLE_ENABLE(1);
> + map[0] = scratch_va;
> + map[1] = rsrc1;
> + }
> +
> + if (esgs_ring_bo || gsvs_ring_bo)
> + fill_geom_rings(queue, map, esgs_ring_size,
> esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
You need to MAX2 these sizes with the sizes in the queue; otherwise you
can end up using a smaller size, e.g. if the current cmd_buffer needs a
larger scratch but only small or no GS ring buffers.
>
> queue->device->ws->buffer_unmap(descriptor_bo);
> }
>
> + if (esgs_ring_bo || gsvs_ring_bo) {
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) |
> EVENT_INDEX(4));
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) |
> EVENT_INDEX(0));
> +
> + if (queue->device->physical_device->rad_info.chip_class
> >= CIK) {
> + radeon_set_uconfig_reg_seq(cs,
> R_030900_VGT_ESGS_RING_SIZE, 2);
> + radeon_emit(cs, esgs_ring_size >> 8);
> + radeon_emit(cs, gsvs_ring_size >> 8);
> + } else {
> + radeon_set_config_reg_seq(cs,
> R_0088C8_VGT_ESGS_RING_SIZE, 2);
> + radeon_emit(cs, esgs_ring_size >> 8);
> + radeon_emit(cs, gsvs_ring_size >> 8);
> + }
> + }
> +
> if (descriptor_bo) {
> uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
> R_00B130_SPI_SHADER_USER_DATA_VS_0,
> @@ -1174,6 +1330,20 @@ radv_get_preamble_cs(struct radv_queue *queue,
> queue->compute_scratch_size = compute_scratch_size;
> }
>
> + if (esgs_ring_bo != queue->esgs_ring_bo) {
> + if (queue->esgs_ring_bo)
> +
> queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
> + queue->esgs_ring_bo = esgs_ring_bo;
> + queue->esgs_ring_size = esgs_ring_size;
> + }
> +
> + if (gsvs_ring_bo != queue->gsvs_ring_bo) {
> + if (queue->gsvs_ring_bo)
> +
> queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
> + queue->gsvs_ring_bo = gsvs_ring_bo;
> + queue->gsvs_ring_size = gsvs_ring_size;
> + }
> +
> if (descriptor_bo != queue->descriptor_bo) {
> if (queue->descriptor_bo)
> queue->device->ws->buffer_destroy(queue->descriptor_bo);
> @@ -1192,6 +1362,10 @@ fail:
> queue->device->ws->buffer_destroy(scratch_bo);
> if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
> queue->device->ws->buffer_destroy(compute_scratch_bo);
> + if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo)
> + queue->device->ws->buffer_destroy(esgs_ring_bo);
> + if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
> + queue->device->ws->buffer_destroy(gsvs_ring_bo);
> return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> }
>
> @@ -1209,6 +1383,7 @@ VkResult radv_QueueSubmit(
> uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
> uint32_t scratch_size = 0;
> uint32_t compute_scratch_size = 0;
> + uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
> struct radeon_winsys_cs *preamble_cs = NULL;
> VkResult result;
>
> @@ -1222,10 +1397,12 @@ VkResult radv_QueueSubmit(
> scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
> compute_scratch_size = MAX2(compute_scratch_size,
> cmd_buffer->compute_scratch_size_needed);
> + esgs_ring_size = MAX2(esgs_ring_size,
> cmd_buffer->esgs_ring_size_needed);
> + gsvs_ring_size = MAX2(gsvs_ring_size,
> cmd_buffer->gsvs_ring_size_needed);
> }
> }
>
> - result = radv_get_preamble_cs(queue, scratch_size,
> compute_scratch_size, &preamble_cs);
> + result = radv_get_preamble_cs(queue, scratch_size,
> compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
> if (result != VK_SUCCESS)
> return result;
>
> diff --git a/src/amd/vulkan/radv_private.h
> b/src/amd/vulkan/radv_private.h
> index 3a0318b..57aa9ea 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -470,10 +470,14 @@ struct radv_queue {
>
> uint32_t scratch_size;
> uint32_t compute_scratch_size;
> + uint32_t esgs_ring_size;
> + uint32_t gsvs_ring_size;
>
> struct radeon_winsys_bo *scratch_bo;
> struct radeon_winsys_bo *descriptor_bo;
> struct radeon_winsys_bo *compute_scratch_bo;
> + struct radeon_winsys_bo *esgs_ring_bo;
> + struct radeon_winsys_bo *gsvs_ring_bo;
> struct radeon_winsys_cs *preamble_cs;
> };
>
> @@ -742,6 +746,10 @@ struct radv_cmd_buffer {
>
> uint32_t scratch_size_needed;
> uint32_t compute_scratch_size_needed;
> + uint32_t esgs_ring_size_needed;
> + uint32_t gsvs_ring_size_needed;
> +
These need to be reset on command buffer reset.
With these two comments fixed, patches 1-20, 22-30 (with 30 moved before
29) are
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
I still consider it strange that 21 is needed and am running it through
CTS now.
Bas
> + int ring_offsets_idx; /* just used for verification */
> };
>
> struct radv_image;
> --
> 2.9.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list