[Mesa-dev] [PATCH 25/29] radv: emit geometry ring size and pointers via preamble

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Mon Jan 30 21:08:25 UTC 2017



On Mon, Jan 30, 2017, at 07:02, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> This uses the scratch infrastructure to handle the esgs
> and gsvs rings.
> 
> (this replaces the old code that did this with patching).
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/amd/vulkan/radv_cmd_buffer.c |  28 ++++++
>  src/amd/vulkan/radv_device.c     | 199
>  ++++++++++++++++++++++++++++++++++++---
>  src/amd/vulkan/radv_private.h    |   8 ++
>  3 files changed, 224 insertions(+), 11 deletions(-)
> 
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c
> b/src/amd/vulkan/radv_cmd_buffer.c
> index c351636..b1b125f 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1458,6 +1458,8 @@ static void  radv_reset_cmd_buffer(struct
> radv_cmd_buffer *cmd_buffer)
>  	cmd_buffer->upload.offset = 0;
>  
>  	cmd_buffer->record_fail = false;
> +
> +       cmd_buffer->ring_offsets_idx = -1;
>  }
>  
>  VkResult radv_ResetCommandBuffer(
> @@ -1644,6 +1646,7 @@ VkResult radv_EndCommandBuffer(
>  
>  	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
>  		si_emit_cache_flush(cmd_buffer);
> +
>  	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
>  	    cmd_buffer->record_fail)
>  		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> @@ -1730,6 +1733,20 @@ void radv_CmdBindPipeline(
>  		radv_dynamic_state_copy(&cmd_buffer->state.dynamic,
>  					&pipeline->dynamic_state,
>  					pipeline->dynamic_state_mask);
> +
> +               if (pipeline->graphics.esgs_ring_size >
> cmd_buffer->esgs_ring_size_needed)
> +                       cmd_buffer->esgs_ring_size_needed =
> pipeline->graphics.esgs_ring_size;
> +               if (pipeline->graphics.gsvs_ring_size >
> cmd_buffer->gsvs_ring_size_needed)
> +                       cmd_buffer->gsvs_ring_size_needed =
> pipeline->graphics.gsvs_ring_size;
> +
> +               if (radv_pipeline_has_gs(pipeline)) {
> +                       struct ac_userdata_info *loc =
> radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
> +                                                                        
>    AC_UD_SCRATCH_RING_OFFSETS);
> +                       if (cmd_buffer->ring_offsets_idx == -1)
> +                               cmd_buffer->ring_offsets_idx =
> loc->sgpr_idx;
> +                       else if (loc->sgpr_idx != -1)
> +                               assert(loc->sgpr_idx !=
> cmd_buffer->ring_offsets_idx);
> +               }
>  		break;
>  	default:
>  		assert(!"invalid bind point");
> @@ -1882,6 +1899,17 @@ void radv_CmdExecuteCommands(
>  		primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
>  		                                            secondary->compute_scratch_size_needed);
>  
> +               if (secondary->esgs_ring_size_needed >
> primary->esgs_ring_size_needed)
> +                       primary->esgs_ring_size_needed =
> secondary->esgs_ring_size_needed;
> +               if (secondary->gsvs_ring_size_needed >
> primary->gsvs_ring_size_needed)
> +                       primary->gsvs_ring_size_needed =
> secondary->gsvs_ring_size_needed;
> +
> +               if (secondary->ring_offsets_idx != -1) {
> +                       if (primary->ring_offsets_idx == -1)
> +                               primary->ring_offsets_idx =
> secondary->ring_offsets_idx;
> +                       else
> +                               assert(secondary->ring_offsets_idx ==
> primary->ring_offsets_idx);
> +               }
>  		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
>  	}
>  
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index af16c89..fd4e7f5 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -760,6 +760,10 @@ radv_queue_finish(struct radv_queue *queue)
>  		queue->device->ws->buffer_destroy(queue->descriptor_bo);
>  	if (queue->scratch_bo)
>  		queue->device->ws->buffer_destroy(queue->scratch_bo);
> +       if (queue->esgs_ring_bo)
> +               queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
> +       if (queue->gsvs_ring_bo)
> +               queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
>  	if (queue->compute_scratch_bo)
>  		queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
>  }
> @@ -1042,24 +1046,118 @@ static void radv_dump_trace(struct radv_device
> *device,
>  	fclose(f);
>  }
>  
> +static void
> +fill_geom_rings(struct radv_queue *queue,
> +               uint32_t *map,
> +               uint32_t esgs_ring_size,
> +               struct radeon_winsys_bo *esgs_ring_bo,
> +               uint32_t gsvs_ring_size,
> +               struct radeon_winsys_bo *gsvs_ring_bo)
> +{
> +       uint64_t esgs_va, gsvs_va;
> +       esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
> +       gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
> +       uint32_t *desc = &map[4];
> +
> +       /* stride 0, num records - size, add tid, swizzle, elsize4,
> +          index stride 64 */
> +       desc[0] = esgs_va;
> +       desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
> +               S_008F04_STRIDE(0) |
> +               S_008F04_SWIZZLE_ENABLE(true);
> +       desc[2] = esgs_ring_size;
> +       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> +               S_008F0C_ELEMENT_SIZE(1) |
> +               S_008F0C_INDEX_STRIDE(3) |
> +               S_008F0C_ADD_TID_ENABLE(true);
> +
> +       desc += 4;
> +       /* GS entry for ES->GS ring */
> +       /* stride 0, num records - size, elsize0,
> +          index stride 0 */
> +       desc[0] = esgs_va;
> +       desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)|
> +               S_008F04_STRIDE(0) |
> +               S_008F04_SWIZZLE_ENABLE(false);
> +       desc[2] = esgs_ring_size;
> +       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> +               S_008F0C_ELEMENT_SIZE(0) |
> +               S_008F0C_INDEX_STRIDE(0) |
> +               S_008F0C_ADD_TID_ENABLE(false);
> +
> +       desc += 4;
> +       /* VS entry for GS->VS ring */
> +       /* stride 0, num records - size, elsize0,
> +          index stride 0 */
> +       desc[0] = gsvs_va;
> +       desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
> +               S_008F04_STRIDE(0) |
> +               S_008F04_SWIZZLE_ENABLE(false);
> +       desc[2] = gsvs_ring_size;
> +       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> +               S_008F0C_ELEMENT_SIZE(0) |
> +               S_008F0C_INDEX_STRIDE(0) |
> +               S_008F0C_ADD_TID_ENABLE(false);
> +       desc += 4;
> +
> +       /* stride gsvs_itemsize, num records 64
> +          elsize 4, index stride 16 */
> +       /* shader will patch stride and desc[2] */
> +       desc[0] = gsvs_va;
> +       desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
> +               S_008F04_STRIDE(0) |
> +               S_008F04_SWIZZLE_ENABLE(true);
> +       desc[2] = 0;
> +       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
> +               S_008F0C_ELEMENT_SIZE(1) |
> +               S_008F0C_INDEX_STRIDE(1) |
> +               S_008F0C_ADD_TID_ENABLE(true);
> +}
> +
>  static VkResult
>  radv_get_preamble_cs(struct radv_queue *queue,
>                       uint32_t scratch_size,
>                       uint32_t compute_scratch_size,
> +                    uint32_t esgs_ring_size,
> +                    uint32_t gsvs_ring_size,
>                       struct radeon_winsys_cs **preamble_cs)
>  {
>  	struct radeon_winsys_bo *scratch_bo = NULL;
>  	struct radeon_winsys_bo *descriptor_bo = NULL;
>  	struct radeon_winsys_bo *compute_scratch_bo = NULL;
> +       struct radeon_winsys_bo *esgs_ring_bo = NULL;
> +       struct radeon_winsys_bo *gsvs_ring_bo = NULL;
>  	struct radeon_winsys_cs *cs = NULL;
>  
> -       if (!scratch_size && !compute_scratch_size) {
> +       if (!scratch_size && !compute_scratch_size && !esgs_ring_size &&
> !gsvs_ring_size) {
>  		*preamble_cs = NULL;
>  		return VK_SUCCESS;
>  	}
>  
>  	if (scratch_size <= queue->scratch_size &&
> -           compute_scratch_size <= queue->compute_scratch_size) {
> +           compute_scratch_size <= queue->compute_scratch_size &&
> +           esgs_ring_size <= queue->esgs_ring_size &&
> +           gsvs_ring_size <= queue->gsvs_ring_size) {
>  		*preamble_cs = queue->preamble_cs;
>  		return VK_SUCCESS;
>  	}
> @@ -1087,9 +1185,40 @@ radv_get_preamble_cs(struct radv_queue *queue,
>  	} else
>  		compute_scratch_bo = queue->compute_scratch_bo;
>  
> -       if (scratch_bo != queue->scratch_bo) {
> +       if (esgs_ring_size > queue->esgs_ring_size) {
> +               esgs_ring_bo =
> queue->device->ws->buffer_create(queue->device->ws,
> +                                                              
> esgs_ring_size,
> +                                                               4096,
> +                                                              
> RADEON_DOMAIN_VRAM,
> +                                                              
> RADEON_FLAG_NO_CPU_ACCESS);
> +               if (!esgs_ring_bo)
> +                       goto fail;
> +       } else
> +               esgs_ring_bo = queue->esgs_ring_bo;
> +
> +
> +       if (gsvs_ring_size > queue->gsvs_ring_size) {
> +               gsvs_ring_bo =
> queue->device->ws->buffer_create(queue->device->ws,
> +                                                              
> gsvs_ring_size,
> +                                                               4096,
> +                                                              
> RADEON_DOMAIN_VRAM,
> +                                                              
> RADEON_FLAG_NO_CPU_ACCESS);
> +               if (!gsvs_ring_bo)
> +                       goto fail;
> +       } else
> +               gsvs_ring_bo = queue->gsvs_ring_bo;
> +
> +       if (scratch_bo != queue->scratch_bo ||
> +           esgs_ring_bo != queue->esgs_ring_bo ||
> +           gsvs_ring_bo != queue->gsvs_ring_bo) {
> +               uint32_t size = 0;
> +               if (gsvs_ring_bo || esgs_ring_bo)
> +                       size = 80; /* 2 dword + 2 padding + 4 dword * 4
> */
> +               else if (scratch_bo)
> +                       size = 8; /* 2 dword */
> +
>  		descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
> -                                                                8,
> +                                                                size,
>  		                                                 4096,
>  		                                                 RADEON_DOMAIN_VRAM,
>  		                                                 RADEON_FLAG_CPU_ACCESS);
> @@ -1107,22 +1236,49 @@ radv_get_preamble_cs(struct radv_queue *queue,
>  	if (scratch_bo)
>  		queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
>  
> +       if (esgs_ring_bo)
> +               queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
> +
> +       if (gsvs_ring_bo)
> +               queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
> +
>  	if (descriptor_bo)
>  		queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
>  
>  	if (descriptor_bo != queue->descriptor_bo) {
> -               uint64_t scratch_va =
> queue->device->ws->buffer_get_va(scratch_bo);
> -               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >>
> 32) |
> -                                S_008F04_SWIZZLE_ENABLE(1);
> -
>  		uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
>  
> -               map[0] = scratch_va;
> -               map[1] = rsrc1;
> +               if (scratch_bo) {
> +                       uint64_t scratch_va =
> queue->device->ws->buffer_get_va(scratch_bo);
> +                       uint32_t rsrc1 =
> S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +                               S_008F04_SWIZZLE_ENABLE(1);
> +                       map[0] = scratch_va;
> +                       map[1] = rsrc1;
> +               }
> +
> +               if (esgs_ring_bo || gsvs_ring_bo)
> +                       fill_geom_rings(queue, map, esgs_ring_size,
> esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);

You need to MAX2 these sizes with the sizes already stored in the queue.
Otherwise a smaller size can end up being used, e.g. when the current
cmd_buffer needs a larger scratch buffer but only small or no GS ring
buffers.
>  
>  		queue->device->ws->buffer_unmap(descriptor_bo);
>  	}
>  
> +       if (esgs_ring_bo || gsvs_ring_bo) {
> +               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> +               radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) |
> EVENT_INDEX(4));
> +               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> +               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) |
> EVENT_INDEX(0));
> +
> +               if (queue->device->physical_device->rad_info.chip_class
> >= CIK) {
> +                       radeon_set_uconfig_reg_seq(cs,
> R_030900_VGT_ESGS_RING_SIZE, 2);
> +                       radeon_emit(cs, esgs_ring_size >> 8);
> +                       radeon_emit(cs, gsvs_ring_size >> 8);
> +               } else {
> +                       radeon_set_config_reg_seq(cs,
> R_0088C8_VGT_ESGS_RING_SIZE, 2);
> +                       radeon_emit(cs, esgs_ring_size >> 8);
> +                       radeon_emit(cs, gsvs_ring_size >> 8);
> +               }
> +       }
> +
>  	if (descriptor_bo) {
>  		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
>  		                   R_00B130_SPI_SHADER_USER_DATA_VS_0,
> @@ -1174,6 +1330,20 @@ radv_get_preamble_cs(struct radv_queue *queue,
>  		queue->compute_scratch_size = compute_scratch_size;
>  	}
>  
> +       if (esgs_ring_bo != queue->esgs_ring_bo) {
> +               if (queue->esgs_ring_bo)
> +                      
> queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
> +               queue->esgs_ring_bo = esgs_ring_bo;
> +               queue->esgs_ring_size = esgs_ring_size;
> +       }
> +
> +       if (gsvs_ring_bo != queue->gsvs_ring_bo) {
> +               if (queue->gsvs_ring_bo)
> +                      
> queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
> +               queue->gsvs_ring_bo = gsvs_ring_bo;
> +               queue->gsvs_ring_size = gsvs_ring_size;
> +       }
> +
>  	if (descriptor_bo != queue->descriptor_bo) {
>  		if (queue->descriptor_bo)
>  			queue->device->ws->buffer_destroy(queue->descriptor_bo);
> @@ -1192,6 +1362,10 @@ fail:
>  		queue->device->ws->buffer_destroy(scratch_bo);
>  	if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
>  		queue->device->ws->buffer_destroy(compute_scratch_bo);
> +       if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo)
> +               queue->device->ws->buffer_destroy(esgs_ring_bo);
> +       if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
> +               queue->device->ws->buffer_destroy(gsvs_ring_bo);
>  	return VK_ERROR_OUT_OF_DEVICE_MEMORY;
>  }
>  
> @@ -1209,6 +1383,7 @@ VkResult radv_QueueSubmit(
>  	uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
>  	uint32_t scratch_size = 0;
>  	uint32_t compute_scratch_size = 0;
> +       uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
>  	struct radeon_winsys_cs *preamble_cs = NULL;
>  	VkResult result;
>  
> @@ -1222,10 +1397,12 @@ VkResult radv_QueueSubmit(
>  			scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
>  			compute_scratch_size = MAX2(compute_scratch_size,
>  			                            cmd_buffer->compute_scratch_size_needed);
> +                       esgs_ring_size = MAX2(esgs_ring_size,
> cmd_buffer->esgs_ring_size_needed);
> +                       gsvs_ring_size = MAX2(gsvs_ring_size,
> cmd_buffer->gsvs_ring_size_needed);
>  		}
>  	}
>  
> -       result = radv_get_preamble_cs(queue, scratch_size,
> compute_scratch_size, &preamble_cs);
> +       result = radv_get_preamble_cs(queue, scratch_size,
> compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
>  	if (result != VK_SUCCESS)
>  		return result;
>  
> diff --git a/src/amd/vulkan/radv_private.h
> b/src/amd/vulkan/radv_private.h
> index 3a0318b..57aa9ea 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -470,10 +470,14 @@ struct radv_queue {
>  
>  	uint32_t scratch_size;
>  	uint32_t compute_scratch_size;
> +       uint32_t esgs_ring_size;
> +       uint32_t gsvs_ring_size;
>  
>  	struct radeon_winsys_bo *scratch_bo;
>  	struct radeon_winsys_bo *descriptor_bo;
>  	struct radeon_winsys_bo *compute_scratch_bo;
> +       struct radeon_winsys_bo *esgs_ring_bo;
> +       struct radeon_winsys_bo *gsvs_ring_bo;
>  	struct radeon_winsys_cs *preamble_cs;
>  };
>  
> @@ -742,6 +746,10 @@ struct radv_cmd_buffer {
>  
>  	uint32_t scratch_size_needed;
>  	uint32_t compute_scratch_size_needed;
> +       uint32_t esgs_ring_size_needed;
> +       uint32_t gsvs_ring_size_needed;
> +

These two fields (esgs_ring_size_needed and gsvs_ring_size_needed) need to
be reset when the command buffer is reset.

With these two comments addressed, patches 1-20 and 22-30 (with 30 moved
before 29) are

Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>

I still find it strange that patch 21 is needed, and I am running it
through CTS now.

Bas
> +       int ring_offsets_idx; /* just used for verification */
>  };
>  
>  struct radv_image;
> -- 
> 2.9.3
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list