[Mesa-dev] [PATCH 6/6] radv: implement fast HTILE clears for depth or stencil only on GFX9

Samuel Pitoiset samuel.pitoiset at gmail.com
Mon Nov 12 16:57:12 UTC 2018


This allows fast clearing of only the depth part (or only the stencil
part) of a depth+stencil surface when HTILE is enabled. I didn't test
this on GFX8, so it is currently disabled there.

This gives a very nice boost, for example when clearing the depth
aspect of a 4096x4096 D32_SFLOAT_S8_UINT image (18x faster).

BEFORE: 235 us
AFTER: 13 us

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
 src/amd/vulkan/radv_meta_clear.c | 268 ++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_private.h    |   6 +
 2 files changed, 269 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index b3128d021d..364a38daba 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -303,6 +303,22 @@ create_color_pipeline(struct radv_device *device,
 	return result;
 }
 
+static void
+finish_meta_clear_htile_mask_state(struct radv_device *device)
+{
+	struct radv_meta_state *meta = &device->meta_state;
+	VkDevice dev_handle = radv_device_to_handle(device);
+
+	/* Destroy the HTILE mask clear pipeline, then both of its layouts. */
+	radv_DestroyPipeline(dev_handle, meta->clear_htile_mask_pipeline,
+			     &meta->alloc);
+	radv_DestroyPipelineLayout(dev_handle, meta->clear_htile_mask_p_layout,
+				   &meta->alloc);
+	radv_DestroyDescriptorSetLayout(dev_handle,
+					meta->clear_htile_mask_ds_layout,
+					&meta->alloc);
+}
+
 void
 radv_device_finish_meta_clear_state(struct radv_device *device)
 {
@@ -339,6 +355,8 @@ radv_device_finish_meta_clear_state(struct radv_device *device)
 	radv_DestroyPipelineLayout(radv_device_to_handle(device),
 				   state->clear_depth_p_layout,
 				   &state->alloc);
+
+	finish_meta_clear_htile_mask_state(device);
 }
 
 static void
@@ -746,6 +764,69 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer,
 	}
 }
 
+/* Clear only the HTILE bits selected by htile_mask in the given range of bo,
+ * using a compute read-modify-write pass. Returns the RADV_CMD_FLAG_* flush
+ * bits (CS partial flush, L1 invalidate, L2 writeback) for the caller. */
+static uint32_t
+clear_htile_mask(struct radv_cmd_buffer *cmd_buffer,
+		 struct radeon_winsys_bo *bo, uint64_t offset, uint64_t size,
+		 uint32_t htile_value, uint32_t htile_mask)
+{
+	struct radv_device *device = cmd_buffer->device;
+	struct radv_meta_state *state = &device->meta_state;
+	/* Each workgroup covers 1024 bytes: 64 invocations x 16 bytes. */
+	uint64_t block_count = round_up_u64(size, 1024);
+	struct radv_meta_saved_state saved_state;
+	struct radv_buffer dst_buffer = {
+		.bo = bo, .offset = offset, .size = size
+	};
+	/* The shader computes data = (data & constants[1]) | constants[0]. */
+	const uint32_t constants[2] = {
+		htile_value & htile_mask,
+		~htile_mask,
+	};
+
+	radv_meta_save(&saved_state, cmd_buffer,
+		       RADV_META_SAVE_COMPUTE_PIPELINE |
+		       RADV_META_SAVE_CONSTANTS |
+		       RADV_META_SAVE_DESCRIPTORS);
+
+	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
+			     VK_PIPELINE_BIND_POINT_COMPUTE,
+			     state->clear_htile_mask_pipeline);
+
+	radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+			              state->clear_htile_mask_p_layout,
+				      0, /* set */
+				      1, /* descriptorWriteCount */
+				      (VkWriteDescriptorSet[]) {
+				              {
+				                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+				                      .dstBinding = 0,
+				                      .dstArrayElement = 0,
+				                      .descriptorCount = 1,
+				                      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+				                      .pBufferInfo = &(VkDescriptorBufferInfo) {
+				                              .buffer = radv_buffer_to_handle(&dst_buffer),
+				                              .offset = 0,
+				                              .range = size
+				                      }
+				              }
+				      });
+
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      state->clear_htile_mask_p_layout,
+			      VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(constants),
+			      constants);
+
+	radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
+	radv_meta_restore(&saved_state, cmd_buffer);
+
+	return RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+	       RADV_CMD_FLAG_INV_VMEM_L1 |
+	       RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+}
+
 static uint32_t
 radv_get_htile_fast_clear_value(const struct radv_image *image,
 				VkClearDepthStencilValue value)
@@ -761,6 +842,24 @@ radv_get_htile_fast_clear_value(const struct radv_image *image,
 	return clear_value;
 }
 
+/* Return the mask of HTILE bits covered by the given aspects of image. */
+static uint32_t
+radv_get_htile_mask(const struct radv_image *image, VkImageAspectFlags aspects)
+{
+	/* All the HTILE buffer is used when there is no stencil. */
+	if (!image->surface.has_stencil)
+		return UINT32_MAX;
+
+	/* Otherwise bits [9:4] are tied to stencil and the rest to depth. */
+	const bool with_depth = aspects & VK_IMAGE_ASPECT_DEPTH_BIT;
+	const bool with_stencil = aspects & VK_IMAGE_ASPECT_STENCIL_BIT;
+
+	const uint32_t depth_bits = with_depth ? 0xfffffc0f : 0;
+	const uint32_t stencil_bits = with_stencil ? 0x000003f0 : 0;
+
+	return depth_bits | stencil_bits;
+}
+
 static bool
 radv_is_fast_clear_depth_allowed(VkClearDepthStencilValue value)
 {
@@ -788,6 +887,7 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 	VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
 	VkImageAspectFlags aspects = clear_att->aspectMask;
 	uint32_t clear_word, flush_bits;
+	uint32_t htile_mask;
 
 	if (!radv_image_has_htile(iview->image))
 		return false;
@@ -821,9 +921,10 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 	if (clear_rect->layerCount != iview->image->info.array_size)
 		return false;
 
-	if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT) ||
+	if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9 &&
+	    (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT) ||
 	    ((vk_format_aspects(iview->image->vk_format) & VK_IMAGE_ASPECT_STENCIL_BIT) &&
-	     !(aspects & VK_IMAGE_ASPECT_STENCIL_BIT)))
+	     !(aspects & VK_IMAGE_ASPECT_STENCIL_BIT))))
 		return false;
 
 	if (((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
@@ -841,6 +942,7 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 		return false;
 
 	clear_word = radv_get_htile_fast_clear_value(iview->image, clear_value);
+	htile_mask = radv_get_htile_mask(iview->image, aspects);
 
 	if (pre_flush) {
 		cmd_buffer->state.flush_bits |= (RADV_CMD_FLAG_FLUSH_AND_INV_DB |
@@ -850,9 +952,19 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
 		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 
-	flush_bits = radv_fill_buffer(cmd_buffer, iview->image->bo,
-				      iview->image->offset + iview->image->htile_offset,
-				      iview->image->surface.htile_size, clear_word);
+	if (htile_mask == UINT_MAX) {
+		/* Clear the whole HTILE buffer. */
+		flush_bits = radv_fill_buffer(cmd_buffer, iview->image->bo,
+					      iview->image->offset + iview->image->htile_offset,
+					      iview->image->surface.htile_size, clear_word);
+	} else {
+		/* Only clear depth or stencil bytes in the HTILE buffer. */
+		assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9);
+		flush_bits = clear_htile_mask(cmd_buffer, iview->image->bo,
+					      iview->image->offset + iview->image->htile_offset,
+					      iview->image->surface.htile_size, clear_word,
+					      htile_mask);
+	}
 
 	radv_update_ds_clear_metadata(cmd_buffer, iview->image, clear_value, aspects);
 	if (post_flush) {
@@ -864,6 +976,148 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 	return true;
 }
 
+/* Build the compute shader for masked HTILE clears: each invocation loads
+ * 16 bytes from the SSBO at set 0/binding 0, keeps the bits selected by
+ * push constant 1, ORs in push constant 0 and stores the result back. */
+static nir_shader *
+build_clear_htile_mask_shader(void)
+{
+	nir_builder b;
+
+	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_clear_htile_mask");
+	b.shader->info.cs.local_size[0] = 64;
+	b.shader->info.cs.local_size[1] = 1;
+	b.shader->info.cs.local_size[2] = 1;
+
+	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
+	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
+	nir_ssa_def *block_size = nir_imm_ivec4(&b,
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
+
+	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
+
+	nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16));
+	offset = nir_channel(&b, offset, 0);
+
+	nir_intrinsic_instr *buf =
+		nir_intrinsic_instr_create(b.shader, nir_intrinsic_vulkan_resource_index);
+	buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	nir_intrinsic_set_desc_set(buf, 0);
+	nir_intrinsic_set_binding(buf, 0);
+	nir_ssa_dest_init(&buf->instr, &buf->dest, 1, 32, NULL);
+	nir_builder_instr_insert(&b, &buf->instr);
+
+	nir_intrinsic_instr *constants =
+		nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(constants, 0);
+	nir_intrinsic_set_range(constants, 8);
+	constants->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	constants->num_components = 2;
+	nir_ssa_dest_init(&constants->instr, &constants->dest, 2, 32, "constants");
+	nir_builder_instr_insert(&b, &constants->instr);
+
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
+	load->src[0] = nir_src_for_ssa(&buf->dest.ssa);
+	load->src[1] = nir_src_for_ssa(offset);
+	nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
+	load->num_components = 4;
+	nir_builder_instr_insert(&b, &load->instr);
+
+	/* data = (data & ~htile_mask) | (htile_value & htile_mask) */
+	nir_ssa_def *data =
+		nir_iand(&b, &load->dest.ssa,
+			 nir_channel(&b, &constants->dest.ssa, 1));
+	data = nir_ior(&b, data, nir_channel(&b, &constants->dest.ssa, 0));
+
+	nir_intrinsic_instr *store =
+		nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
+	store->src[0] = nir_src_for_ssa(data);
+	store->src[1] = nir_src_for_ssa(&buf->dest.ssa);
+	store->src[2] = nir_src_for_ssa(offset);
+	nir_intrinsic_set_write_mask(store, 0xf);
+	store->num_components = 4;
+	nir_builder_instr_insert(&b, &store->instr);
+
+	return b.shader;
+}
+
+/* Create the descriptor set layout, pipeline layout and compute pipeline
+ * used by clear_htile_mask(). Returns the first failing VkResult, if any. */
+static VkResult
+init_meta_clear_htile_mask_state(struct radv_device *device)
+{
+	struct radv_meta_state *state = &device->meta_state;
+	struct radv_shader_module cs = { .nir = build_clear_htile_mask_shader() };
+	VkResult result;
+
+	VkDescriptorSetLayoutCreateInfo ds_layout_info = {
+		.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+		.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+		.bindingCount = 1,
+		.pBindings = (VkDescriptorSetLayoutBinding[]) {
+			{
+				.binding = 0,
+				.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+				.descriptorCount = 1,
+				.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+				.pImmutableSamplers = NULL
+			},
+		}
+	};
+
+	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
+						&ds_layout_info, &state->alloc,
+						&state->clear_htile_mask_ds_layout);
+	if (result != VK_SUCCESS)
+		goto fail;
+
+	VkPipelineLayoutCreateInfo p_layout_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+		.setLayoutCount = 1,
+		.pSetLayouts = &state->clear_htile_mask_ds_layout,
+		.pushConstantRangeCount = 1,
+		.pPushConstantRanges = &(VkPushConstantRange){
+			VK_SHADER_STAGE_COMPUTE_BIT, 0, 8,
+		},
+	};
+
+	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
+					  &p_layout_info, &state->alloc,
+					  &state->clear_htile_mask_p_layout);
+	if (result != VK_SUCCESS)
+		goto fail;
+
+	VkPipelineShaderStageCreateInfo shader_stage = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
+		.module = radv_shader_module_to_handle(&cs),
+		.pName = "main",
+		.pSpecializationInfo = NULL,
+	};
+
+	VkComputePipelineCreateInfo pipeline_info = {
+		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+		.stage = shader_stage,
+		.flags = 0,
+		.layout = state->clear_htile_mask_p_layout,
+	};
+
+	result = radv_CreateComputePipelines(radv_device_to_handle(device),
+					     radv_pipeline_cache_to_handle(&state->cache),
+					     1, &pipeline_info, NULL,
+					     &state->clear_htile_mask_pipeline);
+
+fail:
+	/* Shared by the success and failure paths: the NIR shader is freed
+	 * here whether or not pipeline creation succeeded. */
+	ralloc_free(cs.nir);
+	return result;
+}
+
 VkResult
 radv_device_init_meta_clear_state(struct radv_device *device, bool on_demand)
 {
@@ -898,6 +1152,10 @@ radv_device_init_meta_clear_state(struct radv_device *device, bool on_demand)
 	if (res != VK_SUCCESS)
 		goto fail;
 
+	res = init_meta_clear_htile_mask_state(device);
+	if (res != VK_SUCCESS)
+		goto fail;
+
 	if (on_demand)
 		return VK_SUCCESS;
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 1628be1002..4d7e740905 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -457,6 +457,12 @@ struct radv_meta_state {
 
 	VkPipelineLayout                          clear_color_p_layout;
 	VkPipelineLayout                          clear_depth_p_layout;
+
+	/* Optimized compute fast HTILE clear for stencil or depth only. */
+	VkPipeline clear_htile_mask_pipeline;
+	VkPipelineLayout clear_htile_mask_p_layout;
+	VkDescriptorSetLayout clear_htile_mask_ds_layout;
+
 	struct {
 		VkRenderPass render_pass[NUM_META_FS_KEYS][RADV_META_DST_LAYOUT_COUNT];
 
-- 
2.19.1



More information about the mesa-dev mailing list