[Mesa-dev] [PATCH] radeonsi: implement TC-compatible HTILE

Marek Olšák maraeo at gmail.com
Wed Oct 12 13:54:54 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

Implement TC-compatible HTILE so that decompress blits aren't needed and
depth texturing needs less memory bandwidth.

Z16 and Z24 are promoted to Z32_FLOAT by the driver, because TC-compatible
HTILE only supports Z32_FLOAT. This doubles the memory footprint for Z16.
The format promotion is not visible to state trackers.

This is part of TC-compatible renderbuffer compression, which has 3 parts:
DCC, HTILE, FMASK. Only TC-compatible FMASK compression is missing now.

I don't see a measurable increase in performance though.

(I tested Talos Principle and DiRT: Showdown. The latter improved by
 0.5%, which is almost noise. It originally used layered Z16, so at
 least we know that Z16 promoted to Z32F isn't slower now.)
---
 src/gallium/drivers/radeon/r600_pipe_common.h  |  3 ++
 src/gallium/drivers/radeon/r600_texture.c      | 67 ++++++++++++++++++++++----
 src/gallium/drivers/radeon/radeon_winsys.h     |  4 ++
 src/gallium/drivers/radeonsi/si_blit.c         | 11 ++++-
 src/gallium/drivers/radeonsi/si_descriptors.c  |  7 ++-
 src/gallium/drivers/radeonsi/si_shader.c       | 18 ++++++-
 src/gallium/drivers/radeonsi/si_state.c        | 39 +++++++++++++--
 src/gallium/drivers/radeonsi/si_state_draw.c   |  3 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_surface.c | 57 ++++++++++++++++++++--
 9 files changed, 185 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 290b228..5cfcad6 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -238,27 +238,29 @@ struct r600_cmask_info {
 	unsigned yalign;
 	unsigned slice_tile_max;
 	unsigned base_address_reg;
 };
 
 struct r600_htile_info {
 	unsigned pitch;
 	unsigned height;
 	unsigned xalign;
 	unsigned yalign;
+	unsigned alignment;
 };
 
 struct r600_texture {
 	struct r600_resource		resource;
 
 	uint64_t			size;
 	unsigned			num_level0_transfers;
+	enum pipe_format		db_render_format;
 	bool				is_depth;
 	bool				db_compatible;
 	bool				can_sample_z;
 	bool				can_sample_s;
 	unsigned			dirty_level_mask; /* each bit says if that mipmap is compressed */
 	unsigned			stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
 	struct r600_texture		*flushed_depth_texture;
 	struct radeon_surf		surface;
 
 	/* Colorbuffer compression and fast clear. */
@@ -266,20 +268,21 @@ struct r600_texture {
 	struct r600_cmask_info		cmask;
 	struct r600_resource		*cmask_buffer;
 	uint64_t			dcc_offset; /* 0 = disabled */
 	unsigned			cb_color_info; /* fast clear enable bit */
 	unsigned			color_clear_value[2];
 	unsigned			last_msaa_resolve_target_micro_mode;
 
 	/* Depth buffer compression and fast clear. */
 	struct r600_htile_info		htile;
 	struct r600_resource		*htile_buffer;
+	bool				tc_compatible_htile;
 	bool				depth_cleared; /* if it was cleared at least once */
 	float				depth_clear_value;
 	bool				stencil_cleared; /* if it was cleared at least once */
 	uint8_t				stencil_clear_value;
 
 	bool				non_disp_tiling; /* R600-Cayman only */
 
 	/* Whether the texture is a displayable back buffer and needs DCC
 	 * decompression, which is expensive. Therefore, it's enabled only
 	 * if statistics suggest that it will pay off and it's allocated
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 57cdbcf..625d091 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -185,21 +185,22 @@ static unsigned r600_texture_get_offset(struct r600_texture *rtex, unsigned leve
 	return rtex->surface.level[level].offset +
 	       box->z * rtex->surface.level[level].slice_size +
 	       box->y / util_format_get_blockheight(format) * rtex->surface.level[level].pitch_bytes +
 	       box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
 }
 
 static int r600_init_surface(struct r600_common_screen *rscreen,
 			     struct radeon_surf *surface,
 			     const struct pipe_resource *ptex,
 			     unsigned array_mode,
-			     bool is_flushed_depth)
+			     bool is_flushed_depth,
+			     bool tc_compatible_htile)
 {
 	const struct util_format_description *desc =
 		util_format_description(ptex->format);
 	bool is_depth, is_stencil;
 
 	is_depth = util_format_has_depth(desc);
 	is_stencil = util_format_has_stencil(desc);
 
 	surface->npix_x = ptex->width0;
 	surface->npix_y = ptex->height0;
@@ -249,25 +250,36 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
 		surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_CUBEMAP, TYPE);
 		break;
 	case PIPE_BUFFER:
 	default:
 		return -EINVAL;
 	}
 
 	if (!is_flushed_depth && is_depth) {
 		surface->flags |= RADEON_SURF_ZBUFFER;
 
+		if (tc_compatible_htile &&
+		    array_mode == RADEON_SURF_MODE_2D) {
+			/* TC-compatible HTILE only supports Z32_FLOAT.
+			 * Promote Z16 to Z32. DB->CB copies will convert
+			 * the format for transfers.
+			 */
+			surface->bpe = 4;
+			surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+		}
+
 		if (is_stencil) {
 			surface->flags |= RADEON_SURF_SBUFFER |
 					  RADEON_SURF_HAS_SBUFFER_MIPTREE;
 		}
 	}
+
 	if (rscreen->chip_class >= SI) {
 		surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
 	}
 
 	if (rscreen->chip_class >= VI &&
 	    (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC ||
 	     ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT))
 		surface->flags |= RADEON_SURF_DISABLE_DCC;
 
 	if (ptex->bind & PIPE_BIND_SCANOUT) {
@@ -897,43 +909,57 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	slice_elements = (width * height) / (8 * 8);
 	slice_bytes = slice_elements * 4;
 
 	pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
 	base_align = num_pipes * pipe_interleave_bytes;
 
 	rtex->htile.pitch = width;
 	rtex->htile.height = height;
 	rtex->htile.xalign = cl_width * 8;
 	rtex->htile.yalign = cl_height * 8;
+	rtex->htile.alignment = base_align;
 
 	return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
 		align(slice_bytes, base_align);
 }
 
 static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
 					struct r600_texture *rtex)
 {
-	unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
+	uint64_t htile_size, alignment;
+	uint32_t clear_value;
+
+	if (rtex->tc_compatible_htile) {
+		htile_size = rtex->surface.htile_size;
+		alignment = rtex->surface.htile_alignment;
+		clear_value = 0x0000030F;
+	} else {
+		htile_size = r600_texture_get_htile_size(rscreen, rtex);
+		alignment = rtex->htile.alignment;
+		clear_value = 0;
+	}
 
 	if (!htile_size)
 		return;
 
 	rtex->htile_buffer = (struct r600_resource*)
-			     pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
-						PIPE_USAGE_DEFAULT, htile_size);
+			     r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
+							PIPE_USAGE_DEFAULT,
+							htile_size, alignment);
 	if (rtex->htile_buffer == NULL) {
 		/* this is not a fatal error as we can still keep rendering
 		 * without htile buffer */
 		R600_ERR("Failed to create buffer object for htile buffer.\n");
 	} else {
-		r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
-					 htile_size, 0, R600_COHERENCY_NONE);
+		r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
+					 0, htile_size, clear_value,
+					 R600_COHERENCY_NONE);
 	}
 }
 
 void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 {
 	int i;
 
 	fprintf(f, "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
 		"blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
 		"bpe=%u, nsamples=%u, flags=0x%x, %s\n",
@@ -960,24 +986,25 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 
 	if (rtex->cmask.size)
 		fprintf(f, "  CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, "
 			"height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
 			rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
 			rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign,
 			rtex->cmask.yalign, rtex->cmask.slice_tile_max);
 
 	if (rtex->htile_buffer)
 		fprintf(f, "  HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
-			"xalign=%u, yalign=%u\n",
+			"xalign=%u, yalign=%u, TC_compatible = %u\n",
 			rtex->htile_buffer->b.b.width0,
 			rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
-			rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign);
+			rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
+			rtex->tc_compatible_htile);
 
 	if (rtex->dcc_offset) {
 		fprintf(f, "  DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
 			rtex->dcc_offset, rtex->surface.dcc_size,
 			rtex->surface.dcc_alignment);
 		for (i = 0; i <= rtex->surface.last_level; i++)
 			fprintf(f, "  DCCLevel[%i]: enabled=%u, offset=%"PRIu64", "
 				"fast_clear_size=%"PRIu64"\n",
 				i, rtex->surface.level[i].dcc_enabled,
 				rtex->surface.level[i].dcc_offset,
@@ -1047,20 +1074,30 @@ r600_texture_create_object(struct pipe_screen *screen,
 
 	/* don't include stencil-only formats which we don't support for rendering */
 	rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
 
 	rtex->surface = *surface;
 	if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
 		FREE(rtex);
 		return NULL;
 	}
 
+	rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
+	assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
+	       rtex->tc_compatible_htile);
+
+	/* TC-compatible HTILE only supports Z32_FLOAT. */
+	if (rtex->tc_compatible_htile)
+		rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+	else
+		rtex->db_render_format = base->format;
+
 	/* Tiled depth textures utilize the non-displayable tile order.
 	 * This must be done after r600_setup_surface.
 	 * Applies to R600-Cayman. */
 	rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
 	/* Applies to GCN. */
 	rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
 
 	/* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
 	 * between frames, so the only thing that can enable separate DCC
 	 * with DRI2 is multiple slow clears within a frame.
@@ -1234,25 +1271,34 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 
 	/* The allocator will switch to 1D if needed. */
 	return RADEON_SURF_MODE_2D;
 }
 
 struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 					  const struct pipe_resource *templ)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct radeon_surf surface = {0};
+	bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
+	bool tc_compatible_htile =
+		rscreen->chip_class >= VI &&
+		(templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+		!(rscreen->debug_flags & DBG_NO_HYPERZ) &&
+		!is_flushed_depth &&
+		templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+		util_format_is_depth_or_stencil(templ->format);
+
 	int r;
 
 	r = r600_init_surface(rscreen, &surface, templ,
 			      r600_choose_tiling(rscreen, templ),
-			      templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
+			      is_flushed_depth, tc_compatible_htile);
 	if (r) {
 		return NULL;
 	}
 	r = rscreen->ws->surface_best(rscreen->ws, &surface);
 	if (r) {
 		return NULL;
 	}
 	return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
 								  0, NULL, &surface);
 }
@@ -1289,21 +1335,22 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 	surface.mtilea = metadata.mtilea;
 	surface.num_banks = metadata.num_banks;
 
 	if (metadata.macrotile == RADEON_LAYOUT_TILED)
 		array_mode = RADEON_SURF_MODE_2D;
 	else if (metadata.microtile == RADEON_LAYOUT_TILED)
 		array_mode = RADEON_SURF_MODE_1D;
 	else
 		array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
 
-	r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
+	r = r600_init_surface(rscreen, &surface, templ, array_mode,
+			      false, false);
 	if (r) {
 		return NULL;
 	}
 
 	if (metadata.scanout)
 		surface.flags |= RADEON_SURF_SCANOUT;
 
 	rtex = r600_texture_create_object(screen, templ, stride,
 					  offset, buf, &surface);
 	if (!rtex)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 7146737..8946209 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -271,20 +271,21 @@ enum radeon_feature_id {
 #define     RADEON_SURF_MODE_1D                     2
 #define     RADEON_SURF_MODE_2D                     3
 #define RADEON_SURF_SCANOUT                     (1 << 16)
 #define RADEON_SURF_ZBUFFER                     (1 << 17)
 #define RADEON_SURF_SBUFFER                     (1 << 18)
 #define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
 #define RADEON_SURF_HAS_SBUFFER_MIPTREE         (1 << 19)
 #define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
 #define RADEON_SURF_FMASK                       (1 << 21)
 #define RADEON_SURF_DISABLE_DCC                 (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
 
 #define RADEON_SURF_GET(v, field)   (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
 #define RADEON_SURF_SET(v, field)   (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
 #define RADEON_SURF_CLR(v, field)   ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT))
 
 struct radeon_surf_level {
     uint64_t                    offset;
     uint64_t                    slice_size;
     uint32_t                    npix_x;
     uint32_t                    npix_y;
@@ -337,20 +338,23 @@ struct radeon_surf {
     /* Whether the depth miptree or stencil miptree as used by the DB are
      * adjusted from their TC compatible form to ensure depth/stencil
      * compatibility. If either is true, the corresponding plane cannot be
      * sampled from.
      */
     bool                        depth_adjusted;
     bool                        stencil_adjusted;
 
     uint64_t                    dcc_size;
     uint64_t                    dcc_alignment;
+    /* TC-compatible HTILE only. */
+    uint64_t                    htile_size;
+    uint64_t                    htile_alignment;
 };
 
 struct radeon_bo_list_item {
     uint64_t bo_size;
     uint64_t vm_address;
     uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
 };
 
 struct radeon_winsys {
     /**
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index c143601..db41f56 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -325,20 +325,22 @@ si_flush_depth_texture(struct si_context *sctx,
 		levels_s = level_mask & tex->stencil_dirty_level_mask;
 
 		if (levels_s) {
 			if (r600_can_sample_zs(tex, true))
 				inplace_planes |= PIPE_MASK_S;
 			else
 				copy_planes |= PIPE_MASK_S;
 		}
 	}
 
+	assert(!tex->tc_compatible_htile || levels_z == 0);
+
 	/* We may have to allocate the flushed texture here when called from
 	 * si_decompress_subresource.
 	 */
 	if (copy_planes &&
 	    (tex->flushed_depth_texture ||
 	     r600_init_flushed_depth_texture(&sctx->b.b, &tex->resource.b.b, NULL))) {
 		struct r600_texture *dst = tex->flushed_depth_texture;
 		unsigned fully_copied_levels;
 		unsigned levels = 0;
 
@@ -692,35 +694,40 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 			tex = (struct r600_texture *)fb->cbufs[i]->texture;
 			if (tex->fmask.size == 0)
 				tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
 		}
 	}
 
 	if (zstex && zstex->htile_buffer &&
 	    zsbuf->u.tex.level == 0 &&
 	    zsbuf->u.tex.first_layer == 0 &&
 	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
-		if (buffers & PIPE_CLEAR_DEPTH) {
+		/* TC-compatible HTILE only supports depth clears to 0 or 1. */
+		if (buffers & PIPE_CLEAR_DEPTH &&
+		    (!zstex->tc_compatible_htile ||
+		     depth == 0 || depth == 1)) {
 			/* Need to disable EXPCLEAR temporarily if clearing
 			 * to a new value. */
 			if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
 				sctx->db_depth_disable_expclear = true;
 			}
 
 			zstex->depth_clear_value = depth;
 			sctx->framebuffer.dirty_zsbuf = true;
 			si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
 			sctx->db_depth_clear = true;
 			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 
-		if (buffers & PIPE_CLEAR_STENCIL) {
+		/* TC-compatible HTILE only supports stencil clears to 0. */
+		if (buffers & PIPE_CLEAR_STENCIL &&
+		    (!zstex->tc_compatible_htile || stencil == 0)) {
 			stencil &= 0xff;
 
 			/* Need to disable EXPCLEAR temporarily if clearing
 			 * to a new value. */
 			if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
 				sctx->db_stencil_disable_expclear = true;
 			}
 
 			zstex->stencil_clear_value = stencil;
 			sctx->framebuffer.dirty_zsbuf = true;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 350242a..19cae65 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -392,20 +392,23 @@ void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
 	state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
 	state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
 							     is_stencil));
 	state[4] |= S_008F20_PITCH(pitch - 1);
 
 	if (tex->dcc_offset && tex->surface.level[first_level].dcc_enabled) {
 		state[6] |= S_008F28_COMPRESSION_EN(1);
 		state[7] = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
 			    tex->dcc_offset +
 			    base_level_info->dcc_offset) >> 8;
+	} else if (tex->tc_compatible_htile) {
+		state[6] |= S_008F28_COMPRESSION_EN(1);
+		state[7] = tex->htile_buffer->gpu_address >> 8;
 	}
 }
 
 static void si_set_sampler_view(struct si_context *sctx,
 				unsigned shader,
 				unsigned slot, struct pipe_sampler_view *view,
 				bool disallow_early_out)
 {
 	struct si_sampler_views *views = &sctx->samplers[shader].views;
 	struct si_sampler_view *rview = (struct si_sampler_view*)view;
@@ -501,22 +504,24 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 			samplers->compressed_colortex_mask &= ~(1u << slot);
 			si_set_sampler_view(sctx, shader, slot, NULL, false);
 			continue;
 		}
 
 		si_set_sampler_view(sctx, shader, slot, views[i], false);
 
 		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
 			struct r600_texture *rtex =
 				(struct r600_texture*)views[i]->texture;
+			struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
 
-			if (rtex->db_compatible) {
+			if (rtex->db_compatible &&
+			    (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
 				samplers->depth_texture_mask |= 1u << slot;
 			} else {
 				samplers->depth_texture_mask &= ~(1u << slot);
 			}
 			if (is_compressed_colortex(rtex)) {
 				samplers->compressed_colortex_mask |= 1u << slot;
 			} else {
 				samplers->compressed_colortex_mask &= ~(1u << slot);
 			}
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 49d4121..621d57f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4535,26 +4535,40 @@ static void tex_fetch_args(
 	}
 
 	/* Pack LOD bias value */
 	if (opcode == TGSI_OPCODE_TXB)
 		address[count++] = coords[3];
 	if (opcode == TGSI_OPCODE_TXB2)
 		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	/* Pack depth comparison value */
 	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
+		LLVMValueRef z;
+
 		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+			z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 		} else {
 			assert(ref_pos >= 0);
-			address[count++] = coords[ref_pos];
+			z = coords[ref_pos];
 		}
+
+		/* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
+		 * so the depth comparison value isn't clamped for Z16 and
+		 * Z24 anymore. Do it manually here.
+		 *
+		 * It's unnecessary if the original texture format was
+		 * Z32_FLOAT, but we don't know that here.
+		 */
+		if (ctx->screen->b.chip_class == VI)
+			z = radeon_llvm_saturate(bld_base, z);
+
+		address[count++] = z;
 	}
 
 	/* Pack user derivatives */
 	if (opcode == TGSI_OPCODE_TXD) {
 		int param, num_src_deriv_channels;
 
 		switch (target) {
 		case TGSI_TEXTURE_3D:
 			num_src_deriv_channels = 3;
 			num_deriv_channels = 3;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index ad65fc2..b23749c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -679,20 +679,23 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 /*
  * inferred state between framebuffer and rasterizer
  */
 static void si_update_poly_offset_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
 	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
 		return;
 
+	/* Use the user format, not db_render_format, so that the polygon
+	 * offset behaves as expected by applications.
+	 */
 	switch (sctx->framebuffer.state.zsbuf->texture->format) {
 	case PIPE_FORMAT_Z16_UNORM:
 		si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
 		break;
 	default: /* 24-bit */
 		si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
 		break;
 	case PIPE_FORMAT_Z32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
@@ -2133,32 +2136,32 @@ static void si_init_depth_surface(struct si_context *sctx,
 				  struct r600_surface *surf)
 {
 	struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
 	unsigned level = surf->base.u.tex.level;
 	struct radeon_surf_level *levelinfo = &rtex->surface.level[level];
 	unsigned format;
 	uint32_t z_info, s_info, db_depth_info;
 	uint64_t z_offs, s_offs;
 	uint32_t db_htile_data_base, db_htile_surface;
 
-	format = si_translate_dbformat(rtex->resource.b.b.format);
+	format = si_translate_dbformat(rtex->db_render_format);
 
 	if (format == V_028040_Z_INVALID) {
 		R600_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
 	}
 	assert(format != V_028040_Z_INVALID);
 
 	s_offs = z_offs = rtex->resource.gpu_address;
 	z_offs += rtex->surface.level[level].offset;
 	s_offs += rtex->surface.stencil_level[level].offset;
 
-	db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
+	db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
 
 	z_info = S_028040_FORMAT(format);
 	if (rtex->resource.b.b.nr_samples > 1) {
 		z_info |= S_028040_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples));
 	}
 
 	if (rtex->surface.flags & RADEON_SURF_SBUFFER)
 		s_info = S_028044_FORMAT(V_028044_STENCIL_8);
 	else
 		s_info = S_028044_FORMAT(V_028044_STENCIL_INVALID);
@@ -2201,27 +2204,51 @@ static void si_init_depth_surface(struct si_context *sctx,
 			 * uses. Problem was reproduced on Verde, Bonaire,
 			 * Tonga, and Carrizo.
 			 *
 			 * Disabling EXPCLEAR works around the problem.
 			 *
 			 * Check piglit's arb_texture_multisample-stencil-clear
 			 * test if you want to try changing this.
 			 */
 			if (rtex->resource.b.b.nr_samples <= 1)
 				s_info |= S_028044_ALLOW_EXPCLEAR(1);
-		} else
-			/* Use all of the htile_buffer for depth if there's no stencil. */
+		} else if (!rtex->tc_compatible_htile) {
+			/* Use all of the htile_buffer for depth if there's no stencil.
+			 * This must not be set when TC-compatible HTILE is enabled
+			 * due to a hw bug.
+			 */
 			s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+		}
 
 		uint64_t va = rtex->htile_buffer->gpu_address;
 		db_htile_data_base = va >> 8;
 		db_htile_surface = S_028ABC_FULL_CACHE(1);
+
+		if (rtex->tc_compatible_htile) {
+			db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+			switch (rtex->resource.b.b.nr_samples) {
+			case 0:
+			case 1:
+				z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+				break;
+			case 2:
+			case 4:
+				z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+				break;
+			case 8:
+				z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+				break;
+			default:
+				assert(0);
+			}
+		}
 	} else {
 		db_htile_data_base = 0;
 		db_htile_surface = 0;
 	}
 
 	assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
 
 	surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
 			      S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
 	surf->db_htile_data_base = db_htile_data_base;
@@ -2349,20 +2376,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (rtex->dcc_gather_statistics) {
 			/* Dirty tracking must be enabled for DCC usage analysis. */
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 			vi_separate_dcc_start_query(ctx, rtex);
 		}
 	}
 
 	if (state->zsbuf) {
 		surf = (struct r600_surface*)state->zsbuf;
+		rtex = (struct r600_texture*)surf->base.texture;
 
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
 		}
 		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 
 	si_update_poly_offset_state(sctx);
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
@@ -3014,20 +3042,23 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 		 */
 		if (tmp->flushed_depth_texture->resource.b.b.format != tmp->resource.b.b.format)
 			pipe_format = tmp->flushed_depth_texture->resource.b.b.format;
 
 		tmp = tmp->flushed_depth_texture;
 	}
 
 	surflevel = tmp->surface.level;
 
 	if (tmp->db_compatible) {
+		if (!view->is_stencil_sampler)
+			pipe_format = tmp->db_render_format;
+
 		switch (pipe_format) {
 		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 			pipe_format = PIPE_FORMAT_Z32_FLOAT;
 			break;
 		case PIPE_FORMAT_X8Z24_UNORM:
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 			/* Z24 is always stored like this for DB
 			 * compatibility.
 			 */
 			pipe_format = PIPE_FORMAT_Z24X8_UNORM;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index c14e852..d18137b 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1111,21 +1111,22 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	     sctx->b.family == CHIP_FIJI) &&
 	    r600_get_strmout_en(&sctx->b)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
 	}
 
 	/* Set the depth buffer as dirty. */
 	if (sctx->framebuffer.state.zsbuf) {
 		struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
 		struct r600_texture *rtex = (struct r600_texture *)surf->texture;
 
-		rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+		if (!rtex->tc_compatible_htile)
+			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
 
 		if (rtex->surface.flags & RADEON_SURF_SBUFFER)
 			rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
 	}
 	if (sctx->framebuffer.compressed_cb_mask) {
 		struct pipe_surface *surf;
 		struct r600_texture *rtex;
 		unsigned mask = sctx->framebuffer.compressed_cb_mask;
 
 		do {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 8bfea45..1bf07a7 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -130,20 +130,21 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
       regValue.pMacroTileConfig = NULL;
       regValue.noOfMacroEntries = 0;
    } else {
       regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
       regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode);
    }
 
    createFlags.value = 0;
    createFlags.useTileIndex = 1;
    createFlags.degradeBaseLevel = 1;
+   createFlags.useHtileSliceAlign = 1;
 
    addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
    addrCreateInput.chipFamily = ws->family;
    addrCreateInput.chipRevision = ws->rev_id;
    addrCreateInput.createFlags = createFlags;
    addrCreateInput.callbacks.allocSysMem = allocSysMem;
    addrCreateInput.callbacks.freeSysMem = freeSysMem;
    addrCreateInput.callbacks.debugPrint = 0;
    addrCreateInput.regValue = regValue;
 
@@ -153,21 +154,23 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
 
    return addrCreateOutput.hLib;
 }
 
 static int compute_level(struct amdgpu_winsys *ws,
                          struct radeon_surf *surf, bool is_stencil,
                          unsigned level, unsigned type, bool compressed,
                          ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
                          ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
                          ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
+                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+                         ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+                         ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
 {
    struct radeon_surf_level *surf_level;
    ADDR_E_RETURNCODE ret;
 
    AddrSurfInfoIn->mipLevel = level;
    AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
    AddrSurfInfoIn->height = u_minify(surf->npix_y, level);
 
    if (type == RADEON_SURF_TYPE_3D)
       AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
@@ -250,20 +253,46 @@ static int compute_level(struct amdgpu_winsys *ws,
 
       if (ret == ADDR_OK) {
          surf_level->dcc_offset = surf->dcc_size;
          surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
          surf_level->dcc_enabled = true;
          surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
          surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
       }
    }
 
+   /* TC-compatible HTILE. */
+   if (!is_stencil &&
+       AddrSurfInfoIn->flags.depth &&
+       AddrSurfInfoIn->flags.tcCompatible &&
+       surf_level->mode == RADEON_SURF_MODE_2D &&
+       level == 0) {
+      AddrHtileIn->flags.tcCompatible = 1;
+      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+      AddrHtileIn->height = AddrSurfInfoOut->height;
+      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeHtileInfo(ws->addrlib,
+                                 AddrHtileIn,
+                                 AddrHtileOut);
+
+      if (ret == ADDR_OK) {
+         surf->htile_size = AddrHtileOut->htileBytes;
+         surf->htile_alignment = AddrHtileOut->baseAlign;
+      }
+   }
+
    return 0;
 }
 
 #define   G_009910_MICRO_TILE_MODE(x)          (((x) >> 0) & 0x03)
 #define   G_009910_MICRO_TILE_MODE_NEW(x)      (((x) >> 22) & 0x07)
 
 static void set_micro_tile_mode(struct radeon_surf *surf,
                                 struct radeon_info *info)
 {
    uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]];
@@ -277,32 +306,36 @@ static void set_micro_tile_mode(struct radeon_surf *surf,
 static int amdgpu_surface_init(struct radeon_winsys *rws,
                                struct radeon_surf *surf)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
    unsigned level, mode, type;
    bool compressed;
    ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
    ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
    ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
    ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
+   ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
+   ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
    ADDR_TILEINFO AddrTileInfoIn = {0};
    ADDR_TILEINFO AddrTileInfoOut = {0};
    int r;
 
    r = amdgpu_surface_sanity(surf);
    if (r)
       return r;
 
    AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
    AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
    AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
    AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
+   AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
+   AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
    AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
 
    type = RADEON_SURF_GET(surf->flags, TYPE);
    mode = RADEON_SURF_GET(surf->flags, MODE);
    compressed = surf->blk_w == 4 && surf->blk_h == 4;
 
    /* MSAA and FMASK require 2D tiling. */
    if (surf->nsamples > 1 ||
        (surf->flags & RADEON_SURF_FMASK))
       mode = RADEON_SURF_MODE_2D;
@@ -354,21 +387,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
       AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
    else
       AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
 
    AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
    AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
    AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
    AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
    AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
-   AddrSurfInfoIn.flags.degrade4Space = 1;
+   AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
+
+   /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
+    * requested, because TC-compatible HTILE requires 2D tiling.
+    */
+   AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible;
 
    /* DCC notes:
     * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
     *   with samples >= 4.
     * - Mipmapped array textures have low performance (discovered by a closed
     *   driver team).
     */
    AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI &&
                                         !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
                                         !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
@@ -436,25 +474,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
          if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
             AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
          else
             AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
       }
    }
 
    surf->bo_size = 0;
    surf->dcc_size = 0;
    surf->dcc_alignment = 1;
+   surf->htile_size = 0;
+   surf->htile_alignment = 1;
 
    /* Calculate texture layout information. */
    for (level = 0; level <= surf->last_level; level++) {
       r = compute_level(ws, surf, false, level, type, compressed,
-                        &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                        &AddrSurfInfoIn, &AddrSurfInfoOut,
+                        &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
       if (r)
          return r;
 
       if (level == 0) {
          surf->bo_alignment = AddrSurfInfoOut.baseAlign;
          surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
          set_micro_tile_mode(surf, &ws->info);
 
          /* For 2D modes only. */
          if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
@@ -468,26 +509,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
             surf->macro_tile_index = 0;
          }
       }
    }
 
    /* Calculate texture layout information for stencil. */
    if (surf->flags & RADEON_SURF_SBUFFER) {
       AddrSurfInfoIn.bpp = 8;
       AddrSurfInfoIn.flags.depth = 0;
       AddrSurfInfoIn.flags.stencil = 1;
+      AddrSurfInfoIn.flags.tcCompatible = 0;
       /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
       AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
 
       for (level = 0; level <= surf->last_level; level++) {
          r = compute_level(ws, surf, true, level, type, compressed,
-                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut,
+                           NULL, NULL);
          if (r)
             return r;
 
          /* DB uses the depth pitch for both stencil and depth. */
          if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x)
             surf->stencil_adjusted = true;
 
          if (level == 0) {
             /* For 2D modes only. */
             if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
@@ -501,20 +544,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    /* Recalculate the whole DCC miptree size including disabled levels.
     * This is what addrlib does, but calling addrlib would be a lot more
     * complicated.
     */
    if (surf->dcc_size && surf->last_level > 0) {
       surf->dcc_size = align64(surf->bo_size >> 8,
                                ws->info.pipe_interleave_bytes *
                                ws->info.num_tile_pipes);
    }
 
+   /* Make sure HTILE covers the whole miptree, because the shader reads
+    * TC-compatible HTILE even for levels where it's disabled by DB.
+    */
+   if (surf->htile_size && surf->last_level)
+      surf->htile_size *= 2;
+
    return 0;
 }
 
 static int amdgpu_surface_best(struct radeon_winsys *rws,
                                struct radeon_surf *surf)
 {
    return 0;
 }
 
 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)
-- 
2.7.4



More information about the mesa-dev mailing list