[Mesa-dev] [PATCH] radeonsi: implement TC-compatible HTILE
Nicolai Hähnle
nhaehnle at gmail.com
Thu Oct 13 15:48:23 UTC 2016
On 12.10.2016 15:54, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> so that decompress blits aren't needed and depth texturing needs less
> memory bandwidth.
>
> Z16 and Z24 are promoted to Z32_FLOAT by the driver, because TC-compatible
> HTILE only supports Z32_FLOAT. This doubles memory footprint for Z16.
> The format promotion is not visible to state trackers.
>
> This is part of TC-compatible renderbuffer compression, which has 3 parts:
> DCC, HTILE, FMASK. Only TC-compatible FMASK compression is missing now.
>
> I don't see a measurable increase in performance though.
>
> (I tested Talos Principle and DiRT: Showdown. The latter is improved by
> 0.5%, which is almost noise, and it originally used layered Z16,
> so at least we know that Z16 promoted to Z32F isn't slower now.)
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> ---
> src/gallium/drivers/radeon/r600_pipe_common.h | 3 ++
> src/gallium/drivers/radeon/r600_texture.c | 67 ++++++++++++++++++++++----
> src/gallium/drivers/radeon/radeon_winsys.h | 4 ++
> src/gallium/drivers/radeonsi/si_blit.c | 11 ++++-
> src/gallium/drivers/radeonsi/si_descriptors.c | 7 ++-
> src/gallium/drivers/radeonsi/si_shader.c | 18 ++++++-
> src/gallium/drivers/radeonsi/si_state.c | 39 +++++++++++++--
> src/gallium/drivers/radeonsi/si_state_draw.c | 3 +-
> src/gallium/winsys/amdgpu/drm/amdgpu_surface.c | 57 ++++++++++++++++++++--
> 9 files changed, 185 insertions(+), 24 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 290b228..5cfcad6 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -238,27 +238,29 @@ struct r600_cmask_info {
> unsigned yalign;
> unsigned slice_tile_max;
> unsigned base_address_reg;
> };
>
> struct r600_htile_info {
> unsigned pitch;
> unsigned height;
> unsigned xalign;
> unsigned yalign;
> + unsigned alignment;
> };
>
> struct r600_texture {
> struct r600_resource resource;
>
> uint64_t size;
> unsigned num_level0_transfers;
> + enum pipe_format db_render_format;
> bool is_depth;
> bool db_compatible;
> bool can_sample_z;
> bool can_sample_s;
> unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */
> unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
> struct r600_texture *flushed_depth_texture;
> struct radeon_surf surface;
>
> /* Colorbuffer compression and fast clear. */
> @@ -266,20 +268,21 @@ struct r600_texture {
> struct r600_cmask_info cmask;
> struct r600_resource *cmask_buffer;
> uint64_t dcc_offset; /* 0 = disabled */
> unsigned cb_color_info; /* fast clear enable bit */
> unsigned color_clear_value[2];
> unsigned last_msaa_resolve_target_micro_mode;
>
> /* Depth buffer compression and fast clear. */
> struct r600_htile_info htile;
> struct r600_resource *htile_buffer;
> + bool tc_compatible_htile;
> bool depth_cleared; /* if it was cleared at least once */
> float depth_clear_value;
> bool stencil_cleared; /* if it was cleared at least once */
> uint8_t stencil_clear_value;
>
> bool non_disp_tiling; /* R600-Cayman only */
>
> /* Whether the texture is a displayable back buffer and needs DCC
> * decompression, which is expensive. Therefore, it's enabled only
> * if statistics suggest that it will pay off and it's allocated
> diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
> index 57cdbcf..625d091 100644
> --- a/src/gallium/drivers/radeon/r600_texture.c
> +++ b/src/gallium/drivers/radeon/r600_texture.c
> @@ -185,21 +185,22 @@ static unsigned r600_texture_get_offset(struct r600_texture *rtex, unsigned leve
> return rtex->surface.level[level].offset +
> box->z * rtex->surface.level[level].slice_size +
> box->y / util_format_get_blockheight(format) * rtex->surface.level[level].pitch_bytes +
> box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
> }
>
> static int r600_init_surface(struct r600_common_screen *rscreen,
> struct radeon_surf *surface,
> const struct pipe_resource *ptex,
> unsigned array_mode,
> - bool is_flushed_depth)
> + bool is_flushed_depth,
> + bool tc_compatible_htile)
> {
> const struct util_format_description *desc =
> util_format_description(ptex->format);
> bool is_depth, is_stencil;
>
> is_depth = util_format_has_depth(desc);
> is_stencil = util_format_has_stencil(desc);
>
> surface->npix_x = ptex->width0;
> surface->npix_y = ptex->height0;
> @@ -249,25 +250,36 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
> surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_CUBEMAP, TYPE);
> break;
> case PIPE_BUFFER:
> default:
> return -EINVAL;
> }
>
> if (!is_flushed_depth && is_depth) {
> surface->flags |= RADEON_SURF_ZBUFFER;
>
> + if (tc_compatible_htile &&
> + array_mode == RADEON_SURF_MODE_2D) {
> + /* TC-compatible HTILE only supports Z32_FLOAT.
> + * Promote Z16 to Z32. DB->CB copies will convert
> + * the format for transfers.
> + */
> + surface->bpe = 4;
> + surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
> + }
> +
> if (is_stencil) {
> surface->flags |= RADEON_SURF_SBUFFER |
> RADEON_SURF_HAS_SBUFFER_MIPTREE;
> }
> }
> +
> if (rscreen->chip_class >= SI) {
> surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
> }
>
> if (rscreen->chip_class >= VI &&
> (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC ||
> ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT))
> surface->flags |= RADEON_SURF_DISABLE_DCC;
>
> if (ptex->bind & PIPE_BIND_SCANOUT) {
> @@ -897,43 +909,57 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
> slice_elements = (width * height) / (8 * 8);
> slice_bytes = slice_elements * 4;
>
> pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
> base_align = num_pipes * pipe_interleave_bytes;
>
> rtex->htile.pitch = width;
> rtex->htile.height = height;
> rtex->htile.xalign = cl_width * 8;
> rtex->htile.yalign = cl_height * 8;
> + rtex->htile.alignment = base_align;
>
> return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
> align(slice_bytes, base_align);
> }
>
> static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
> struct r600_texture *rtex)
> {
> - unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
> + uint64_t htile_size, alignment;
> + uint32_t clear_value;
> +
> + if (rtex->tc_compatible_htile) {
> + htile_size = rtex->surface.htile_size;
> + alignment = rtex->surface.htile_alignment;
> + clear_value = 0x0000030F;
> + } else {
> + htile_size = r600_texture_get_htile_size(rscreen, rtex);
> + alignment = rtex->htile.alignment;
> + clear_value = 0;
> + }
>
> if (!htile_size)
> return;
>
> rtex->htile_buffer = (struct r600_resource*)
> - pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
> - PIPE_USAGE_DEFAULT, htile_size);
> + r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
> + PIPE_USAGE_DEFAULT,
> + htile_size, alignment);
> if (rtex->htile_buffer == NULL) {
> /* this is not a fatal error as we can still keep rendering
> * without htile buffer */
> R600_ERR("Failed to create buffer object for htile buffer.\n");
> } else {
> - r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
> - htile_size, 0, R600_COHERENCY_NONE);
> + r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
> + 0, htile_size, clear_value,
> + R600_COHERENCY_NONE);
> }
> }
>
> void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
> {
> int i;
>
> fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
> "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
> "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
> @@ -960,24 +986,25 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
>
> if (rtex->cmask.size)
> fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, "
> "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
> rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
> rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign,
> rtex->cmask.yalign, rtex->cmask.slice_tile_max);
>
> if (rtex->htile_buffer)
> fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
> - "xalign=%u, yalign=%u\n",
> + "xalign=%u, yalign=%u, TC_compatible = %u\n",
> rtex->htile_buffer->b.b.width0,
> rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
> - rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign);
> + rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
> + rtex->tc_compatible_htile);
>
> if (rtex->dcc_offset) {
> fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
> rtex->dcc_offset, rtex->surface.dcc_size,
> rtex->surface.dcc_alignment);
> for (i = 0; i <= rtex->surface.last_level; i++)
> fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", "
> "fast_clear_size=%"PRIu64"\n",
> i, rtex->surface.level[i].dcc_enabled,
> rtex->surface.level[i].dcc_offset,
> @@ -1047,20 +1074,30 @@ r600_texture_create_object(struct pipe_screen *screen,
>
> /* don't include stencil-only formats which we don't support for rendering */
> rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
>
> rtex->surface = *surface;
> if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
> FREE(rtex);
> return NULL;
> }
>
> + rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
> + assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
> + rtex->tc_compatible_htile);
> +
> + /* TC-compatible HTILE only supports Z32_FLOAT. */
> + if (rtex->tc_compatible_htile)
> + rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
> + else
> + rtex->db_render_format = base->format;
> +
> /* Tiled depth textures utilize the non-displayable tile order.
> * This must be done after r600_setup_surface.
> * Applies to R600-Cayman. */
> rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
> /* Applies to GCN. */
> rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
>
> /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
> * between frames, so the only thing that can enable separate DCC
> * with DRI2 is multiple slow clears within a frame.
> @@ -1234,25 +1271,34 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
>
> /* The allocator will switch to 1D if needed. */
> return RADEON_SURF_MODE_2D;
> }
>
> struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
> const struct pipe_resource *templ)
> {
> struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
> struct radeon_surf surface = {0};
> + bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
> + bool tc_compatible_htile =
> + rscreen->chip_class >= VI &&
> + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
> + !(rscreen->debug_flags & DBG_NO_HYPERZ) &&
> + !is_flushed_depth &&
> + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
> + util_format_is_depth_or_stencil(templ->format);
> +
> int r;
>
> r = r600_init_surface(rscreen, &surface, templ,
> r600_choose_tiling(rscreen, templ),
> - templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
> + is_flushed_depth, tc_compatible_htile);
> if (r) {
> return NULL;
> }
> r = rscreen->ws->surface_best(rscreen->ws, &surface);
> if (r) {
> return NULL;
> }
> return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
> 0, NULL, &surface);
> }
> @@ -1289,21 +1335,22 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
> surface.mtilea = metadata.mtilea;
> surface.num_banks = metadata.num_banks;
>
> if (metadata.macrotile == RADEON_LAYOUT_TILED)
> array_mode = RADEON_SURF_MODE_2D;
> else if (metadata.microtile == RADEON_LAYOUT_TILED)
> array_mode = RADEON_SURF_MODE_1D;
> else
> array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
>
> - r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
> + r = r600_init_surface(rscreen, &surface, templ, array_mode,
> + false, false);
> if (r) {
> return NULL;
> }
>
> if (metadata.scanout)
> surface.flags |= RADEON_SURF_SCANOUT;
>
> rtex = r600_texture_create_object(screen, templ, stride,
> offset, buf, &surface);
> if (!rtex)
> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
> index 7146737..8946209 100644
> --- a/src/gallium/drivers/radeon/radeon_winsys.h
> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
> @@ -271,20 +271,21 @@ enum radeon_feature_id {
> #define RADEON_SURF_MODE_1D 2
> #define RADEON_SURF_MODE_2D 3
> #define RADEON_SURF_SCANOUT (1 << 16)
> #define RADEON_SURF_ZBUFFER (1 << 17)
> #define RADEON_SURF_SBUFFER (1 << 18)
> #define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
> #define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19)
> #define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20)
> #define RADEON_SURF_FMASK (1 << 21)
> #define RADEON_SURF_DISABLE_DCC (1 << 22)
> +#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23)
>
> #define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
> #define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
> #define RADEON_SURF_CLR(v, field) ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT))
>
> struct radeon_surf_level {
> uint64_t offset;
> uint64_t slice_size;
> uint32_t npix_x;
> uint32_t npix_y;
> @@ -337,20 +338,23 @@ struct radeon_surf {
> /* Whether the depth miptree or stencil miptree as used by the DB are
> * adjusted from their TC compatible form to ensure depth/stencil
> * compatibility. If either is true, the corresponding plane cannot be
> * sampled from.
> */
> bool depth_adjusted;
> bool stencil_adjusted;
>
> uint64_t dcc_size;
> uint64_t dcc_alignment;
> + /* TC-compatible HTILE only. */
> + uint64_t htile_size;
> + uint64_t htile_alignment;
> };
>
> struct radeon_bo_list_item {
> uint64_t bo_size;
> uint64_t vm_address;
> uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
> };
>
> struct radeon_winsys {
> /**
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
> index c143601..db41f56 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -325,20 +325,22 @@ si_flush_depth_texture(struct si_context *sctx,
> levels_s = level_mask & tex->stencil_dirty_level_mask;
>
> if (levels_s) {
> if (r600_can_sample_zs(tex, true))
> inplace_planes |= PIPE_MASK_S;
> else
> copy_planes |= PIPE_MASK_S;
> }
> }
>
> + assert(!tex->tc_compatible_htile || levels_z == 0);
> +
> /* We may have to allocate the flushed texture here when called from
> * si_decompress_subresource.
> */
> if (copy_planes &&
> (tex->flushed_depth_texture ||
> r600_init_flushed_depth_texture(&sctx->b.b, &tex->resource.b.b, NULL))) {
> struct r600_texture *dst = tex->flushed_depth_texture;
> unsigned fully_copied_levels;
> unsigned levels = 0;
>
> @@ -692,35 +694,40 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
> tex = (struct r600_texture *)fb->cbufs[i]->texture;
> if (tex->fmask.size == 0)
> tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
> }
> }
>
> if (zstex && zstex->htile_buffer &&
> zsbuf->u.tex.level == 0 &&
> zsbuf->u.tex.first_layer == 0 &&
> zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
> - if (buffers & PIPE_CLEAR_DEPTH) {
> + /* TC-compatible HTILE only supports depth clears to 0 or 1. */
> + if (buffers & PIPE_CLEAR_DEPTH &&
> + (!zstex->tc_compatible_htile ||
> + depth == 0 || depth == 1)) {
> /* Need to disable EXPCLEAR temporarily if clearing
> * to a new value. */
> if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
> sctx->db_depth_disable_expclear = true;
> }
>
> zstex->depth_clear_value = depth;
> sctx->framebuffer.dirty_zsbuf = true;
> si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
> sctx->db_depth_clear = true;
> si_mark_atom_dirty(sctx, &sctx->db_render_state);
> }
>
> - if (buffers & PIPE_CLEAR_STENCIL) {
> + /* TC-compatible HTILE only supports stencil clears to 0. */
> + if (buffers & PIPE_CLEAR_STENCIL &&
> + (!zstex->tc_compatible_htile || stencil == 0)) {
> stencil &= 0xff;
>
> /* Need to disable EXPCLEAR temporarily if clearing
> * to a new value. */
> if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
> sctx->db_stencil_disable_expclear = true;
> }
>
> zstex->stencil_clear_value = stencil;
> sctx->framebuffer.dirty_zsbuf = true;
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 350242a..19cae65 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -392,20 +392,23 @@ void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
> state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
> state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
> is_stencil));
> state[4] |= S_008F20_PITCH(pitch - 1);
>
> if (tex->dcc_offset && tex->surface.level[first_level].dcc_enabled) {
> state[6] |= S_008F28_COMPRESSION_EN(1);
> state[7] = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
> tex->dcc_offset +
> base_level_info->dcc_offset) >> 8;
> + } else if (tex->tc_compatible_htile) {
> + state[6] |= S_008F28_COMPRESSION_EN(1);
> + state[7] = tex->htile_buffer->gpu_address >> 8;
> }
> }
>
> static void si_set_sampler_view(struct si_context *sctx,
> unsigned shader,
> unsigned slot, struct pipe_sampler_view *view,
> bool disallow_early_out)
> {
> struct si_sampler_views *views = &sctx->samplers[shader].views;
> struct si_sampler_view *rview = (struct si_sampler_view*)view;
> @@ -501,22 +504,24 @@ static void si_set_sampler_views(struct pipe_context *ctx,
> samplers->compressed_colortex_mask &= ~(1u << slot);
> si_set_sampler_view(sctx, shader, slot, NULL, false);
> continue;
> }
>
> si_set_sampler_view(sctx, shader, slot, views[i], false);
>
> if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
> struct r600_texture *rtex =
> (struct r600_texture*)views[i]->texture;
> + struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
>
> - if (rtex->db_compatible) {
> + if (rtex->db_compatible &&
> + (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
> samplers->depth_texture_mask |= 1u << slot;
> } else {
> samplers->depth_texture_mask &= ~(1u << slot);
> }
> if (is_compressed_colortex(rtex)) {
> samplers->compressed_colortex_mask |= 1u << slot;
> } else {
> samplers->compressed_colortex_mask &= ~(1u << slot);
> }
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 49d4121..621d57f 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -4535,26 +4535,40 @@ static void tex_fetch_args(
> }
>
> /* Pack LOD bias value */
> if (opcode == TGSI_OPCODE_TXB)
> address[count++] = coords[3];
> if (opcode == TGSI_OPCODE_TXB2)
> address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
>
> /* Pack depth comparison value */
> if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
> + LLVMValueRef z;
> +
> if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
> - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
> + z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
> } else {
> assert(ref_pos >= 0);
> - address[count++] = coords[ref_pos];
> + z = coords[ref_pos];
> }
> +
> + /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
> + * so the depth comparison value isn't clamped for Z16 and
> + * Z24 anymore. Do it manually here.
> + *
> + * It's unnecessary if the original texture format was
> + * Z32_FLOAT, but we don't know that here.
> + */
> + if (ctx->screen->b.chip_class == VI)
> + z = radeon_llvm_saturate(bld_base, z);
> +
> + address[count++] = z;
> }
>
> /* Pack user derivatives */
> if (opcode == TGSI_OPCODE_TXD) {
> int param, num_src_deriv_channels;
>
> switch (target) {
> case TGSI_TEXTURE_3D:
> num_src_deriv_channels = 3;
> num_deriv_channels = 3;
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index ad65fc2..b23749c 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -679,20 +679,23 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
> /*
> * inferred state between framebuffer and rasterizer
> */
> static void si_update_poly_offset_state(struct si_context *sctx)
> {
> struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>
> if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
> return;
>
> + /* Use the user format, not db_render_format, so that the polygon
> + * offset behaves as expected by applications.
> + */
> switch (sctx->framebuffer.state.zsbuf->texture->format) {
> case PIPE_FORMAT_Z16_UNORM:
> si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
> break;
> default: /* 24-bit */
> si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
> break;
> case PIPE_FORMAT_Z32_FLOAT:
> case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
> @@ -2133,32 +2136,32 @@ static void si_init_depth_surface(struct si_context *sctx,
> struct r600_surface *surf)
> {
> struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
> unsigned level = surf->base.u.tex.level;
> struct radeon_surf_level *levelinfo = &rtex->surface.level[level];
> unsigned format;
> uint32_t z_info, s_info, db_depth_info;
> uint64_t z_offs, s_offs;
> uint32_t db_htile_data_base, db_htile_surface;
>
> - format = si_translate_dbformat(rtex->resource.b.b.format);
> + format = si_translate_dbformat(rtex->db_render_format);
>
> if (format == V_028040_Z_INVALID) {
> R600_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
> }
> assert(format != V_028040_Z_INVALID);
>
> s_offs = z_offs = rtex->resource.gpu_address;
> z_offs += rtex->surface.level[level].offset;
> s_offs += rtex->surface.stencil_level[level].offset;
>
> - db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
> + db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
>
> z_info = S_028040_FORMAT(format);
> if (rtex->resource.b.b.nr_samples > 1) {
> z_info |= S_028040_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples));
> }
>
> if (rtex->surface.flags & RADEON_SURF_SBUFFER)
> s_info = S_028044_FORMAT(V_028044_STENCIL_8);
> else
> s_info = S_028044_FORMAT(V_028044_STENCIL_INVALID);
> @@ -2201,27 +2204,51 @@ static void si_init_depth_surface(struct si_context *sctx,
> * uses. Problem was reproduced on Verde, Bonaire,
> * Tonga, and Carrizo.
> *
> * Disabling EXPCLEAR works around the problem.
> *
> * Check piglit's arb_texture_multisample-stencil-clear
> * test if you want to try changing this.
> */
> if (rtex->resource.b.b.nr_samples <= 1)
> s_info |= S_028044_ALLOW_EXPCLEAR(1);
> - } else
> - /* Use all of the htile_buffer for depth if there's no stencil. */
> + } else if (!rtex->tc_compatible_htile) {
> + /* Use all of the htile_buffer for depth if there's no stencil.
> + * This must not be set when TC-compatible HTILE is enabled
> + * due to a hw bug.
> + */
> s_info |= S_028044_TILE_STENCIL_DISABLE(1);
> + }
>
> uint64_t va = rtex->htile_buffer->gpu_address;
> db_htile_data_base = va >> 8;
> db_htile_surface = S_028ABC_FULL_CACHE(1);
> +
> + if (rtex->tc_compatible_htile) {
> + db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
> +
> + switch (rtex->resource.b.b.nr_samples) {
> + case 0:
> + case 1:
> + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
> + break;
> + case 2:
> + case 4:
> + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
> + break;
> + case 8:
> + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
> + break;
> + default:
> + assert(0);
> + }
> + }
> } else {
> db_htile_data_base = 0;
> db_htile_surface = 0;
> }
>
> assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
>
> surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
> S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
> surf->db_htile_data_base = db_htile_data_base;
> @@ -2349,20 +2376,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>
> if (rtex->dcc_gather_statistics) {
> /* Dirty tracking must be enabled for DCC usage analysis. */
> sctx->framebuffer.compressed_cb_mask |= 1 << i;
> vi_separate_dcc_start_query(ctx, rtex);
> }
> }
>
> if (state->zsbuf) {
> surf = (struct r600_surface*)state->zsbuf;
> + rtex = (struct r600_texture*)surf->base.texture;
>
> if (!surf->depth_initialized) {
> si_init_depth_surface(sctx, surf);
> }
> r600_context_add_resource_size(ctx, surf->base.texture);
> }
>
> si_update_poly_offset_state(sctx);
> si_mark_atom_dirty(sctx, &sctx->cb_render_state);
> si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
> @@ -3014,20 +3042,23 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
> */
> if (tmp->flushed_depth_texture->resource.b.b.format != tmp->resource.b.b.format)
> pipe_format = tmp->flushed_depth_texture->resource.b.b.format;
>
> tmp = tmp->flushed_depth_texture;
> }
>
> surflevel = tmp->surface.level;
>
> if (tmp->db_compatible) {
> + if (!view->is_stencil_sampler)
> + pipe_format = tmp->db_render_format;
> +
> switch (pipe_format) {
> case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> pipe_format = PIPE_FORMAT_Z32_FLOAT;
> break;
> case PIPE_FORMAT_X8Z24_UNORM:
> case PIPE_FORMAT_S8_UINT_Z24_UNORM:
> /* Z24 is always stored like this for DB
> * compatibility.
> */
> pipe_format = PIPE_FORMAT_Z24X8_UNORM;
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index c14e852..d18137b 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1111,21 +1111,22 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> sctx->b.family == CHIP_FIJI) &&
> r600_get_strmout_en(&sctx->b)) {
> sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
> }
>
> /* Set the depth buffer as dirty. */
> if (sctx->framebuffer.state.zsbuf) {
> struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
> struct r600_texture *rtex = (struct r600_texture *)surf->texture;
>
> - rtex->dirty_level_mask |= 1 << surf->u.tex.level;
> + if (!rtex->tc_compatible_htile)
> + rtex->dirty_level_mask |= 1 << surf->u.tex.level;
>
> if (rtex->surface.flags & RADEON_SURF_SBUFFER)
> rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
> }
> if (sctx->framebuffer.compressed_cb_mask) {
> struct pipe_surface *surf;
> struct r600_texture *rtex;
> unsigned mask = sctx->framebuffer.compressed_cb_mask;
>
> do {
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
> index 8bfea45..1bf07a7 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
> @@ -130,20 +130,21 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
> regValue.pMacroTileConfig = NULL;
> regValue.noOfMacroEntries = 0;
> } else {
> regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
> regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode);
> }
>
> createFlags.value = 0;
> createFlags.useTileIndex = 1;
> createFlags.degradeBaseLevel = 1;
> + createFlags.useHtileSliceAlign = 1;
>
> addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
> addrCreateInput.chipFamily = ws->family;
> addrCreateInput.chipRevision = ws->rev_id;
> addrCreateInput.createFlags = createFlags;
> addrCreateInput.callbacks.allocSysMem = allocSysMem;
> addrCreateInput.callbacks.freeSysMem = freeSysMem;
> addrCreateInput.callbacks.debugPrint = 0;
> addrCreateInput.regValue = regValue;
>
> @@ -153,21 +154,23 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
>
> return addrCreateOutput.hLib;
> }
>
> static int compute_level(struct amdgpu_winsys *ws,
> struct radeon_surf *surf, bool is_stencil,
> unsigned level, unsigned type, bool compressed,
> ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
> ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
> ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
> - ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
> + ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
> + ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
> + ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
> {
> struct radeon_surf_level *surf_level;
> ADDR_E_RETURNCODE ret;
>
> AddrSurfInfoIn->mipLevel = level;
> AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
> AddrSurfInfoIn->height = u_minify(surf->npix_y, level);
>
> if (type == RADEON_SURF_TYPE_3D)
> AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
> @@ -250,20 +253,46 @@ static int compute_level(struct amdgpu_winsys *ws,
>
> if (ret == ADDR_OK) {
> surf_level->dcc_offset = surf->dcc_size;
> surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
> surf_level->dcc_enabled = true;
> surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
> surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
> }
> }
>
> + /* TC-compatible HTILE. */
> + if (!is_stencil &&
> + AddrSurfInfoIn->flags.depth &&
> + AddrSurfInfoIn->flags.tcCompatible &&
> + surf_level->mode == RADEON_SURF_MODE_2D &&
> + level == 0) {
> + AddrHtileIn->flags.tcCompatible = 1;
> + AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
> + AddrHtileIn->height = AddrSurfInfoOut->height;
> + AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
> + AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
> + AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
> + AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
> + AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
> + AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
> +
> + ret = AddrComputeHtileInfo(ws->addrlib,
> + AddrHtileIn,
> + AddrHtileOut);
> +
> + if (ret == ADDR_OK) {
> + surf->htile_size = AddrHtileOut->htileBytes;
> + surf->htile_alignment = AddrHtileOut->baseAlign;
> + }
> + }
> +
> return 0;
> }
>
> #define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03)
> #define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07)
>
> static void set_micro_tile_mode(struct radeon_surf *surf,
> struct radeon_info *info)
> {
> uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]];
> @@ -277,32 +306,36 @@ static void set_micro_tile_mode(struct radeon_surf *surf,
> static int amdgpu_surface_init(struct radeon_winsys *rws,
> struct radeon_surf *surf)
> {
> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
> unsigned level, mode, type;
> bool compressed;
> ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
> ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
> ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
> ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
> + ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
> + ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
> ADDR_TILEINFO AddrTileInfoIn = {0};
> ADDR_TILEINFO AddrTileInfoOut = {0};
> int r;
>
> r = amdgpu_surface_sanity(surf);
> if (r)
> return r;
>
> AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
> AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
> AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
> AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
> + AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
> + AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
> AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
>
> type = RADEON_SURF_GET(surf->flags, TYPE);
> mode = RADEON_SURF_GET(surf->flags, MODE);
> compressed = surf->blk_w == 4 && surf->blk_h == 4;
>
> /* MSAA and FMASK require 2D tiling. */
> if (surf->nsamples > 1 ||
> (surf->flags & RADEON_SURF_FMASK))
> mode = RADEON_SURF_MODE_2D;
> @@ -354,21 +387,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
> else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
> AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
> else
> AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
>
> AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
> AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
> AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
> AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
> AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
> - AddrSurfInfoIn.flags.degrade4Space = 1;
> + AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
> +
> + /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
> + * requested, because TC-compatible HTILE requires 2D tiling.
> + */
> + AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible;
>
> /* DCC notes:
> * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
> * with samples >= 4.
> * - Mipmapped array textures have low performance (discovered by a closed
> * driver team).
> */
> AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI &&
> !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
> !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
> @@ -436,25 +474,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
> if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
> AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
> else
> AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
> }
> }
>
> surf->bo_size = 0;
> surf->dcc_size = 0;
> surf->dcc_alignment = 1;
> + surf->htile_size = 0;
> + surf->htile_alignment = 1;
>
> /* Calculate texture layout information. */
> for (level = 0; level <= surf->last_level; level++) {
> r = compute_level(ws, surf, false, level, type, compressed,
> - &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
> + &AddrSurfInfoIn, &AddrSurfInfoOut,
> + &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
> if (r)
> return r;
>
> if (level == 0) {
> surf->bo_alignment = AddrSurfInfoOut.baseAlign;
> surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
> set_micro_tile_mode(surf, &ws->info);
>
> /* For 2D modes only. */
> if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
> @@ -468,26 +509,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
> surf->macro_tile_index = 0;
> }
> }
> }
>
> /* Calculate texture layout information for stencil. */
> if (surf->flags & RADEON_SURF_SBUFFER) {
> AddrSurfInfoIn.bpp = 8;
> AddrSurfInfoIn.flags.depth = 0;
> AddrSurfInfoIn.flags.stencil = 1;
> + AddrSurfInfoIn.flags.tcCompatible = 0;
> /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
> AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
>
> for (level = 0; level <= surf->last_level; level++) {
> r = compute_level(ws, surf, true, level, type, compressed,
> - &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
> + &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut,
> + NULL, NULL);
> if (r)
> return r;
>
> /* DB uses the depth pitch for both stencil and depth. */
> if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x)
> surf->stencil_adjusted = true;
>
> if (level == 0) {
> /* For 2D modes only. */
> if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
> @@ -501,20 +544,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
> /* Recalculate the whole DCC miptree size including disabled levels.
> * This is what addrlib does, but calling addrlib would be a lot more
> * complicated.
> */
> if (surf->dcc_size && surf->last_level > 0) {
> surf->dcc_size = align64(surf->bo_size >> 8,
> ws->info.pipe_interleave_bytes *
> ws->info.num_tile_pipes);
> }
>
> + /* Make sure HTILE covers the whole miptree, because the shader reads
> + * TC-compatible HTILE even for levels where it's disabled by DB.
> + */
> + if (surf->htile_size && surf->last_level)
> + surf->htile_size *= 2;
> +
> return 0;
> }
>
> static int amdgpu_surface_best(struct radeon_winsys *rws,
> struct radeon_surf *surf)
> {
> return 0;
> }
>
> void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)
>
More information about the mesa-dev
mailing list