[Mesa-dev] [PATCH 3/3] radeonsi: use a fragment shader blit instead of DB->CB copy for ZS CPU mappings
Dieter Nützel
Dieter at nuetzel-hh.de
Sun Jun 23 22:17:59 UTC 2019
For the series:
Tested-by: Dieter Nützel <Dieter at nuetzel-hh.de>
(tested on Polaris 20, openSUSE Tumbleweed, KDE Plasma 5)
Dieter
Am 21.06.2019 19:02, schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak at amd.com>
>
> This mainly removes and simplifies code that is no longer needed.
>
> There were some issues with the DB->CB stencil copy on gfx10, so let's
> just use a fragment shader blit for all ZS mappings. It's more
> reliable.
> ---
> src/gallium/drivers/radeonsi/si_blit.c | 29 +---
> src/gallium/drivers/radeonsi/si_pipe.h | 9 +-
> src/gallium/drivers/radeonsi/si_state.c | 2 +-
> src/gallium/drivers/radeonsi/si_texture.c | 166 +++++++---------------
> 4 files changed, 52 insertions(+), 154 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c
> b/src/gallium/drivers/radeonsi/si_blit.c
> index 5806342cca9..638f2ee4d24 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -173,45 +173,20 @@ si_blit_dbcb_copy(struct si_context *sctx,
> }
>
> sctx->decompression_enabled = false;
> sctx->dbcb_depth_copy_enabled = false;
> sctx->dbcb_stencil_copy_enabled = false;
> si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
>
> return fully_copied_levels;
> }
>
> -void si_blit_decompress_depth(struct pipe_context *ctx,
> - struct si_texture *texture,
> - struct si_texture *staging,
> - unsigned first_level, unsigned last_level,
> - unsigned first_layer, unsigned last_layer,
> - unsigned first_sample, unsigned last_sample)
> -{
> - const struct util_format_description *desc;
> - unsigned planes = 0;
> -
> - assert(staging != NULL && "use si_blit_decompress_zs_in_place
> instead");
> -
> - desc = util_format_description(staging->buffer.b.b.format);
> -
> - if (util_format_has_depth(desc))
> - planes |= PIPE_MASK_Z;
> - if (util_format_has_stencil(desc))
> - planes |= PIPE_MASK_S;
> -
> - si_blit_dbcb_copy(
> - (struct si_context *)ctx, texture, staging, planes,
> - u_bit_consecutive(first_level, last_level - first_level + 1),
> - first_layer, last_layer, first_sample, last_sample);
> -}
> -
> /* Helper function for si_blit_decompress_zs_in_place.
> */
> static void
> si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
> struct si_texture *texture,
> unsigned planes, unsigned level_mask,
> unsigned first_layer, unsigned last_layer)
> {
> struct pipe_surface *zsurf, surf_tmpl = {{0}};
> unsigned layer, max_layer, checked_last_layer;
> @@ -348,21 +323,21 @@ si_decompress_depth(struct si_context *sctx,
> u_log_printf(sctx->log,
> "\n------------------------------------------------\n"
> "Decompress Depth (levels %u - %u, levels Z: 0x%x S:
> 0x%x)\n\n",
> first_level, last_level, levels_z, levels_s);
>
> /* We may have to allocate the flushed texture here when called from
> * si_decompress_subresource.
> */
> if (copy_planes &&
> (tex->flushed_depth_texture ||
> - si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b,
> NULL))) {
> + si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
> struct si_texture *dst = tex->flushed_depth_texture;
> unsigned fully_copied_levels;
> unsigned levels = 0;
>
> assert(tex->flushed_depth_texture);
>
> if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
> copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
>
> if (copy_planes & PIPE_MASK_Z) {
> @@ -1242,21 +1217,21 @@ static void si_blit(struct pipe_context *ctx,
> assert(util_blitter_is_blit_supported(sctx->blitter, info));
>
> /* The driver doesn't decompress resources automatically while
> * u_blitter is rendering. */
> vi_disable_dcc_if_incompatible_format(sctx, info->src.resource,
> info->src.level,
> info->src.format);
> vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource,
> info->dst.level,
> info->dst.format);
> - si_decompress_subresource(ctx, info->src.resource, info->mask,
> + si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS,
> info->src.level,
> info->src.box.z,
> info->src.box.z + info->src.box.depth - 1);
>
> if (sctx->screen->debug_flags & DBG(FORCE_DMA) &&
> util_try_blit_via_copy_region(ctx, info))
> return;
>
> si_blitter_begin(sctx, SI_BLIT |
> (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> index 11678e1b4cb..8512c27b2cd 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1202,26 +1202,20 @@ void si_blitter_end(struct si_context *sctx);
> void si_init_blit_functions(struct si_context *sctx);
> void si_decompress_textures(struct si_context *sctx, unsigned
> shader_mask);
> void si_resource_copy_region(struct pipe_context *ctx,
> struct pipe_resource *dst,
> unsigned dst_level,
> unsigned dstx, unsigned dsty, unsigned dstz,
> struct pipe_resource *src,
> unsigned src_level,
> const struct pipe_box *src_box);
> void si_decompress_dcc(struct si_context *sctx, struct si_texture
> *tex);
> -void si_blit_decompress_depth(struct pipe_context *ctx,
> - struct si_texture *texture,
> - struct si_texture *staging,
> - unsigned first_level, unsigned last_level,
> - unsigned first_layer, unsigned last_layer,
> - unsigned first_sample, unsigned last_sample);
>
> /* si_buffer.c */
> bool si_rings_is_buffer_referenced(struct si_context *sctx,
> struct pb_buffer *buf,
> enum radeon_bo_usage usage);
> void *si_buffer_map_sync_with_rings(struct si_context *sctx,
> struct si_resource *resource,
> unsigned usage);
> void si_init_resource_fields(struct si_screen *sscreen,
> struct si_resource *res,
> @@ -1445,22 +1439,21 @@ bool si_prepare_for_dma_blit(struct si_context
> *sctx,
> unsigned dst_level, unsigned dstx,
> unsigned dsty, unsigned dstz,
> struct si_texture *src,
> unsigned src_level,
> const struct pipe_box *src_box);
> void si_eliminate_fast_color_clear(struct si_context *sctx,
> struct si_texture *tex);
> void si_texture_discard_cmask(struct si_screen *sscreen,
> struct si_texture *tex);
> bool si_init_flushed_depth_texture(struct pipe_context *ctx,
> - struct pipe_resource *texture,
> - struct si_texture **staging);
> + struct pipe_resource *texture);
> void si_print_texture_info(struct si_screen *sscreen,
> struct si_texture *tex, struct u_log_context *log);
> struct pipe_resource *si_texture_create(struct pipe_screen *screen,
> const struct pipe_resource *templ);
> bool vi_dcc_formats_compatible(enum pipe_format format1,
> enum pipe_format format2);
> bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
> unsigned level,
> enum pipe_format view_format);
> void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
> diff --git a/src/gallium/drivers/radeonsi/si_state.c
> b/src/gallium/drivers/radeonsi/si_state.c
> index b9fc77f7918..3996d280470 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -4120,21 +4120,21 @@ si_create_sampler_view_custom(struct
> pipe_context *ctx,
> state->target == PIPE_TEXTURE_RECT ||
> state->target == PIPE_TEXTURE_CUBE)
> last_layer = state->u.tex.first_layer;
>
> /* Texturing with separate depth and stencil. */
> pipe_format = state->format;
>
> /* Depth/stencil texturing sometimes needs separate texture. */
> if (tex->is_depth && !si_can_sample_zs(tex,
> view->is_stencil_sampler)) {
> if (!tex->flushed_depth_texture &&
> - !si_init_flushed_depth_texture(ctx, texture, NULL)) {
> + !si_init_flushed_depth_texture(ctx, texture)) {
> pipe_resource_reference(&view->base.texture, NULL);
> FREE(view);
> return NULL;
> }
>
> assert(tex->flushed_depth_texture);
>
> /* Override format for the case where the flushed texture
> * contains only Z or only S.
> */
> diff --git a/src/gallium/drivers/radeonsi/si_texture.c
> b/src/gallium/drivers/radeonsi/si_texture.c
> index dd383635675..be2562c45b4 100644
> --- a/src/gallium/drivers/radeonsi/si_texture.c
> +++ b/src/gallium/drivers/radeonsi/si_texture.c
> @@ -121,59 +121,58 @@ static void si_copy_region_with_blit(struct
> pipe_context *pipe,
> blit.src.box = *src_box;
> blit.dst.resource = dst;
> blit.dst.format = dst->format;
> blit.dst.level = dst_level;
> blit.dst.box.x = dstx;
> blit.dst.box.y = dsty;
> blit.dst.box.z = dstz;
> blit.dst.box.width = src_box->width;
> blit.dst.box.height = src_box->height;
> blit.dst.box.depth = src_box->depth;
> - blit.mask = util_format_get_mask(src->format) &
> - util_format_get_mask(dst->format);
> + blit.mask = util_format_get_mask(dst->format);
> blit.filter = PIPE_TEX_FILTER_NEAREST;
>
> if (blit.mask) {
> pipe->blit(pipe, &blit);
> }
> }
>
> /* Copy from a full GPU texture to a transfer's staging one. */
> static void si_copy_to_staging_texture(struct pipe_context *ctx,
> struct si_transfer *stransfer)
> {
> struct si_context *sctx = (struct si_context*)ctx;
> struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
> struct pipe_resource *dst = &stransfer->staging->b.b;
> struct pipe_resource *src = transfer->resource;
>
> - if (src->nr_samples > 1) {
> + if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) {
> si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
> src, transfer->level, &transfer->box);
> return;
> }
>
> sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
> &transfer->box);
> }
>
> /* Copy from a transfer's staging texture to a full GPU one. */
> static void si_copy_from_staging_texture(struct pipe_context *ctx,
> struct si_transfer *stransfer)
> {
> struct si_context *sctx = (struct si_context*)ctx;
> struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
> struct pipe_resource *dst = transfer->resource;
> struct pipe_resource *src = &stransfer->staging->b.b;
> struct pipe_box sbox;
>
> u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height,
> transfer->box.depth, &sbox);
>
> - if (dst->nr_samples > 1) {
> + if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) {
> si_copy_region_with_blit(ctx, dst, transfer->level,
> transfer->box.x, transfer->box.y, transfer->box.z,
> src, 0, &sbox);
> return;
> }
>
> if (util_format_is_compressed(dst->format)) {
> sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
> sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
> }
> @@ -1707,80 +1706,71 @@ static struct pipe_resource
> *si_texture_from_handle(struct pipe_screen *screen,
> sscreen->info.max_alignment,
> &stride, &offset);
> if (!buf)
> return NULL;
>
> return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
> offset, usage, true);
> }
>
> bool si_init_flushed_depth_texture(struct pipe_context *ctx,
> - struct pipe_resource *texture,
> - struct si_texture **staging)
> + struct pipe_resource *texture)
> {
> struct si_texture *tex = (struct si_texture*)texture;
> struct pipe_resource resource;
> - struct si_texture **flushed_depth_texture = staging ?
> - staging : &tex->flushed_depth_texture;
> enum pipe_format pipe_format = texture->format;
>
> - if (!staging) {
> - if (tex->flushed_depth_texture)
> - return true; /* it's ready */
> -
> - if (!tex->can_sample_z && tex->can_sample_s) {
> - switch (pipe_format) {
> - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> - /* Save memory by not allocating the S plane. */
> - pipe_format = PIPE_FORMAT_Z32_FLOAT;
> - break;
> - case PIPE_FORMAT_Z24_UNORM_S8_UINT:
> - case PIPE_FORMAT_S8_UINT_Z24_UNORM:
> - /* Save memory bandwidth by not copying the
> - * stencil part during flush.
> - *
> - * This potentially increases memory bandwidth
> - * if an application uses both Z and S texturing
> - * simultaneously (a flushed Z24S8 texture
> - * would be stored compactly), but how often
> - * does that really happen?
> - */
> - pipe_format = PIPE_FORMAT_Z24X8_UNORM;
> - break;
> - default:;
> - }
> - } else if (!tex->can_sample_s && tex->can_sample_z) {
> - assert(util_format_has_stencil(util_format_description(pipe_format)));
> -
> - /* DB->CB copies to an 8bpp surface don't work. */
> - pipe_format = PIPE_FORMAT_X24S8_UINT;
> + assert(!tex->flushed_depth_texture);
> +
> + if (!tex->can_sample_z && tex->can_sample_s) {
> + switch (pipe_format) {
> + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> + /* Save memory by not allocating the S plane. */
> + pipe_format = PIPE_FORMAT_Z32_FLOAT;
> + break;
> + case PIPE_FORMAT_Z24_UNORM_S8_UINT:
> + case PIPE_FORMAT_S8_UINT_Z24_UNORM:
> + /* Save memory bandwidth by not copying the
> + * stencil part during flush.
> + *
> + * This potentially increases memory bandwidth
> + * if an application uses both Z and S texturing
> + * simultaneously (a flushed Z24S8 texture
> + * would be stored compactly), but how often
> + * does that really happen?
> + */
> + pipe_format = PIPE_FORMAT_Z24X8_UNORM;
> + break;
> + default:;
> }
> + } else if (!tex->can_sample_s && tex->can_sample_z) {
> + assert(util_format_has_stencil(util_format_description(pipe_format)));
> +
> + /* DB->CB copies to an 8bpp surface don't work. */
> + pipe_format = PIPE_FORMAT_X24S8_UINT;
> }
>
> memset(&resource, 0, sizeof(resource));
> resource.target = texture->target;
> resource.format = pipe_format;
> resource.width0 = texture->width0;
> resource.height0 = texture->height0;
> resource.depth0 = texture->depth0;
> resource.array_size = texture->array_size;
> resource.last_level = texture->last_level;
> resource.nr_samples = texture->nr_samples;
> - resource.usage = staging ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
> + resource.usage = PIPE_USAGE_DEFAULT;
> resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
> resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
>
> - if (staging)
> - resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
> -
> - *flushed_depth_texture = (struct si_texture
> *)ctx->screen->resource_create(ctx->screen, &resource);
> - if (*flushed_depth_texture == NULL) {
> + tex->flushed_depth_texture = (struct si_texture
> *)ctx->screen->resource_create(ctx->screen, &resource);
> + if (!tex->flushed_depth_texture) {
> PRINT_ERR("failed to create temporary texture to hold flushed
> depth\n");
> return false;
> }
> return true;
> }
>
> /**
> * Initialize the pipe_resource descriptor to be of the same size as
> the box,
> * which is supposed to hold a subregion of the texture "orig" at the
> given
> * mipmap level.
> @@ -1873,22 +1863,24 @@ static void *si_texture_transfer_map(struct
> pipe_context *ctx,
> struct si_texture *tex = (struct si_texture*)texture;
> struct si_transfer *trans;
> struct si_resource *buf;
> unsigned offset = 0;
> char *map;
> bool use_staging_texture = false;
>
> assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
> assert(box->width && box->height && box->depth);
>
> - /* Depth textures use staging unconditionally. */
> - if (!tex->is_depth) {
> + if (tex->is_depth) {
> + /* Depth textures use staging unconditionally. */
> + use_staging_texture = true;
> + } else {
> /* Degrade the tile mode if we get too many transfers on APUs.
> * On dGPUs, the staging texture is always faster.
> * Only count uploads that are at least 4x4 pixels large.
> */
> if (!sctx->screen->info.has_dedicated_vram &&
> level == 0 &&
> box->width >= 4 && box->height >= 4 &&
> p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
> bool can_invalidate =
> si_can_invalidate_texture(sctx->screen, tex,
> @@ -1929,90 +1921,36 @@ static void *si_texture_transfer_map(struct
> pipe_context *ctx,
> }
>
> trans = CALLOC_STRUCT(si_transfer);
> if (!trans)
> return NULL;
> pipe_resource_reference(&trans->b.b.resource, texture);
> trans->b.b.level = level;
> trans->b.b.usage = usage;
> trans->b.b.box = *box;
>
> - if (tex->is_depth) {
> - struct si_texture *staging_depth;
> -
> - if (tex->buffer.b.b.nr_samples > 1) {
> - /* MSAA depth buffers need to be converted to single sample
> buffers.
> - *
> - * Mapping MSAA depth buffers can occur if ReadPixels is called
> - * with a multisample GLX visual.
> - *
> - * First downsample the depth buffer to a temporary texture,
> - * then decompress the temporary one to staging.
> - *
> - * Only the region being mapped is transfered.
> - */
> - struct pipe_resource resource;
> -
> - si_init_temp_resource_from_box(&resource, texture, box, level, 0);
> -
> - if (!si_init_flushed_depth_texture(ctx, &resource, &staging_depth))
> {
> - PRINT_ERR("failed to create temporary texture to hold untiled
> copy\n");
> - goto fail_trans;
> - }
> -
> - if (usage & PIPE_TRANSFER_READ) {
> - struct pipe_resource *temp =
> ctx->screen->resource_create(ctx->screen, &resource);
> - if (!temp) {
> - PRINT_ERR("failed to create a temporary depth texture\n");
> - goto fail_trans;
> - }
> -
> - si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level,
> box);
> - si_blit_decompress_depth(ctx, (struct si_texture*)temp,
> staging_depth,
> - 0, 0, 0, box->depth, 0, 0);
> - pipe_resource_reference(&temp, NULL);
> - }
> -
> - /* Just get the strides. */
> - si_texture_get_offset(sctx->screen, staging_depth, level, NULL,
> - &trans->b.b.stride,
> - &trans->b.b.layer_stride);
> - } else {
> - /* XXX: only readback the rectangle which is being mapped? */
> - /* XXX: when discard is true, no need to read back from depth
> texture */
> - if (!si_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
> - PRINT_ERR("failed to create temporary texture to hold untiled
> copy\n");
> - goto fail_trans;
> - }
> -
> - si_blit_decompress_depth(ctx, tex, staging_depth,
> - level, level,
> - box->z, box->z + box->depth - 1,
> - 0, 0);
> -
> - offset = si_texture_get_offset(sctx->screen, staging_depth,
> - level, box,
> - &trans->b.b.stride,
> - &trans->b.b.layer_stride);
> - }
> -
> - trans->staging = &staging_depth->buffer;
> - buf = trans->staging;
> - } else if (use_staging_texture) {
> + if (use_staging_texture) {
> struct pipe_resource resource;
> struct si_texture *staging;
>
> si_init_temp_resource_from_box(&resource, texture, box, level,
> SI_RESOURCE_FLAG_TRANSFER);
> resource.usage = (usage & PIPE_TRANSFER_READ) ?
> PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
>
> + /* Since depth-stencil textures don't support linear tiling,
> + * blit from ZS to color and vice versa. u_blitter will do
> + * the packing for these formats.
> + */
> + if (tex->is_depth)
> + resource.format =
> util_blitter_get_color_format_for_zs(resource.format);
> +
> /* Create the temporary texture. */
> staging = (struct
> si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
> if (!staging) {
> PRINT_ERR("failed to create temporary texture to hold untiled
> copy\n");
> goto fail_trans;
> }
> trans->staging = &staging->buffer;
>
> /* Just get the strides. */
> si_texture_get_offset(sctx->screen, staging, 0, NULL,
> @@ -2063,30 +2001,22 @@ static void si_texture_transfer_unmap(struct
> pipe_context *ctx,
> /* Always unmap texture CPU mappings on 32-bit architectures, so that
> * we don't run out of the CPU address space.
> */
> if (sizeof(void*) == 4) {
> struct si_resource *buf =
> stransfer->staging ? stransfer->staging : &tex->buffer;
>
> sctx->ws->buffer_unmap(buf->buf);
> }
>
> - if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) {
> - if (tex->is_depth && tex->buffer.b.b.nr_samples <= 1) {
> - ctx->resource_copy_region(ctx, texture, transfer->level,
> - transfer->box.x, transfer->box.y, transfer->box.z,
> - &stransfer->staging->b.b, transfer->level,
> - &transfer->box);
> - } else {
> - si_copy_from_staging_texture(ctx, stransfer);
> - }
> - }
> + if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
> + si_copy_from_staging_texture(ctx, stransfer);
>
> if (stransfer->staging) {
> sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
> si_resource_reference(&stransfer->staging, NULL);
> }
>
> /* Heuristic for {upload, draw, upload, draw, ..}:
> *
> * Flush the gfx IB if we've allocated too much texture storage.
> *
More information about the mesa-dev mailing list