[Mesa-dev] [PATCH] radeonsi: use CP DMA for the null const buffer clear on CIK
Marek Olšák
maraeo at gmail.com
Tue Apr 23 20:16:15 UTC 2019
No, the correct backport is attached.
Marek
On Tue, Apr 23, 2019 at 2:51 PM Dylan Baker <dylan at pnwbakers.com> wrote:
> Hi Marek and Samuel,
>
> I've staged this for 19.0, but I had to fix some very minor rebase
> conflicts.
> I'm getting ready to make a release, could one of you take a peak at the
> tip of
> the staging/19.0 branch and let me know if what I did looks okay?
>
> Thanks,
> Dylan
>
> Quoting Samuel Pitoiset (2019-04-16 08:24:01)
> > I don't have much context for that issue, so:
> >
> > Acked-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> >
> > On 4/12/19 10:15 PM, Marek Ol\u0161 k wrote:
> >
> > Done locally.
> >
> > Marek
> >
> > On Fri, Apr 12, 2019 at 12:20 PM Samuel Pitoiset <
> samuel.pitoiset at gmail.com
> > > wrote:
> >
> > I would suggest to document that workaround somewhere in the
> code.
> >
> > On 4/12/19 5:17 PM, Marek Ol\u0161 k wrote:
> > > From: Marek Ol\u0161 k <marek.olsak at amd.com>
> > >
> > > This is a workaround for a thread deadlock that I have no idea
> > > why it occurs.
> > >
> > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
> > > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
> > > ---
> > > src/gallium/drivers/radeonsi/si_clear.c | 6 +++---
> > > src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
> > > src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
> > > src/gallium/drivers/radeonsi/si_pipe.h | 3 ++-
> > > src/gallium/drivers/radeonsi/si_test_dma.c | 2 +-
> > > 5 files changed, 12 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/src/gallium/drivers/radeonsi/si_clear.c
> b/src/gallium/
> > drivers/radeonsi/si_clear.c
> > > index e1805f2a1c9..ead680b857b 100644
> > > --- a/src/gallium/drivers/radeonsi/si_clear.c
> > > +++ b/src/gallium/drivers/radeonsi/si_clear.c
> > > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context
> > *sctx,
> > > * would be more efficient than separate
> per-layer
> > clear operations.
> > > */
> > > assert(tex->buffer.b.b.nr_storage_samples <= 2 ||
> > num_layers == 1);
> > >
> > > dcc_offset += tex->surface.u.legacy.level
> > [level].dcc_offset;
> > > clear_size = tex->surface.u.legacy.level
> > [level].dcc_fast_clear_size *
> > > num_layers;
> > > }
> > >
> > > si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
> > > - &clear_value, 4, SI_COHERENCY_CB_META);
> > > + &clear_value, 4, SI_COHERENCY_CB_META,
> false);
> > > }
> > >
> > > /* Set the same micro tile mode as the destination of the
> last MSAA
> > resolve.
> > > * This allows hitting the MSAA resolve fast path, which
> requires
> > that both
> > > * src and dst micro tile modes match.
> > > */
> > > static void si_set_optimal_micro_tile_mode(struct si_screen
> > *sscreen,
> > > struct si_texture
> *tex)
> > > {
> > > if (tex->buffer.b.is_shared ||
> > > @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct
> > si_context *sctx,
> > >
> > > /* DCC fast clear with MSAA should clear
> CMASK
> > to 0xC. */
> > > if (tex->buffer.b.b.nr_samples >= 2 &&
> tex->
> > cmask_buffer) {
> > > /* TODO: This doesn't work with
> MSAA. *
> > /
> > > if (eliminate_needed)
> > > continue;
> > >
> > > uint32_t clear_value =
> 0xCCCCCCCC;
> > > si_clear_buffer(sctx, &tex->
> > cmask_buffer->b.b,
> > >
> tex->cmask_offset,
> > tex->surface.cmask_size,
> > > - &clear_value, 4,
> > SI_COHERENCY_CB_META);
> > > + &clear_value, 4,
> > SI_COHERENCY_CB_META, false);
> > > fmask_decompress_needed = true;
> > > }
> > >
> > > vi_dcc_clear_level(sctx, tex, 0,
> reset_value);
> > > tex->separate_dcc_dirty = true;
> > > } else {
> > > if (too_small)
> > > continue;
> > >
> > > /* 128-bit formats are unusupported */
> > > @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct
> > si_context *sctx,
> > >
> > > /* ensure CMASK is enabled */
> > > si_alloc_separate_cmask(sctx->screen,
> tex);
> > > if (!tex->cmask_buffer)
> > > continue;
> > >
> > > /* Do the fast clear. */
> > > uint32_t clear_value = 0;
> > > si_clear_buffer(sctx,
> &tex->cmask_buffer->b.b,
> > > tex->cmask_offset, tex->
> > surface.cmask_size,
> > > - &clear_value, 4,
> > SI_COHERENCY_CB_META);
> > > + &clear_value, 4,
> > SI_COHERENCY_CB_META, false);
> > > eliminate_needed = true;
> > > }
> > >
> > > if ((eliminate_needed ||
> fmask_decompress_needed) &&
> > > !(tex->dirty_level_mask & (1 << level))) {
> > > tex->dirty_level_mask |= 1 << level;
> > > p_atomic_inc(&sctx->screen->
> > compressed_colortex_counter);
> > > }
> > >
> > > /* We can change the micro tile mode before a
> full
> > clear. */
> > > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
> b/src/
> > gallium/drivers/radeonsi/si_compute_blit.c
> > > index 1abeac6adb0..fb0d8d2f1b6 100644
> > > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> > > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> > > @@ -179,21 +179,22 @@ static void
> si_compute_do_clear_or_copy(struct
> > si_context *sctx,
> > >
> > > /* Restore states. */
> > > ctx->bind_compute_state(ctx, saved_cs);
> > > ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src
> ? 2 :
> > 1, saved_sb,
> > > saved_writable_mask);
> > > si_compute_internal_end(sctx);
> > > }
> > >
> > > void si_clear_buffer(struct si_context *sctx, struct
> pipe_resource
> > *dst,
> > > uint64_t offset, uint64_t size, uint32_t
> > *clear_value,
> > > - uint32_t clear_value_size, enum si_coherency
> > coher)
> > > + uint32_t clear_value_size, enum si_coherency
> > coher,
> > > + bool force_cpdma)
> > > {
> > > if (!size)
> > > return;
> > >
> > > unsigned clear_alignment = MIN2(clear_value_size, 4);
> > >
> > > assert(clear_value_size != 3 && clear_value_size != 6);
> /* 12
> > is allowed. */
> > > assert(offset % clear_alignment == 0);
> > > assert(size % clear_alignment == 0);
> > > assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit
> sizes in
> > all codepaths */
> > > @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context
> *sctx,
> > struct pipe_resource *dst,
> > > return;
> > > }
> > >
> > > uint64_t aligned_size = size & ~3ull;
> > > if (aligned_size >= 4) {
> > > /* Before GFX9, CP DMA was very slow when
> clearing GTT,
> > so never
> > > * use CP DMA clears on those chips, because we
> can't
> > be certain
> > > * about buffer placements.
> > > */
> > > if (clear_value_size > 4 ||
> > > - (clear_value_size == 4 &&
> > > + (!force_cpdma &&
> > > + clear_value_size == 4 &&
> > > offset % 4 == 0 &&
> > > (size > 32*1024 || sctx->chip_class <=
> VI))) {
> > > si_compute_do_clear_or_copy(sctx, dst,
> offset,
> > NULL, 0,
> > > aligned_size,
> > clear_value,
> > >
> clear_value_size,
> > coher);
> > > } else {
> > > assert(clear_value_size == 4);
> > > si_cp_dma_clear_buffer(sctx,
> sctx->gfx_cs, dst,
> > offset,
> > > aligned_size,
> > *clear_value, 0, coher,
> > >
> get_cache_policy(sctx,
> > coher, size));
> > > @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context
> *sctx,
> > struct pipe_resource *dst,
> > > }
> > > }
> > >
> > > static void si_pipe_clear_buffer(struct pipe_context *ctx,
> > > struct pipe_resource *dst,
> > > unsigned offset, unsigned size,
> > > const void *clear_value,
> > > int clear_value_size)
> > > {
> > > si_clear_buffer((struct si_context*)ctx, dst, offset,
> size,
> > (uint32_t*)clear_value,
> > > - clear_value_size, SI_COHERENCY_SHADER);
> > > + clear_value_size, SI_COHERENCY_SHADER,
> false);
> > > }
> > >
> > > void si_copy_buffer(struct si_context *sctx,
> > > struct pipe_resource *dst, struct
> pipe_resource
> > *src,
> > > uint64_t dst_offset, uint64_t src_offset,
> unsigned
> > size)
> > > {
> > > if (!size)
> > > return;
> > >
> > > enum si_coherency coher = SI_COHERENCY_SHADER;
> > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
> b/src/gallium/
> > drivers/radeonsi/si_pipe.c
> > > index 5caeb575623..5d376e6181a 100644
> > > --- a/src/gallium/drivers/radeonsi/si_pipe.c
> > > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> > > @@ -634,21 +634,21 @@ static struct pipe_context
> *si_create_context
> > (struct pipe_screen *screen,
> > > sizeof(sctx->sample_positions), &sctx->
> > sample_positions);
> > >
> > > /* this must be last */
> > > si_begin_new_gfx_cs(sctx);
> > >
> > > if (sctx->chip_class == CIK) {
> > > /* Clear the NULL constant buffer, because loads
> should
> > return zeros. */
> > > uint32_t clear_value = 0;
> > > si_clear_buffer(sctx,
> sctx->null_const_buf.buffer, 0,
> > >
> sctx->null_const_buf.buffer->width0,
> > > - &clear_value, 4,
> SI_COHERENCY_SHADER);
> > > + &clear_value, 4,
> SI_COHERENCY_SHADER,
> > true);
> > > }
> > > return &sctx->b;
> > > fail:
> > > fprintf(stderr, "radeonsi: Failed to create a
> context.\n");
> > > si_destroy_context(&sctx->b);
> > > return NULL;
> > > }
> > >
> > > static struct pipe_context *si_pipe_create_context(struct
> > pipe_screen *screen,
> > > void *priv,
> unsigned
> > flags)
> > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/
> > drivers/radeonsi/si_pipe.h
> > > index 301d38649bf..aaa95f32d20 100644
> > > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> > > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> > > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum
> pipe_format
> > format);
> > > void vi_dcc_clear_level(struct si_context *sctx,
> > > struct si_texture *tex,
> > > unsigned level, unsigned clear_value);
> > > void si_init_clear_functions(struct si_context *sctx);
> > >
> > > /* si_compute_blit.c */
> > > unsigned si_get_flush_flags(struct si_context *sctx, enum
> > si_coherency coher,
> > > enum si_cache_policy cache_policy);
> > > void si_clear_buffer(struct si_context *sctx, struct
> pipe_resource
> > *dst,
> > > uint64_t offset, uint64_t size, uint32_t
> > *clear_value,
> > > - uint32_t clear_value_size, enum si_coherency
> > coher);
> > > + uint32_t clear_value_size, enum si_coherency
> > coher,
> > > + bool force_cpdma);
> > > void si_copy_buffer(struct si_context *sctx,
> > > struct pipe_resource *dst, struct
> pipe_resource
> > *src,
> > > uint64_t dst_offset, uint64_t src_offset,
> unsigned
> > size);
> > > void si_compute_copy_image(struct si_context *sctx,
> > > struct pipe_resource *dst,
> > > unsigned dst_level,
> > > struct pipe_resource *src,
> > > unsigned src_level,
> > > unsigned dstx, unsigned dsty,
> unsigned dstz,
> > > const struct pipe_box *src_box);
> > > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c
> b/src/gallium
> > /drivers/radeonsi/si_test_dma.c
> > > index 90a2032cd80..7e396e671be 100644
> > > --- a/src/gallium/drivers/radeonsi/si_test_dma.c
> > > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c
> > > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen
> *sscreen)
> > > tsrc.width0, tsrc.height0,
> tsrc.array_size,
> > > array_mode_to_string(sscreen,
> &ssrc->surface),
> > bpp);
> > > fflush(stdout);
> > >
> > > /* set src pixels */
> > > set_random_pixels(ctx, src, &src_cpu);
> > >
> > > /* clear dst pixels */
> > > uint32_t zero = 0;
> > > si_clear_buffer(sctx, dst, 0,
> sdst->surface.surf_size,
> > &zero, 4,
> > > - SI_COHERENCY_SHADER);
> > > + SI_COHERENCY_SHADER, false);
> > > memset(dst_cpu.ptr, 0, dst_cpu.layer_stride *
> > tdst.array_size);
> > >
> > > /* preparation */
> > > max_width = MIN2(tsrc.width0, tdst.width0);
> > > max_height = MIN2(tsrc.height0, tdst.height0);
> > > max_depth = MIN2(tsrc.array_size,
> tdst.array_size);
> > >
> > > num = do_partial_copies ? num_partial_copies : 1;
> > > for (j = 0; j < num; j++) {
> > > int width, height, depth;
> >
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190423/eb4bb6ea/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-radeonsi-use-CP-DMA-for-the-null-const-buffer-clear-.patch
Type: text/x-patch
Size: 5874 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190423/eb4bb6ea/attachment-0001.bin>
More information about the mesa-dev
mailing list