[Mesa-dev] [PATCH] radeonsi: use CP DMA for the null const buffer clear on CIK
Samuel Pitoiset
samuel.pitoiset at gmail.com
Tue Apr 23 19:02:02 UTC 2019
Looks good to me.
On 4/23/19 8:51 PM, Dylan Baker wrote:
> Hi Marek and Samuel,
>
> I've staged this for 19.0, but I had to fix some very minor rebase conflicts.
> I'm getting ready to make a release, could one of you take a peak at the tip of
> the staging/19.0 branch and let me know if what I did looks okay?
>
> Thanks,
> Dylan
>
> Quoting Samuel Pitoiset (2019-04-16 08:24:01)
>> I don't have much context for that issue, so:
>>
>> Acked-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>>
>> On 4/12/19 10:15 PM, Marek Ol\u0161 k wrote:
>>
>> Done locally.
>>
>> Marek
>>
>> On Fri, Apr 12, 2019 at 12:20 PM Samuel Pitoiset <samuel.pitoiset at gmail.com
>> > wrote:
>>
>> I would suggest to document that workaround somewhere in the code.
>>
>> On 4/12/19 5:17 PM, Marek Ol\u0161 k wrote:
>> > From: Marek Ol\u0161 k <marek.olsak at amd.com>
>> >
>> > This is a workaround for a thread deadlock that I have no idea
>> > why it occurs.
>> >
>> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
>> > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
>> > ---
>> > src/gallium/drivers/radeonsi/si_clear.c | 6 +++---
>> > src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
>> > src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
>> > src/gallium/drivers/radeonsi/si_pipe.h | 3 ++-
>> > src/gallium/drivers/radeonsi/si_test_dma.c | 2 +-
>> > 5 files changed, 12 insertions(+), 9 deletions(-)
>> >
>> > diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/
>> drivers/radeonsi/si_clear.c
>> > index e1805f2a1c9..ead680b857b 100644
>> > --- a/src/gallium/drivers/radeonsi/si_clear.c
>> > +++ b/src/gallium/drivers/radeonsi/si_clear.c
>> > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context
>> *sctx,
>> > * would be more efficient than separate per-layer
>> clear operations.
>> > */
>> > assert(tex->buffer.b.b.nr_storage_samples <= 2 ||
>> num_layers == 1);
>> >
>> > dcc_offset += tex->surface.u.legacy.level
>> [level].dcc_offset;
>> > clear_size = tex->surface.u.legacy.level
>> [level].dcc_fast_clear_size *
>> > num_layers;
>> > }
>> >
>> > si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
>> > - &clear_value, 4, SI_COHERENCY_CB_META);
>> > + &clear_value, 4, SI_COHERENCY_CB_META, false);
>> > }
>> >
>> > /* Set the same micro tile mode as the destination of the last MSAA
>> resolve.
>> > * This allows hitting the MSAA resolve fast path, which requires
>> that both
>> > * src and dst micro tile modes match.
>> > */
>> > static void si_set_optimal_micro_tile_mode(struct si_screen
>> *sscreen,
>> > struct si_texture *tex)
>> > {
>> > if (tex->buffer.b.is_shared ||
>> > @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct
>> si_context *sctx,
>> >
>> > /* DCC fast clear with MSAA should clear CMASK
>> to 0xC. */
>> > if (tex->buffer.b.b.nr_samples >= 2 && tex->
>> cmask_buffer) {
>> > /* TODO: This doesn't work with MSAA. *
>> /
>> > if (eliminate_needed)
>> > continue;
>> >
>> > uint32_t clear_value = 0xCCCCCCCC;
>> > si_clear_buffer(sctx, &tex->
>> cmask_buffer->b.b,
>> > tex->cmask_offset,
>> tex->surface.cmask_size,
>> > - &clear_value, 4,
>> SI_COHERENCY_CB_META);
>> > + &clear_value, 4,
>> SI_COHERENCY_CB_META, false);
>> > fmask_decompress_needed = true;
>> > }
>> >
>> > vi_dcc_clear_level(sctx, tex, 0, reset_value);
>> > tex->separate_dcc_dirty = true;
>> > } else {
>> > if (too_small)
>> > continue;
>> >
>> > /* 128-bit formats are unusupported */
>> > @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct
>> si_context *sctx,
>> >
>> > /* ensure CMASK is enabled */
>> > si_alloc_separate_cmask(sctx->screen, tex);
>> > if (!tex->cmask_buffer)
>> > continue;
>> >
>> > /* Do the fast clear. */
>> > uint32_t clear_value = 0;
>> > si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
>> > tex->cmask_offset, tex->
>> surface.cmask_size,
>> > - &clear_value, 4,
>> SI_COHERENCY_CB_META);
>> > + &clear_value, 4,
>> SI_COHERENCY_CB_META, false);
>> > eliminate_needed = true;
>> > }
>> >
>> > if ((eliminate_needed || fmask_decompress_needed) &&
>> > !(tex->dirty_level_mask & (1 << level))) {
>> > tex->dirty_level_mask |= 1 << level;
>> > p_atomic_inc(&sctx->screen->
>> compressed_colortex_counter);
>> > }
>> >
>> > /* We can change the micro tile mode before a full
>> clear. */
>> > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/
>> gallium/drivers/radeonsi/si_compute_blit.c
>> > index 1abeac6adb0..fb0d8d2f1b6 100644
>> > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
>> > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
>> > @@ -179,21 +179,22 @@ static void si_compute_do_clear_or_copy(struct
>> si_context *sctx,
>> >
>> > /* Restore states. */
>> > ctx->bind_compute_state(ctx, saved_cs);
>> > ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 :
>> 1, saved_sb,
>> > saved_writable_mask);
>> > si_compute_internal_end(sctx);
>> > }
>> >
>> > void si_clear_buffer(struct si_context *sctx, struct pipe_resource
>> *dst,
>> > uint64_t offset, uint64_t size, uint32_t
>> *clear_value,
>> > - uint32_t clear_value_size, enum si_coherency
>> coher)
>> > + uint32_t clear_value_size, enum si_coherency
>> coher,
>> > + bool force_cpdma)
>> > {
>> > if (!size)
>> > return;
>> >
>> > unsigned clear_alignment = MIN2(clear_value_size, 4);
>> >
>> > assert(clear_value_size != 3 && clear_value_size != 6); /* 12
>> is allowed. */
>> > assert(offset % clear_alignment == 0);
>> > assert(size % clear_alignment == 0);
>> > assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in
>> all codepaths */
>> > @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context *sctx,
>> struct pipe_resource *dst,
>> > return;
>> > }
>> >
>> > uint64_t aligned_size = size & ~3ull;
>> > if (aligned_size >= 4) {
>> > /* Before GFX9, CP DMA was very slow when clearing GTT,
>> so never
>> > * use CP DMA clears on those chips, because we can't
>> be certain
>> > * about buffer placements.
>> > */
>> > if (clear_value_size > 4 ||
>> > - (clear_value_size == 4 &&
>> > + (!force_cpdma &&
>> > + clear_value_size == 4 &&
>> > offset % 4 == 0 &&
>> > (size > 32*1024 || sctx->chip_class <= VI))) {
>> > si_compute_do_clear_or_copy(sctx, dst, offset,
>> NULL, 0,
>> > aligned_size,
>> clear_value,
>> > clear_value_size,
>> coher);
>> > } else {
>> > assert(clear_value_size == 4);
>> > si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
>> offset,
>> > aligned_size,
>> *clear_value, 0, coher,
>> > get_cache_policy(sctx,
>> coher, size));
>> > @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context *sctx,
>> struct pipe_resource *dst,
>> > }
>> > }
>> >
>> > static void si_pipe_clear_buffer(struct pipe_context *ctx,
>> > struct pipe_resource *dst,
>> > unsigned offset, unsigned size,
>> > const void *clear_value,
>> > int clear_value_size)
>> > {
>> > si_clear_buffer((struct si_context*)ctx, dst, offset, size,
>> (uint32_t*)clear_value,
>> > - clear_value_size, SI_COHERENCY_SHADER);
>> > + clear_value_size, SI_COHERENCY_SHADER, false);
>> > }
>> >
>> > void si_copy_buffer(struct si_context *sctx,
>> > struct pipe_resource *dst, struct pipe_resource
>> *src,
>> > uint64_t dst_offset, uint64_t src_offset, unsigned
>> size)
>> > {
>> > if (!size)
>> > return;
>> >
>> > enum si_coherency coher = SI_COHERENCY_SHADER;
>> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/
>> drivers/radeonsi/si_pipe.c
>> > index 5caeb575623..5d376e6181a 100644
>> > --- a/src/gallium/drivers/radeonsi/si_pipe.c
>> > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>> > @@ -634,21 +634,21 @@ static struct pipe_context *si_create_context
>> (struct pipe_screen *screen,
>> > sizeof(sctx->sample_positions), &sctx->
>> sample_positions);
>> >
>> > /* this must be last */
>> > si_begin_new_gfx_cs(sctx);
>> >
>> > if (sctx->chip_class == CIK) {
>> > /* Clear the NULL constant buffer, because loads should
>> return zeros. */
>> > uint32_t clear_value = 0;
>> > si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
>> > sctx->null_const_buf.buffer->width0,
>> > - &clear_value, 4, SI_COHERENCY_SHADER);
>> > + &clear_value, 4, SI_COHERENCY_SHADER,
>> true);
>> > }
>> > return &sctx->b;
>> > fail:
>> > fprintf(stderr, "radeonsi: Failed to create a context.\n");
>> > si_destroy_context(&sctx->b);
>> > return NULL;
>> > }
>> >
>> > static struct pipe_context *si_pipe_create_context(struct
>> pipe_screen *screen,
>> > void *priv, unsigned
>> flags)
>> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/
>> drivers/radeonsi/si_pipe.h
>> > index 301d38649bf..aaa95f32d20 100644
>> > --- a/src/gallium/drivers/radeonsi/si_pipe.h
>> > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>> > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum pipe_format
>> format);
>> > void vi_dcc_clear_level(struct si_context *sctx,
>> > struct si_texture *tex,
>> > unsigned level, unsigned clear_value);
>> > void si_init_clear_functions(struct si_context *sctx);
>> >
>> > /* si_compute_blit.c */
>> > unsigned si_get_flush_flags(struct si_context *sctx, enum
>> si_coherency coher,
>> > enum si_cache_policy cache_policy);
>> > void si_clear_buffer(struct si_context *sctx, struct pipe_resource
>> *dst,
>> > uint64_t offset, uint64_t size, uint32_t
>> *clear_value,
>> > - uint32_t clear_value_size, enum si_coherency
>> coher);
>> > + uint32_t clear_value_size, enum si_coherency
>> coher,
>> > + bool force_cpdma);
>> > void si_copy_buffer(struct si_context *sctx,
>> > struct pipe_resource *dst, struct pipe_resource
>> *src,
>> > uint64_t dst_offset, uint64_t src_offset, unsigned
>> size);
>> > void si_compute_copy_image(struct si_context *sctx,
>> > struct pipe_resource *dst,
>> > unsigned dst_level,
>> > struct pipe_resource *src,
>> > unsigned src_level,
>> > unsigned dstx, unsigned dsty, unsigned dstz,
>> > const struct pipe_box *src_box);
>> > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium
>> /drivers/radeonsi/si_test_dma.c
>> > index 90a2032cd80..7e396e671be 100644
>> > --- a/src/gallium/drivers/radeonsi/si_test_dma.c
>> > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c
>> > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen *sscreen)
>> > tsrc.width0, tsrc.height0, tsrc.array_size,
>> > array_mode_to_string(sscreen, &ssrc->surface),
>> bpp);
>> > fflush(stdout);
>> >
>> > /* set src pixels */
>> > set_random_pixels(ctx, src, &src_cpu);
>> >
>> > /* clear dst pixels */
>> > uint32_t zero = 0;
>> > si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size,
>> &zero, 4,
>> > - SI_COHERENCY_SHADER);
>> > + SI_COHERENCY_SHADER, false);
>> > memset(dst_cpu.ptr, 0, dst_cpu.layer_stride *
>> tdst.array_size);
>> >
>> > /* preparation */
>> > max_width = MIN2(tsrc.width0, tdst.width0);
>> > max_height = MIN2(tsrc.height0, tdst.height0);
>> > max_depth = MIN2(tsrc.array_size, tdst.array_size);
>> >
>> > num = do_partial_copies ? num_partial_copies : 1;
>> > for (j = 0; j < num; j++) {
>> > int width, height, depth;
>>
More information about the mesa-dev
mailing list