<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <p>I don't have much context for that issue, so:</p>
    <p>Acked-by: Samuel Pitoiset <a class="moz-txt-link-rfc2396E" href="mailto:samuel.pitoiset@gmail.com">&lt;samuel.pitoiset@gmail.com&gt;</a><br>
    </p>
    <div class="moz-cite-prefix">On 4/12/19 10:15 PM, Marek Olšák wrote:<br>
    </div>
    <blockquote type="cite"
cite="mid:CAAxE2A6GhwM_En+QtwVVOOABHmoxo5P88PvsNZ=Ci+=6uy4AjQ@mail.gmail.com">
      <meta http-equiv="content-type" content="text/html; charset=UTF-8">
      <div dir="ltr">
        <div>Done locally.</div>
        <div><br>
        </div>
        <div>Marek<br>
        </div>
        <br>
        <div class="gmail_quote">
          <div dir="ltr" class="gmail_attr">On Fri, Apr 12, 2019 at
            12:20 PM Samuel Pitoiset &lt;<a
              href="mailto:samuel.pitoiset@gmail.com"
              moz-do-not-send="true">samuel.pitoiset@gmail.com</a>&gt;
            wrote:<br>
          </div>
          <blockquote class="gmail_quote" style="margin:0px 0px 0px
            0.8ex;border-left:1px solid
            rgb(204,204,204);padding-left:1ex">I would suggest to
            document that workaround somewhere in the code.<br>
            <br>
            On 4/12/19 5:17 PM, Marek Olšák wrote:<br>
            > From: Marek Olšák &lt;<a
              href="mailto:marek.olsak@amd.com" target="_blank"
              moz-do-not-send="true">marek.olsak@amd.com</a>&gt;<br>
            ><br>
            > This is a workaround for a thread deadlock that I have
            no idea<br>
            > why it occurs.<br>
            ><br>
            > Bugzilla: <a
              href="https://bugs.freedesktop.org/show_bug.cgi?id=108879"
              rel="noreferrer" target="_blank" moz-do-not-send="true">https://bugs.freedesktop.org/show_bug.cgi?id=108879</a><br>
            > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e<br>
            > ---<br>
            >   src/gallium/drivers/radeonsi/si_clear.c        | 6
            +++---<br>
            >   src/gallium/drivers/radeonsi/si_compute_blit.c | 8
            +++++---<br>
            >   src/gallium/drivers/radeonsi/si_pipe.c         | 2 +-<br>
            >   src/gallium/drivers/radeonsi/si_pipe.h         | 3
            ++-<br>
            >   src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-<br>
            >   5 files changed, 12 insertions(+), 9 deletions(-)<br>
            ><br>
            > diff --git a/src/gallium/drivers/radeonsi/si_clear.c
            b/src/gallium/drivers/radeonsi/si_clear.c<br>
            > index e1805f2a1c9..ead680b857b 100644<br>
            > --- a/src/gallium/drivers/radeonsi/si_clear.c<br>
            > +++ b/src/gallium/drivers/radeonsi/si_clear.c<br>
            > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct
            si_context *sctx,<br>
            >                * would be more efficient than separate
            per-layer clear operations.<br>
            >                */<br>
            >             
             assert(tex->buffer.b.b.nr_storage_samples <= 2 ||
            num_layers == 1);<br>
            >   <br>
            >               dcc_offset +=
            tex->surface.u.legacy.level[level].dcc_offset;<br>
            >               clear_size =
            tex->surface.u.legacy.level[level].dcc_fast_clear_size *<br>
            >                            num_layers;<br>
            >       }<br>
            >   <br>
            >       si_clear_buffer(sctx, dcc_buffer, dcc_offset,
            clear_size,<br>
            > -                     &clear_value, 4,
            SI_COHERENCY_CB_META);<br>
            > +                     &clear_value, 4,
            SI_COHERENCY_CB_META, false);<br>
            >   }<br>
            >   <br>
            >   /* Set the same micro tile mode as the destination of
            the last MSAA resolve.<br>
            >    * This allows hitting the MSAA resolve fast path,
            which requires that both<br>
            >    * src and dst micro tile modes match.<br>
            >    */<br>
            >   static void si_set_optimal_micro_tile_mode(struct
            si_screen *sscreen,<br>
            >                                          struct
            si_texture *tex)<br>
            >   {<br>
            >       if (tex->buffer.b.is_shared ||<br>
            > @@ -489,21 +489,21 @@ static void
            si_do_fast_color_clear(struct si_context *sctx,<br>
            >   <br>
            >                       /* DCC fast clear with MSAA
            should clear CMASK to 0xC. */<br>
            >                       if (tex->buffer.b.b.nr_samples
            >= 2 && tex->cmask_buffer) {<br>
            >                               /* TODO: This doesn't
            work with MSAA. */<br>
            >                               if (eliminate_needed)<br>
            >                                       continue;<br>
            >   <br>
            >                               uint32_t clear_value =
            0xCCCCCCCC;<br>
            >                               si_clear_buffer(sctx,
            &tex->cmask_buffer->b.b,<br>
            >                                             
             tex->cmask_offset, tex->surface.cmask_size,<br>
            > -                                           
             &clear_value, 4, SI_COHERENCY_CB_META);<br>
            > +                                           
             &clear_value, 4, SI_COHERENCY_CB_META, false);<br>
            >                               fmask_decompress_needed =
            true;<br>
            >                       }<br>
            >   <br>
            >                       vi_dcc_clear_level(sctx, tex, 0,
            reset_value);<br>
            >                       tex->separate_dcc_dirty =
            true;<br>
            >               } else {<br>
            >                       if (too_small)<br>
            >                               continue;<br>
            >   <br>
            >                       /* 128-bit formats are
            unusupported */<br>
            > @@ -517,21 +517,21 @@ static void
            si_do_fast_color_clear(struct si_context *sctx,<br>
            >   <br>
            >                       /* ensure CMASK is enabled */<br>
            >                     
             si_alloc_separate_cmask(sctx->screen, tex);<br>
            >                       if (!tex->cmask_buffer)<br>
            >                               continue;<br>
            >   <br>
            >                       /* Do the fast clear. */<br>
            >                       uint32_t clear_value = 0;<br>
            >                       si_clear_buffer(sctx,
            &tex->cmask_buffer->b.b,<br>
            >                                     
             tex->cmask_offset, tex->surface.cmask_size,<br>
            > -                                     &clear_value,
            4, SI_COHERENCY_CB_META);<br>
            > +                                     &clear_value,
            4, SI_COHERENCY_CB_META, false);<br>
            >                       eliminate_needed = true;<br>
            >               }<br>
            >   <br>
            >               if ((eliminate_needed ||
            fmask_decompress_needed) &&<br>
            >                   !(tex->dirty_level_mask & (1
            &lt;&lt; level))) {<br>
            >                       tex->dirty_level_mask |= 1
            &lt;&lt; level;<br>
            >                     
             p_atomic_inc(&sctx->screen->compressed_colortex_counter);<br>
            >               }<br>
            >   <br>
            >               /* We can change the micro tile mode
            before a full clear. */<br>
            > diff --git
            a/src/gallium/drivers/radeonsi/si_compute_blit.c
            b/src/gallium/drivers/radeonsi/si_compute_blit.c<br>
            > index 1abeac6adb0..fb0d8d2f1b6 100644<br>
            > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c<br>
            > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c<br>
            > @@ -179,21 +179,22 @@ static void
            si_compute_do_clear_or_copy(struct si_context *sctx,<br>
            >   <br>
            >       /* Restore states. */<br>
            >       ctx->bind_compute_state(ctx, saved_cs);<br>
            >       ctx->set_shader_buffers(ctx,
            PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,<br>
            >                               saved_writable_mask);<br>
            >       si_compute_internal_end(sctx);<br>
            >   }<br>
            >   <br>
            >   void si_clear_buffer(struct si_context *sctx, struct
            pipe_resource *dst,<br>
            >                    uint64_t offset, uint64_t size,
            uint32_t *clear_value,<br>
            > -                  uint32_t clear_value_size, enum
            si_coherency coher)<br>
            > +                  uint32_t clear_value_size, enum
            si_coherency coher,<br>
            > +                  bool force_cpdma)<br>
            >   {<br>
            >       if (!size)<br>
            >               return;<br>
            >   <br>
            >       unsigned clear_alignment = MIN2(clear_value_size,
            4);<br>
            >   <br>
            >       assert(clear_value_size != 3 &&
            clear_value_size != 6); /* 12 is allowed. */<br>
            >       assert(offset % clear_alignment == 0);<br>
            >       assert(size % clear_alignment == 0);<br>
            >       assert(size < (UINT_MAX & ~0xf)); /* TODO:
            test 64-bit sizes in all codepaths */<br>
            > @@ -243,21 +244,22 @@ void si_clear_buffer(struct
            si_context *sctx, struct pipe_resource *dst,<br>
            >               return;<br>
            >       }<br>
            >   <br>
            >       uint64_t aligned_size = size & ~3ull;<br>
            >       if (aligned_size >= 4) {<br>
            >               /* Before GFX9, CP DMA was very slow when
            clearing GTT, so never<br>
            >                * use CP DMA clears on those chips,
            because we can't be certain<br>
            >                * about buffer placements.<br>
            >                */<br>
            >               if (clear_value_size > 4 ||<br>
            > -                 (clear_value_size == 4 &&<br>
            > +                 (!force_cpdma &&<br>
            > +                  clear_value_size == 4 &&<br>
            >                    offset % 4 == 0 &&<br>
            >                    (size > 32*1024 ||
            sctx->chip_class &lt;= VI))) {<br>
            >                       si_compute_do_clear_or_copy(sctx,
            dst, offset, NULL, 0,<br>
            >                                                 
             aligned_size, clear_value,<br>
            >                                                 
             clear_value_size, coher);<br>
            >               } else {<br>
            >                       assert(clear_value_size == 4);<br>
            >                       si_cp_dma_clear_buffer(sctx,
            sctx->gfx_cs, dst, offset,<br>
            >                                             
            aligned_size, *clear_value, 0, coher,<br>
            >                                             
            get_cache_policy(sctx, coher, size));<br>
            > @@ -277,21 +279,21 @@ void si_clear_buffer(struct
            si_context *sctx, struct pipe_resource *dst,<br>
            >       }<br>
            >   }<br>
            >   <br>
            >   static void si_pipe_clear_buffer(struct pipe_context
            *ctx,<br>
            >                                struct pipe_resource
            *dst,<br>
            >                                unsigned offset,
            unsigned size,<br>
            >                                const void *clear_value,<br>
            >                                int clear_value_size)<br>
            >   {<br>
            >       si_clear_buffer((struct si_context*)ctx, dst,
            offset, size, (uint32_t*)clear_value,<br>
            > -                     clear_value_size,
            SI_COHERENCY_SHADER);<br>
            > +                     clear_value_size,
            SI_COHERENCY_SHADER, false);<br>
            >   }<br>
            >   <br>
            >   void si_copy_buffer(struct si_context *sctx,<br>
            >                   struct pipe_resource *dst, struct
            pipe_resource *src,<br>
            >                   uint64_t dst_offset, uint64_t
            src_offset, unsigned size)<br>
            >   {<br>
            >       if (!size)<br>
            >               return;<br>
            >   <br>
            >       enum si_coherency coher = SI_COHERENCY_SHADER;<br>
            > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
            b/src/gallium/drivers/radeonsi/si_pipe.c<br>
            > index 5caeb575623..5d376e6181a 100644<br>
            > --- a/src/gallium/drivers/radeonsi/si_pipe.c<br>
            > +++ b/src/gallium/drivers/radeonsi/si_pipe.c<br>
            > @@ -634,21 +634,21 @@ static struct pipe_context
            *si_create_context(struct pipe_screen *screen,<br>
            >                       
             sizeof(sctx->sample_positions),
            &sctx->sample_positions);<br>
            >   <br>
            >       /* this must be last */<br>
            >       si_begin_new_gfx_cs(sctx);<br>
            >   <br>
            >       if (sctx->chip_class == CIK) {<br>
            >               /* Clear the NULL constant buffer,
            because loads should return zeros. */<br>
            >               uint32_t clear_value = 0;<br>
            >               si_clear_buffer(sctx,
            sctx->null_const_buf.buffer, 0,<br>
            >                             
             sctx->null_const_buf.buffer->width0,<br>
            > -                             &clear_value, 4,
            SI_COHERENCY_SHADER);<br>
            > +                             &clear_value, 4,
            SI_COHERENCY_SHADER, true);<br>
            >       }<br>
            >       return &sctx->b;<br>
            >   fail:<br>
            >       fprintf(stderr, "radeonsi: Failed to create a
            context.\n");<br>
            >       si_destroy_context(&sctx->b);<br>
            >       return NULL;<br>
            >   }<br>
            >   <br>
            >   static struct pipe_context
            *si_pipe_create_context(struct pipe_screen *screen,<br>
            >                                                  void
            *priv, unsigned flags)<br>
            > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
            b/src/gallium/drivers/radeonsi/si_pipe.h<br>
            > index 301d38649bf..aaa95f32d20 100644<br>
            > --- a/src/gallium/drivers/radeonsi/si_pipe.h<br>
            > +++ b/src/gallium/drivers/radeonsi/si_pipe.h<br>
            > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum
            pipe_format format);<br>
            >   void vi_dcc_clear_level(struct si_context *sctx,<br>
            >                       struct si_texture *tex,<br>
            >                       unsigned level, unsigned
            clear_value);<br>
            >   void si_init_clear_functions(struct si_context
            *sctx);<br>
            >   <br>
            >   /* si_compute_blit.c */<br>
            >   unsigned si_get_flush_flags(struct si_context *sctx,
            enum si_coherency coher,<br>
            >                           enum si_cache_policy
            cache_policy);<br>
            >   void si_clear_buffer(struct si_context *sctx, struct
            pipe_resource *dst,<br>
            >                    uint64_t offset, uint64_t size,
            uint32_t *clear_value,<br>
            > -                  uint32_t clear_value_size, enum
            si_coherency coher);<br>
            > +                  uint32_t clear_value_size, enum
            si_coherency coher,<br>
            > +                  bool force_cpdma);<br>
            >   void si_copy_buffer(struct si_context *sctx,<br>
            >                   struct pipe_resource *dst, struct
            pipe_resource *src,<br>
            >                   uint64_t dst_offset, uint64_t
            src_offset, unsigned size);<br>
            >   void si_compute_copy_image(struct si_context *sctx,<br>
            >                          struct pipe_resource *dst,<br>
            >                          unsigned dst_level,<br>
            >                          struct pipe_resource *src,<br>
            >                          unsigned src_level,<br>
            >                          unsigned dstx, unsigned dsty,
            unsigned dstz,<br>
            >                          const struct pipe_box
            *src_box);<br>
            > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c
            b/src/gallium/drivers/radeonsi/si_test_dma.c<br>
            > index 90a2032cd80..7e396e671be 100644<br>
            > --- a/src/gallium/drivers/radeonsi/si_test_dma.c<br>
            > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c<br>
            > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen
            *sscreen)<br>
            >                      tsrc.width0, tsrc.height0,
            tsrc.array_size,<br>
            >                      array_mode_to_string(sscreen,
            &ssrc->surface), bpp);<br>
            >               fflush(stdout);<br>
            >   <br>
            >               /* set src pixels */<br>
            >               set_random_pixels(ctx, src,
            &src_cpu);<br>
            >   <br>
            >               /* clear dst pixels */<br>
            >               uint32_t zero = 0;<br>
            >               si_clear_buffer(sctx, dst, 0,
            sdst->surface.surf_size, &zero, 4,<br>
            > -                             SI_COHERENCY_SHADER);<br>
            > +                             SI_COHERENCY_SHADER,
            false);<br>
            >               memset(dst_cpu.ptr, 0,
            dst_cpu.layer_stride * tdst.array_size);<br>
            >   <br>
            >               /* preparation */<br>
            >               max_width = MIN2(tsrc.width0,
            tdst.width0);<br>
            >               max_height = MIN2(tsrc.height0,
            tdst.height0);<br>
            >               max_depth = MIN2(tsrc.array_size,
            tdst.array_size);<br>
            >   <br>
            >               num = do_partial_copies ?
            num_partial_copies : 1;<br>
            >               for (j = 0; j < num; j++) {<br>
            >                       int width, height, depth;<br>
          </blockquote>
        </div>
      </div>
    </blockquote>
  </body>
</html>