<div dir="ltr"><div>Thanks. It looks good.</div><div><br></div><div>Marek<br></div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Fri, May 17, 2019 at 3:56 AM Juan A. Suarez Romero <<a href="mailto:jasuarez@igalia.com">jasuarez@igalia.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">On Fri, 2019-05-10 at 01:19 -0400, Marek Olšák wrote:<br>
> From: Marek Olšák <marek.olsak@amd.com>
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108824
> 
> Cc: 19.1 <mesa-stable@lists.freedesktop.org>

Hi, Marek,

This patch didn't apply cleanly on the 19.1 branch, so I've fixed the conflicts.

You can find the resolved patch at
https://gitlab.freedesktop.org/mesa/mesa/commit/21620c889e3fc78be13f096fcccca273cfd27a3d


Thanks!


        J.A.

> ---
>  src/gallium/drivers/radeonsi/si_descriptors.c | 94 ++++++++++++-------
>  src/gallium/drivers/radeonsi/si_pipe.h        |  2 +
>  src/gallium/drivers/radeonsi/si_state_draw.c  |  9 +-
>  3 files changed, 72 insertions(+), 33 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 744fc9a15d7..6a4dcacc0f3 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -1580,242 +1580,272 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx)
>               si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
>               si_images_update_needs_color_decompress_mask(&sctx->images[i]);
>               si_update_shader_needs_decompress_mask(sctx, i);
>       }
>  
>       si_resident_handles_update_needs_color_decompress(sctx);
>  }
>  
>  /* BUFFER DISCARD/INVALIDATION */
>  
> -/** Reset descriptors of buffer resources after \p buf has been invalidated. */
> +/* Reset descriptors of buffer resources after \p buf has been invalidated.
> + * If buf == NULL, reset all descriptors.
> + */
>  static void si_reset_buffer_resources(struct si_context *sctx,
>                                     struct si_buffer_resources *buffers,
>                                     unsigned descriptors_idx,
>                                     unsigned slot_mask,
>                                     struct pipe_resource *buf,
>                                     enum radeon_bo_priority priority)
>  {
>       struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
>       unsigned mask = buffers->enabled_mask & slot_mask;
>  
>       while (mask) {
>               unsigned i = u_bit_scan(&mask);
> -             if (buffers->buffers[i] == buf) {
> -                     si_set_buf_desc_address(si_resource(buf), buffers->offsets[i],
> +             struct pipe_resource *buffer = buffers->buffers[i];
> +
> +             if (buffer && (!buf || buffer == buf)) {
> +                     si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
>                                               descs->list + i*4);
>                       sctx->descriptors_dirty |= 1u << descriptors_idx;
>  
>                       radeon_add_to_gfx_buffer_list_check_mem(sctx,
> -                                                             si_resource(buf),
> +                                                             si_resource(buffer),
>                                                               buffers->writable_mask & (1u << i) ?
>                                                                       RADEON_USAGE_READWRITE :
>                                                                       RADEON_USAGE_READ,
>                                                               priority, true);
>               }
>       }
>  }
>  
> -/* Update all resource bindings where the buffer is bound, including
> +/* Update all buffer bindings where the buffer is bound, including
>   * all resource descriptors. This is invalidate_buffer without
> - * the invalidation. */
> + * the invalidation.
> + *
> + * If buf == NULL, update all buffer bindings.
> + */
>  void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
>  {
>       struct si_resource *buffer = si_resource(buf);
>       unsigned i, shader;
>       unsigned num_elems = sctx->vertex_elements ?
>                                      sctx->vertex_elements->count : 0;
>  
>       /* We changed the buffer, now we need to bind it where the old one
>        * was bound. This consists of 2 things:
>        *   1) Updating the resource descriptor and dirtying it.
>        *   2) Adding a relocation to the CS, so that it's usable.
>        */
>  
>       /* Vertex buffers. */
> -     if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
> +     if (!buffer) {
> +             if (num_elems)
> +                     sctx->vertex_buffers_dirty = true;
> +     } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
>               for (i = 0; i < num_elems; i++) {
>                       int vb = sctx->vertex_elements->vertex_buffer_index[i];
>  
>                       if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
>                               continue;
>                       if (!sctx->vertex_buffer[vb].buffer.resource)
>                               continue;
>  
>                       if (sctx->vertex_buffer[vb].buffer.resource == buf) {
>                               sctx->vertex_buffers_dirty = true;
>                               break;
>                       }
>               }
>       }
>  
>       /* Streamout buffers. (other internal buffers can't be invalidated) */
> -     if (buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
> +     if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
>               for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
>                       struct si_buffer_resources *buffers = &sctx->rw_buffers;
>                       struct si_descriptors *descs =
>                               &sctx->descriptors[SI_DESCS_RW_BUFFERS];
> +                     struct pipe_resource *buffer = buffers->buffers[i];
>  
> -                     if (buffers->buffers[i] != buf)
> +                     if (!buffer || (buf && buffer != buf))
>                               continue;
>  
> -                     si_set_buf_desc_address(si_resource(buf), buffers->offsets[i],
> +                     si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
>                                               descs->list + i*4);
>                       sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
>  
>                       radeon_add_to_gfx_buffer_list_check_mem(sctx,
> -                                                             buffer, RADEON_USAGE_WRITE,
> +                                                             si_resource(buffer),
> +                                                             RADEON_USAGE_WRITE,
>                                                               RADEON_PRIO_SHADER_RW_BUFFER,
>                                                               true);
>  
>                       /* Update the streamout state. */
>                       if (sctx->streamout.begin_emitted)
>                               si_emit_streamout_end(sctx);
>                       sctx->streamout.append_bitmask =
>                                       sctx->streamout.enabled_mask;
>                       si_streamout_buffers_dirty(sctx);
>               }
>       }
>  
>       /* Constant and shader buffers. */
> -     if (buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
> +     if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
>               for (shader = 0; shader < SI_NUM_SHADERS; shader++)
>                       si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
>                                                 si_const_and_shader_buffer_descriptors_idx(shader),
>                                                 u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
>                                                 buf,
>                                                 sctx->const_and_shader_buffers[shader].priority_constbuf);
>       }
>  
> -     if (buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
> +     if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
>               for (shader = 0; shader < SI_NUM_SHADERS; shader++)
>                       si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
>                                                 si_const_and_shader_buffer_descriptors_idx(shader),
>                                                 u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS),
>                                                 buf,
>                                                 sctx->const_and_shader_buffers[shader].priority);
>       }
>  
> -     if (buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
> +     if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
>               /* Texture buffers - update bindings. */
>               for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
>                       struct si_samplers *samplers = &sctx->samplers[shader];
>                       struct si_descriptors *descs =
>                               si_sampler_and_image_descriptors(sctx, shader);
>                       unsigned mask = samplers->enabled_mask;
>  
>                       while (mask) {
>                               unsigned i = u_bit_scan(&mask);
> -                             if (samplers->views[i]->texture == buf) {
> +                             struct pipe_resource *buffer = samplers->views[i]->texture;
> +
> +                             if (buffer && (!buf || buffer == buf)) {
>                                       unsigned desc_slot = si_get_sampler_slot(i);
>  
> -                                     si_set_buf_desc_address(si_resource(buf),
> +                                     si_set_buf_desc_address(si_resource(buffer),
>                                                               samplers->views[i]->u.buf.offset,
>                                                               descs->list + desc_slot * 16 + 4);
>                                       sctx->descriptors_dirty |=
>                                               1u << si_sampler_and_image_descriptors_idx(shader);
>  
> -                                     radeon_add_to_gfx_buffer_list_check_mem(sctx,
> -                                                                         buffer, RADEON_USAGE_READ,
> -                                                                         RADEON_PRIO_SAMPLER_BUFFER,
> -                                                                         true);
> +                                     radeon_add_to_gfx_buffer_list_check_mem(
> +                                             sctx, si_resource(buffer),
> +                                             RADEON_USAGE_READ,
> +                                             RADEON_PRIO_SAMPLER_BUFFER, true);
>                               }
>                       }
>               }
>       }
>  
>       /* Shader images */
> -     if (buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
> +     if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
>               for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
>                       struct si_images *images = &sctx->images[shader];
>                       struct si_descriptors *descs =
>                               si_sampler_and_image_descriptors(sctx, shader);
>                       unsigned mask = images->enabled_mask;
>  
>                       while (mask) {
>                               unsigned i = u_bit_scan(&mask);
> +                             struct pipe_resource *buffer = images->views[i].resource;
>  
> -                             if (images->views[i].resource == buf) {
> +                             if (buffer && (!buf || buffer == buf)) {
>                                       unsigned desc_slot = si_get_image_slot(i);
>  
>                                       if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
>                                               si_mark_image_range_valid(&images->views[i]);
>  
> -                                     si_set_buf_desc_address(si_resource(buf),
> +                                     si_set_buf_desc_address(si_resource(buffer),
>                                                               images->views[i].u.buf.offset,
>                                                               descs->list + desc_slot * 8 + 4);
>                                       sctx->descriptors_dirty |=
>                                               1u << si_sampler_and_image_descriptors_idx(shader);
>  
>                                       radeon_add_to_gfx_buffer_list_check_mem(
> -                                             sctx, buffer,
> +                                             sctx, si_resource(buffer),
>                                               RADEON_USAGE_READWRITE,
>                                               RADEON_PRIO_SAMPLER_BUFFER, true);
>                               }
>                       }
>               }
>       }
>  
>       /* Bindless texture handles */
> -     if (buffer->texture_handle_allocated) {
> +     if (!buffer || buffer->texture_handle_allocated) {
>               struct si_descriptors *descs = &sctx->bindless_descriptors;
>  
>               util_dynarray_foreach(&sctx->resident_tex_handles,
>                                     struct si_texture_handle *, tex_handle) {
>                       struct pipe_sampler_view *view = (*tex_handle)->view;
>                       unsigned desc_slot = (*tex_handle)->desc_slot;
> +                     struct pipe_resource *buffer = view->texture;
>  
> -                     if (view->texture == buf) {
> -                             si_set_buf_desc_address(buffer,
> +                     if (buffer && (!buf || buffer == buf)) {
> +                             si_set_buf_desc_address(si_resource(buffer),
>                                                       view->u.buf.offset,
>                                                       descs->list +
>                                                       desc_slot * 16 + 4);
>  
>                               (*tex_handle)->desc_dirty = true;
>                               sctx->bindless_descriptors_dirty = true;
>  
>                               radeon_add_to_gfx_buffer_list_check_mem(
> -                                     sctx, buffer,
> +                                     sctx, si_resource(buffer),
>                                       RADEON_USAGE_READ,
>                                       RADEON_PRIO_SAMPLER_BUFFER, true);
>                       }
>               }
>       }
>  
>       /* Bindless image handles */
> -     if (buffer->image_handle_allocated) {
> +     if (!buffer || buffer->image_handle_allocated) {
>               struct si_descriptors *descs = &sctx->bindless_descriptors;
>  
>               util_dynarray_foreach(&sctx->resident_img_handles,
>                                     struct si_image_handle *, img_handle) {
>                       struct pipe_image_view *view = &(*img_handle)->view;
>                       unsigned desc_slot = (*img_handle)->desc_slot;
> +                     struct pipe_resource *buffer = view->resource;
>  
> -                     if (view->resource == buf) {
> +                     if (buffer && (!buf || buffer == buf)) {
>                               if (view->access & PIPE_IMAGE_ACCESS_WRITE)
>                                       si_mark_image_range_valid(view);
>  
> -                             si_set_buf_desc_address(buffer,
> +                             si_set_buf_desc_address(si_resource(buffer),
>                                                       view->u.buf.offset,
>                                                       descs->list +
>                                                       desc_slot * 16 + 4);
>  
>                               (*img_handle)->desc_dirty = true;
>                               sctx->bindless_descriptors_dirty = true;
>  
>                               radeon_add_to_gfx_buffer_list_check_mem(
> -                                     sctx, buffer,
> +                                     sctx, si_resource(buffer),
>                                       RADEON_USAGE_READWRITE,
>                                       RADEON_PRIO_SAMPLER_BUFFER, true);
>                       }
>               }
>       }
> +
> +     if (buffer) {
> +             /* Do the same for other contexts. They will invoke this function
> +              * with buffer == NULL.
> +              */
> +             unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter);
> +
> +             /* Skip the update for the current context, because we have already updated
> +              * the buffer bindings.
> +              */
> +             if (new_counter == sctx->last_dirty_buf_counter + 1)
> +                     sctx->last_dirty_buf_counter = new_counter;
> +     }
>  }
>  
>  static void si_upload_bindless_descriptor(struct si_context *sctx,
>                                         unsigned desc_slot,
>                                         unsigned num_dwords)
>  {
>       struct si_descriptors *desc = &sctx->bindless_descriptors;
>       unsigned desc_slot_offset = desc_slot * 16;
>       uint32_t *data;
>       uint64_t va;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index d3ddb912245..949fa0755cb 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -519,20 +519,21 @@ struct si_screen {
>       struct si_perfcounters  *perfcounters;
>  
>       /* If pipe_screen wants to recompute and re-emit the framebuffer,
>        * sampler, and image states of all contexts, it should atomically
>        * increment this.
>        *
>        * Each context will compare this with its own last known value of
>        * the counter before drawing and re-emit the states accordingly.
>        */
>       unsigned                        dirty_tex_counter;
> +     unsigned                        dirty_buf_counter;
>  
>       /* Atomically increment this counter when an existing texture's
>        * metadata is enabled or disabled in a way that requires changing
>        * contexts' compressed texture binding masks.
>        */
>       unsigned                        compressed_colortex_counter;
>  
>       struct {
>               /* Context flags to set so that all writes from earlier jobs
>                * in the CP are seen by L2 clients.
> @@ -845,20 +846,21 @@ struct si_context {
>  
>       bool                            has_graphics;
>       bool                            gfx_flush_in_progress:1;
>       bool                            gfx_last_ib_is_busy:1;
>       bool                            compute_is_busy:1;
>  
>       unsigned                        num_gfx_cs_flushes;
>       unsigned                        initial_gfx_cs_size;
>       unsigned                        gpu_reset_counter;
>       unsigned                        last_dirty_tex_counter;
> +     unsigned                        last_dirty_buf_counter;
>       unsigned                        last_compressed_colortex_counter;
>       unsigned                        last_num_draw_calls;
>       unsigned                        flags; /* flush flags */
>       /* Current unaccounted memory usage. */
>       uint64_t                        vram;
>       uint64_t                        gtt;
>  
>       /* Atoms (direct states). */
>       union si_state_atoms            atoms;
>       unsigned                        dirty_atoms; /* mask */
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 8e01e1b35e1..d9dfef0a381 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1247,21 +1247,21 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
>       /* Emit draw states. */
>       si_emit_vs_state(sctx, info);
>       si_emit_draw_registers(sctx, info, num_patches);
>  }
>  
>  static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  {
>       struct si_context *sctx = (struct si_context *)ctx;
>       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>       struct pipe_resource *indexbuf = info->index.resource;
> -     unsigned dirty_tex_counter;
> +     unsigned dirty_tex_counter, dirty_buf_counter;
>       enum pipe_prim_type rast_prim;
>       unsigned index_size = info->index_size;
>       unsigned index_offset = info->indirect ? info->start * index_size : 0;
>  
>       if (likely(!info->indirect)) {
>               /* SI-CI treat instance_count==0 as instance_count==1. There is
>                * no workaround for indirect draws, but we can at least skip
>                * direct draws.
>                */
>               if (unlikely(!info->instance_count))
> @@ -1285,20 +1285,27 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
>       dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);
>       if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {
>               sctx->last_dirty_tex_counter = dirty_tex_counter;
>               sctx->framebuffer.dirty_cbufs |=
>                       ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
>               sctx->framebuffer.dirty_zsbuf = true;
>               si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
>               si_update_all_texture_descriptors(sctx);
>       }
>  
> +     dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);
> +     if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {
> +             sctx->last_dirty_buf_counter = dirty_buf_counter;
> +             /* Rebind all buffers unconditionally. */
> +             si_rebind_buffer(sctx, NULL);
> +     }
> +
>       si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
>  
>       /* Set the rasterization primitive type.
>        *
>        * This must be done after si_decompress_textures, which can call
>        * draw_vbo recursively, and before si_update_shaders, which uses
>        * current_rast_prim for this draw_vbo call. */
>       if (sctx->gs_shader.cso)
>               rast_prim = sctx->gs_shader.cso->gs_output_prim;
>       else if (sctx->tes_shader.cso) {
