[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

Marek Olšák maraeo at gmail.com
Tue Apr 19 10:50:16 UTC 2016


There can be read-after-write hazards when transitioning from compute
to graphics and vice versa. Is the user expected to call
glMemoryBarrier in this case or do we need to synchronize explicitly
in the driver?

Marek

On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> v2: Add more CS_PARTIAL_FLUSH events.
>
> Essentially every place that waits for pixel shaders to finish
> also has a write-after-read hazard with compute shaders.
>
> Invalidating L2 waits implicitly on pixel and compute shaders,
> so we don't need a CS_PARTIAL_FLUSH for switching FBO.
>
> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>
> According to Marek the INV_GLOBAL_L2 events don't wait for compute
> shaders to finish, so wait for them explicitly.
>
> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> ---
>  src/gallium/drivers/radeonsi/si_compute.c     | 17 ++---------------
>  src/gallium/drivers/radeonsi/si_cp_dma.c      |  6 ++++--
>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>  src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
>  src/gallium/drivers/radeonsi/si_state.c       | 12 ++++++++----
>  5 files changed, 17 insertions(+), 22 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 10b88b3..6803334 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -439,13 +439,8 @@ static void si_launch_grid(
>         if (!sctx->cs_shader_state.initialized)
>                 si_initialize_compute(sctx);
>
> -       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
> -                        SI_CONTEXT_INV_GLOBAL_L2 |
> -                        SI_CONTEXT_INV_ICACHE |
> -                        SI_CONTEXT_INV_SMEM_L1 |
> -                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
> -                        SI_CONTEXT_FLAG_COMPUTE;
> -       si_emit_cache_flush(sctx, NULL);
> +       if (sctx->b.flags)
> +               si_emit_cache_flush(sctx, NULL);
>
>         if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
>                 return;
> @@ -478,14 +473,6 @@ static void si_launch_grid(
>                 si_setup_tgsi_grid(sctx, info);
>
>         si_emit_dispatch_packets(sctx, info);
> -
> -       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> -                        SI_CONTEXT_INV_VMEM_L1 |
> -                        SI_CONTEXT_INV_GLOBAL_L2 |
> -                        SI_CONTEXT_INV_ICACHE |
> -                        SI_CONTEXT_INV_SMEM_L1 |
> -                        SI_CONTEXT_FLAG_COMPUTE;
> -       si_emit_cache_flush(sctx, NULL);
>  }
>
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 001ddd4..38e0ee6 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>         uint64_t va = r600_resource(dst)->gpu_address + offset;
>
>         /* Flush the caches. */
> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
>         while (size) {
>                 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>         }
>
>         /* Flush the caches. */
> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
>         /* This is the main part doing the copying. Src is always aligned. */
>         main_dst_offset = dst_offset + skipped_size;
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 5b65fae..98ad3a7 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>          * start writing to the targets.
>          */
>         if (num_targets)
> -               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
> +               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                                SI_CONTEXT_CS_PARTIAL_FLUSH;
>
>         /* Streamout buffers must be bound in 2 places:
>          * 1) in VGT by setting the VGT_STRMOUT registers
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index 9862f07..b179092e 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>         ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>                         SI_CONTEXT_INV_VMEM_L1 |
>                         SI_CONTEXT_INV_GLOBAL_L2 |
> +                       SI_CONTEXT_CS_PARTIAL_FLUSH |
>                         /* this is probably not needed anymore */
>                         SI_CONTEXT_PS_PARTIAL_FLUSH;
>         si_emit_cache_flush(ctx, NULL);
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index af9ffdd..305a70b 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>          */
>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>                          SI_CONTEXT_INV_GLOBAL_L2 |
> -                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
> +                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>
>         /* Take the maximum of the old and new count. If the new count is lower,
>          * dirtying is needed to disable the unbound colorbuffers.
> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>
>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>                          SI_CONTEXT_INV_GLOBAL_L2 |
> -                        SI_CONTEXT_FLUSH_AND_INV_CB;
> +                        SI_CONTEXT_FLUSH_AND_INV_CB |
> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>  }
>
>  static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>
>         /* Subsequent commands must wait for all shader invocations to
>          * complete. */
> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>
>         if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>                 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>                      PIPE_BARRIER_SHADER_BUFFER |
>                      PIPE_BARRIER_TEXTURE |
>                      PIPE_BARRIER_IMAGE |
> -                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
> +                    PIPE_BARRIER_STREAMOUT_BUFFER |
> +                    PIPE_BARRIER_GLOBAL_BUFFER)) {
>                 /* As far as I can tell, L1 contents are written back to L2
>                  * automatically at end of shader, but the contents of other
>                  * L1 caches might still be stale. */
> --
> 2.8.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list