[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

Tue Apr 19 11:19:57 UTC 2016

Write by ssbo/image is completely synchronized by glMemoryBarrier, not
by the driver. Even for the write after read hazard, the user needs to
put a barrier before the write. This includes all writes by compute.

For transform feedback we apply the same rules as for graphics:
reading from the feedback bfufer might go wrong if transform feedback
is still active, otherwise the driver synchronizes. I can't really
find anything in the GL spec that disallows the loop though.

For write via framebuffer, writing to a framebuffer and reading it
from a CS is a render feedback loop, both for write after read and
read after write. The texture regions therefore need to be disjunct or
a glTextureBarrier() needs to be used by the user. I am not sure if
the glTextureBarrier should be enough to prevent a write after read
hazard, as the spec only talks about it solving the read after write
hazard. However, in my current patch it solves both, and I think the
user needs some form of synchronization there, probably rebinding the
framebuffer. Furthermore, we have this issue too without compute
shaders.

- Bas

On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <maraeo at gmail.com> wrote:
> There can be read-after-write hazards when transitioning from compute
> to graphics and vice versa. Is the user expected to call
> glMemoryBarrier in this case or do we need to synchronize explicitly
> in the driver?
>
> Marek
>
> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
> <bas at basnieuwenhuizen.nl> wrote:
>> v2: Add more CS_PARTIAL_FLUSH events.
>>
>> Essentially every place with waits on finishing for pixel shaders
>> also has a write after read hazard with compute shaders.
>>
>> Invalidating L2 waits implicitly on pixel and compute shaders,
>> so, we don't need a CS_PARTIAL_FLUSH for switching FBO.
>>
>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>
>> According to Marek the INV_GLOBAL_L2 events don't wait for compute
>> shaders to finish, so wait for them explicitly.
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeonsi/si_compute.c     | 17 ++---------------
>>  src/gallium/drivers/radeonsi/si_cp_dma.c      |  6 ++++--
>>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>>  src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
>>  src/gallium/drivers/radeonsi/si_state.c       | 12 ++++++++----
>>  5 files changed, 17 insertions(+), 22 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>> index 10b88b3..6803334 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>>         if (!sctx->cs_shader_state.initialized)
>>                 si_initialize_compute(sctx);
>>
>> -       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>> -                        SI_CONTEXT_INV_ICACHE |
>> -                        SI_CONTEXT_INV_SMEM_L1 |
>> -                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
>> -                        SI_CONTEXT_FLAG_COMPUTE;
>> -       si_emit_cache_flush(sctx, NULL);
>> +       if (sctx->b.flags)
>> +               si_emit_cache_flush(sctx, NULL);
>>
>>         if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
>>                 return;
>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>>                 si_setup_tgsi_grid(sctx, info);
>>
>>         si_emit_dispatch_packets(sctx, info);
>> -
>> -       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>> -                        SI_CONTEXT_INV_VMEM_L1 |
>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>> -                        SI_CONTEXT_INV_ICACHE |
>> -                        SI_CONTEXT_INV_SMEM_L1 |
>> -                        SI_CONTEXT_FLAG_COMPUTE;
>> -       si_emit_cache_flush(sctx, NULL);
>>  }
>>
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> index 001ddd4..38e0ee6 100644
>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>>         uint64_t va = r600_resource(dst)->gpu_address + offset;
>>
>>         /* Flush the caches. */
>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>
>>         while (size) {
>>                 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
>> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>>         }
>>
>>         /* Flush the caches. */
>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>
>>         /* This is the main part doing the copying. Src is always aligned. */
>>         main_dst_offset = dst_offset + skipped_size;
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 5b65fae..98ad3a7 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>>          * start writing to the targets.
>>          */
>>         if (num_targets)
>> -               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>> +               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> +                                SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>>         /* Streamout buffers must be bound in 2 places:
>>          * 1) in VGT by setting the VGT_STRMOUT registers
>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
>> index 9862f07..b179092e 100644
>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>>         ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>                         SI_CONTEXT_INV_VMEM_L1 |
>>                         SI_CONTEXT_INV_GLOBAL_L2 |
>> +                       SI_CONTEXT_CS_PARTIAL_FLUSH |
>>                         /* this is probably not needed anymore */
>>                         SI_CONTEXT_PS_PARTIAL_FLUSH;
>>         si_emit_cache_flush(ctx, NULL);
>> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
>> index af9ffdd..305a70b 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.c
>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>>          */
>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>> -                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
>> +                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>>         /* Take the maximum of the old and new count. If the new count is lower,
>>          * dirtying is needed to disable the unbound colorbuffers.
>> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>>
>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>> -                        SI_CONTEXT_FLUSH_AND_INV_CB;
>> +                        SI_CONTEXT_FLUSH_AND_INV_CB |
>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>  }
>>
>>  static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>
>>         /* Subsequent commands must wait for all shader invocations to
>>          * complete. */
>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>>         if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>>                 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
>> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>                      PIPE_BARRIER_SHADER_BUFFER |
>>                      PIPE_BARRIER_TEXTURE |
>>                      PIPE_BARRIER_IMAGE |
>> -                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
>> +                    PIPE_BARRIER_STREAMOUT_BUFFER |
>> +                    PIPE_BARRIER_GLOBAL_BUFFER)) {
>>                 /* As far as I can tell, L1 contents are written back to L2
>>                  * automatically at end of shader, but the contents of other
>>                  * L1 caches might still be stale. */
>> --
>> 2.8.0
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev