[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

Tue Apr 19 11:35:50 UTC 2016

OK.

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Tue, Apr 19, 2016 at 1:19 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Write by ssbo/image is completely synchronized by glMemoryBarrier, not
> by the driver. Even for the write after read hazard, the user needs to
> put a barrier before the write. This includes all writes by compute.
>
> For transform feedback we apply the same rules as for graphics:
> reading from the feedback bfufer might go wrong if transform feedback
> is still active, otherwise the driver synchronizes. I can't really
> find anything in the GL spec that disallows the loop though.
>
> For write via framebuffer, writing to a framebuffer and reading it
> from a CS is a render feedback loop, both for write after read and
> read after write. The texture regions therefore need to be disjunct or
> a glTextureBarrier() needs to be used by the user. I am not sure if
> the glTextureBarrier should be enough to prevent a write after read
> hazard, as the spec only talks about it solving the read after write
> hazard. However, in my current patch it solves both, and I think the
> user needs some form of synchronization there, probably rebinding the
> framebuffer. Furthermore, we have this issue too without compute
> shaders.
>
> - Bas
>
> On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <maraeo at gmail.com> wrote:
>> There can be read-after-write hazards when transitioning from compute
>> to graphics and vice versa. Is the user expected to call
>> glMemoryBarrier in this case or do we need to synchronize explicitly
>> in the driver?
>>
>> Marek
>>
>> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
>> <bas at basnieuwenhuizen.nl> wrote:
>>> v2: Add more CS_PARTIAL_FLUSH events.
>>>
>>> Essentially every place with waits on finishing for pixel shaders
>>> also has a write after read hazard with compute shaders.
>>>
>>> Invalidating L2 waits implicitly on pixel and compute shaders,
>>> so, we don't need a CS_PARTIAL_FLUSH for switching FBO.
>>>
>>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>>
>>> According to Marek the INV_GLOBAL_L2 events don't wait for compute
>>> shaders to finish, so wait for them explicitly.
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>>> ---
>>>  src/gallium/drivers/radeonsi/si_compute.c     | 17 ++---------------
>>>  src/gallium/drivers/radeonsi/si_cp_dma.c      |  6 ++++--
>>>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>>>  src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
>>>  src/gallium/drivers/radeonsi/si_state.c       | 12 ++++++++----
>>>  5 files changed, 17 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 10b88b3..6803334 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>>>         if (!sctx->cs_shader_state.initialized)
>>>                 si_initialize_compute(sctx);
>>>
>>> -       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_INV_ICACHE |
>>> -                        SI_CONTEXT_INV_SMEM_L1 |
>>> -                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
>>> -                        SI_CONTEXT_FLAG_COMPUTE;
>>> -       si_emit_cache_flush(sctx, NULL);
>>> +       if (sctx->b.flags)
>>> +               si_emit_cache_flush(sctx, NULL);
>>>
>>>         if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
>>>                 return;
>>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>>>                 si_setup_tgsi_grid(sctx, info);
>>>
>>>         si_emit_dispatch_packets(sctx, info);
>>> -
>>> -       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>>> -                        SI_CONTEXT_INV_VMEM_L1 |
>>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_INV_ICACHE |
>>> -                        SI_CONTEXT_INV_SMEM_L1 |
>>> -                        SI_CONTEXT_FLAG_COMPUTE;
>>> -       si_emit_cache_flush(sctx, NULL);
>>>  }
>>>
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> index 001ddd4..38e0ee6 100644
>>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>>>         uint64_t va = r600_resource(dst)->gpu_address + offset;
>>>
>>>         /* Flush the caches. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>>         while (size) {
>>>                 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
>>> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>>>         }
>>>
>>>         /* Flush the caches. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>>         /* This is the main part doing the copying. Src is always aligned. */
>>>         main_dst_offset = dst_offset + skipped_size;
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 5b65fae..98ad3a7 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>>>          * start writing to the targets.
>>>          */
>>>         if (num_targets)
>>> -               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> +               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                                SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         /* Streamout buffers must be bound in 2 places:
>>>          * 1) in VGT by setting the VGT_STRMOUT registers
>>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> index 9862f07..b179092e 100644
>>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>>>         ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>>                         SI_CONTEXT_INV_VMEM_L1 |
>>>                         SI_CONTEXT_INV_GLOBAL_L2 |
>>> +                       SI_CONTEXT_CS_PARTIAL_FLUSH |
>>>                         /* this is probably not needed anymore */
>>>                         SI_CONTEXT_PS_PARTIAL_FLUSH;
>>>         si_emit_cache_flush(ctx, NULL);
>>> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
>>> index af9ffdd..305a70b 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>>> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>>>          */
>>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
>>> +                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         /* Take the maximum of the old and new count. If the new count is lower,
>>>          * dirtying is needed to disable the unbound colorbuffers.
>>> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>>>
>>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_FLUSH_AND_INV_CB;
>>> +                        SI_CONTEXT_FLUSH_AND_INV_CB |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>  }
>>>
>>>  static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>
>>>         /* Subsequent commands must wait for all shader invocations to
>>>          * complete. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>>>                 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
>>> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>                      PIPE_BARRIER_SHADER_BUFFER |
>>>                      PIPE_BARRIER_TEXTURE |
>>>                      PIPE_BARRIER_IMAGE |
>>> -                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
>>> +                    PIPE_BARRIER_STREAMOUT_BUFFER |
>>> +                    PIPE_BARRIER_GLOBAL_BUFFER)) {
>>>                 /* As far as I can tell, L1 contents are written back to L2
>>>                  * automatically at end of shader, but the contents of other
>>>                  * L1 caches might still be stale. */
>>> --
>>> 2.8.0
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev