[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch
Marek Olšák
maraeo at gmail.com
Tue Apr 19 11:35:50 UTC 2016
OK.
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
On Tue, Apr 19, 2016 at 1:19 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Write by ssbo/image is completely synchronized by glMemoryBarrier, not
> by the driver. Even for the write after read hazard, the user needs to
> put a barrier before the write. This includes all writes by compute.
>
> For transform feedback we apply the same rules as for graphics:
> reading from the feedback bfufer might go wrong if transform feedback
> is still active, otherwise the driver synchronizes. I can't really
> find anything in the GL spec that disallows the loop though.
>
> For write via framebuffer, writing to a framebuffer and reading it
> from a CS is a render feedback loop, both for write after read and
> read after write. The texture regions therefore need to be disjunct or
> a glTextureBarrier() needs to be used by the user. I am not sure if
> the glTextureBarrier should be enough to prevent a write after read
> hazard, as the spec only talks about it solving the read after write
> hazard. However, in my current patch it solves both, and I think the
> user needs some form of synchronization there, probably rebinding the
> framebuffer. Furthermore, we have this issue too without compute
> shaders.
>
> - Bas
>
> On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <maraeo at gmail.com> wrote:
>> There can be read-after-write hazards when transitioning from compute
>> to graphics and vice versa. Is the user expected to call
>> glMemoryBarrier in this case or do we need to synchronize explicitly
>> in the driver?
>>
>> Marek
>>
>> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
>> <bas at basnieuwenhuizen.nl> wrote:
>>> v2: Add more CS_PARTIAL_FLUSH events.
>>>
>>> Essentially every place with waits on finishing for pixel shaders
>>> also has a write after read hazard with compute shaders.
>>>
>>> Invalidating L2 waits implicitly on pixel and compute shaders,
>>> so, we don't need a CS_PARTIAL_FLUSH for switching FBO.
>>>
>>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>>
>>> According to Marek the INV_GLOBAL_L2 events don't wait for compute
>>> shaders to finish, so wait for them explicitly.
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>>> ---
>>> src/gallium/drivers/radeonsi/si_compute.c | 17 ++---------------
>>> src/gallium/drivers/radeonsi/si_cp_dma.c | 6 ++++--
>>> src/gallium/drivers/radeonsi/si_descriptors.c | 3 ++-
>>> src/gallium/drivers/radeonsi/si_hw_context.c | 1 +
>>> src/gallium/drivers/radeonsi/si_state.c | 12 ++++++++----
>>> 5 files changed, 17 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 10b88b3..6803334 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>>> if (!sctx->cs_shader_state.initialized)
>>> si_initialize_compute(sctx);
>>>
>>> - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>> - SI_CONTEXT_INV_GLOBAL_L2 |
>>> - SI_CONTEXT_INV_ICACHE |
>>> - SI_CONTEXT_INV_SMEM_L1 |
>>> - SI_CONTEXT_FLUSH_WITH_INV_L2 |
>>> - SI_CONTEXT_FLAG_COMPUTE;
>>> - si_emit_cache_flush(sctx, NULL);
>>> + if (sctx->b.flags)
>>> + si_emit_cache_flush(sctx, NULL);
>>>
>>> if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
>>> return;
>>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>>> si_setup_tgsi_grid(sctx, info);
>>>
>>> si_emit_dispatch_packets(sctx, info);
>>> -
>>> - sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>>> - SI_CONTEXT_INV_VMEM_L1 |
>>> - SI_CONTEXT_INV_GLOBAL_L2 |
>>> - SI_CONTEXT_INV_ICACHE |
>>> - SI_CONTEXT_INV_SMEM_L1 |
>>> - SI_CONTEXT_FLAG_COMPUTE;
>>> - si_emit_cache_flush(sctx, NULL);
>>> }
>>>
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> index 001ddd4..38e0ee6 100644
>>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>>> uint64_t va = r600_resource(dst)->gpu_address + offset;
>>>
>>> /* Flush the caches. */
>>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>> while (size) {
>>> unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
>>> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>>> }
>>>
>>> /* Flush the caches. */
>>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>> /* This is the main part doing the copying. Src is always aligned. */
>>> main_dst_offset = dst_offset + skipped_size;
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 5b65fae..98ad3a7 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>>> * start writing to the targets.
>>> */
>>> if (num_targets)
>>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>> /* Streamout buffers must be bound in 2 places:
>>> * 1) in VGT by setting the VGT_STRMOUT registers
>>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> index 9862f07..b179092e 100644
>>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>>> ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>> SI_CONTEXT_INV_VMEM_L1 |
>>> SI_CONTEXT_INV_GLOBAL_L2 |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH |
>>> /* this is probably not needed anymore */
>>> SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> si_emit_cache_flush(ctx, NULL);
>>> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
>>> index af9ffdd..305a70b 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>>> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>>> */
>>> sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>> SI_CONTEXT_INV_GLOBAL_L2 |
>>> - SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
>>> + SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>> /* Take the maximum of the old and new count. If the new count is lower,
>>> * dirtying is needed to disable the unbound colorbuffers.
>>> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>>>
>>> sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>> SI_CONTEXT_INV_GLOBAL_L2 |
>>> - SI_CONTEXT_FLUSH_AND_INV_CB;
>>> + SI_CONTEXT_FLUSH_AND_INV_CB |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>> }
>>>
>>> static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>
>>> /* Subsequent commands must wait for all shader invocations to
>>> * complete. */
>>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>> if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>>> sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
>>> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>> PIPE_BARRIER_SHADER_BUFFER |
>>> PIPE_BARRIER_TEXTURE |
>>> PIPE_BARRIER_IMAGE |
>>> - PIPE_BARRIER_STREAMOUT_BUFFER)) {
>>> + PIPE_BARRIER_STREAMOUT_BUFFER |
>>> + PIPE_BARRIER_GLOBAL_BUFFER)) {
>>> /* As far as I can tell, L1 contents are written back to L2
>>> * automatically at end of shader, but the contents of other
>>> * L1 caches might still be stale. */
>>> --
>>> 2.8.0
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list