[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Tue Apr 19 11:19:57 UTC 2016
Write by ssbo/image is completely synchronized by glMemoryBarrier, not
by the driver. Even for the write after read hazard, the user needs to
put a barrier before the write. This includes all writes by compute.
For transform feedback we apply the same rules as for graphics:
reading from the feedback bfufer might go wrong if transform feedback
is still active, otherwise the driver synchronizes. I can't really
find anything in the GL spec that disallows the loop though.
For write via framebuffer, writing to a framebuffer and reading it
from a CS is a render feedback loop, both for write after read and
read after write. The texture regions therefore need to be disjunct or
a glTextureBarrier() needs to be used by the user. I am not sure if
the glTextureBarrier should be enough to prevent a write after read
hazard, as the spec only talks about it solving the read after write
hazard. However, in my current patch it solves both, and I think the
user needs some form of synchronization there, probably rebinding the
framebuffer. Furthermore, we have this issue too without compute
shaders.
- Bas
On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <maraeo at gmail.com> wrote:
> There can be read-after-write hazards when transitioning from compute
> to graphics and vice versa. Is the user expected to call
> glMemoryBarrier in this case or do we need to synchronize explicitly
> in the driver?
>
> Marek
>
> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
> <bas at basnieuwenhuizen.nl> wrote:
>> v2: Add more CS_PARTIAL_FLUSH events.
>>
>> Essentially every place with waits on finishing for pixel shaders
>> also has a write after read hazard with compute shaders.
>>
>> Invalidating L2 waits implicitly on pixel and compute shaders,
>> so, we don't need a CS_PARTIAL_FLUSH for switching FBO.
>>
>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>
>> According to Marek the INV_GLOBAL_L2 events don't wait for compute
>> shaders to finish, so wait for them explicitly.
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> ---
>> src/gallium/drivers/radeonsi/si_compute.c | 17 ++---------------
>> src/gallium/drivers/radeonsi/si_cp_dma.c | 6 ++++--
>> src/gallium/drivers/radeonsi/si_descriptors.c | 3 ++-
>> src/gallium/drivers/radeonsi/si_hw_context.c | 1 +
>> src/gallium/drivers/radeonsi/si_state.c | 12 ++++++++----
>> 5 files changed, 17 insertions(+), 22 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>> index 10b88b3..6803334 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>> if (!sctx->cs_shader_state.initialized)
>> si_initialize_compute(sctx);
>>
>> - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>> - SI_CONTEXT_INV_GLOBAL_L2 |
>> - SI_CONTEXT_INV_ICACHE |
>> - SI_CONTEXT_INV_SMEM_L1 |
>> - SI_CONTEXT_FLUSH_WITH_INV_L2 |
>> - SI_CONTEXT_FLAG_COMPUTE;
>> - si_emit_cache_flush(sctx, NULL);
>> + if (sctx->b.flags)
>> + si_emit_cache_flush(sctx, NULL);
>>
>> if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
>> return;
>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>> si_setup_tgsi_grid(sctx, info);
>>
>> si_emit_dispatch_packets(sctx, info);
>> -
>> - sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>> - SI_CONTEXT_INV_VMEM_L1 |
>> - SI_CONTEXT_INV_GLOBAL_L2 |
>> - SI_CONTEXT_INV_ICACHE |
>> - SI_CONTEXT_INV_SMEM_L1 |
>> - SI_CONTEXT_FLAG_COMPUTE;
>> - si_emit_cache_flush(sctx, NULL);
>> }
>>
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> index 001ddd4..38e0ee6 100644
>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>> uint64_t va = r600_resource(dst)->gpu_address + offset;
>>
>> /* Flush the caches. */
>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>
>> while (size) {
>> unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
>> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>> }
>>
>> /* Flush the caches. */
>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>
>> /* This is the main part doing the copying. Src is always aligned. */
>> main_dst_offset = dst_offset + skipped_size;
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 5b65fae..98ad3a7 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>> * start writing to the targets.
>> */
>> if (num_targets)
>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>> /* Streamout buffers must be bound in 2 places:
>> * 1) in VGT by setting the VGT_STRMOUT registers
>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
>> index 9862f07..b179092e 100644
>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>> ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>> SI_CONTEXT_INV_VMEM_L1 |
>> SI_CONTEXT_INV_GLOBAL_L2 |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH |
>> /* this is probably not needed anymore */
>> SI_CONTEXT_PS_PARTIAL_FLUSH;
>> si_emit_cache_flush(ctx, NULL);
>> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
>> index af9ffdd..305a70b 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.c
>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>> */
>> sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>> SI_CONTEXT_INV_GLOBAL_L2 |
>> - SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
>> + SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>> /* Take the maximum of the old and new count. If the new count is lower,
>> * dirtying is needed to disable the unbound colorbuffers.
>> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>>
>> sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>> SI_CONTEXT_INV_GLOBAL_L2 |
>> - SI_CONTEXT_FLUSH_AND_INV_CB;
>> + SI_CONTEXT_FLUSH_AND_INV_CB |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>> }
>>
>> static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>
>> /* Subsequent commands must wait for all shader invocations to
>> * complete. */
>> - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>> + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> + SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>> if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>> sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
>> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>> PIPE_BARRIER_SHADER_BUFFER |
>> PIPE_BARRIER_TEXTURE |
>> PIPE_BARRIER_IMAGE |
>> - PIPE_BARRIER_STREAMOUT_BUFFER)) {
>> + PIPE_BARRIER_STREAMOUT_BUFFER |
>> + PIPE_BARRIER_GLOBAL_BUFFER)) {
>> /* As far as I can tell, L1 contents are written back to L2
>> * automatically at end of shader, but the contents of other
>> * L1 caches might still be stale. */
>> --
>> 2.8.0
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list