[Mesa-dev] [PATCH 14/25] radeonsi: implement PIPE_FLUSH_{TOP, BOTTOM}_OF_PIPE

Nicolai Hähnle nhaehnle at gmail.com
Fri Nov 3 14:48:19 UTC 2017


On 31.10.2017 17:21, Marek Olšák wrote:
> On Sun, Oct 22, 2017 at 9:07 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> ---
>>   src/gallium/drivers/radeonsi/si_fence.c | 83 ++++++++++++++++++++++++++++++++-
>>   1 file changed, 82 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
>> index b7b02b55831..bc0ae302945 100644
>> --- a/src/gallium/drivers/radeonsi/si_fence.c
>> +++ b/src/gallium/drivers/radeonsi/si_fence.c
>> @@ -22,33 +22,41 @@
>>    *
>>    */
>>
>>   #include <libsync.h>
>>
>>   #include "util/os_time.h"
>>   #include "util/u_memory.h"
>>   #include "util/u_queue.h"
>>
>>   #include "si_pipe.h"
>> +#include "radeon/r600_cs.h"
>> +
>> +struct si_fine_fence {
>> +       struct r600_resource *buf;
>> +       unsigned offset;
>> +};
>>
>>   struct si_multi_fence {
>>          struct pipe_reference reference;
>>          struct pipe_fence_handle *gfx;
>>          struct pipe_fence_handle *sdma;
>>          struct tc_unflushed_batch_token *tc_token;
>>          struct util_queue_fence ready;
>>
>>          /* If the context wasn't flushed at fence creation, this is non-NULL. */
>>          struct {
>>                  struct r600_common_context *ctx;
>>                  unsigned ib_index;
>>          } gfx_unflushed;
>> +
>> +       struct si_fine_fence fine;
>>   };
>>
>>   static void si_add_fence_dependency(struct r600_common_context *rctx,
>>                                      struct pipe_fence_handle *fence)
>>   {
>>          struct radeon_winsys *ws = rctx->ws;
>>
>>          if (rctx->dma.cs)
>>                  ws->cs_add_fence_dependency(rctx->dma.cs, fence);
>>          ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
>> @@ -59,20 +67,21 @@ static void si_fence_reference(struct pipe_screen *screen,
>>                                 struct pipe_fence_handle *src)
>>   {
>>          struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
>>          struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
>>          struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
>>
>>          if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
>>                  ws->fence_reference(&(*rdst)->gfx, NULL);
>>                  ws->fence_reference(&(*rdst)->sdma, NULL);
>>                  tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
>> +               r600_resource_reference(&(*rdst)->fine.buf, NULL);
>>                  FREE(*rdst);
>>          }
>>           *rdst = rsrc;
>>   }
>>
>>   static struct si_multi_fence *si_create_multi_fence()
>>   {
>>          struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
>>          if (!fence)
>>                  return NULL;
>> @@ -132,20 +141,66 @@ static void si_fence_server_sync(struct pipe_context *ctx,
>>           * this fence dependency is signalled.
>>           *
>>           * Should we flush the context to allow more GPU parallelism?
>>           */
>>          if (rfence->sdma)
>>                  si_add_fence_dependency(rctx, rfence->sdma);
>>          if (rfence->gfx)
>>                  si_add_fence_dependency(rctx, rfence->gfx);
>>   }
>>
>> +static bool si_fine_fence_signaled(struct radeon_winsys *rws,
>> +                                  const struct si_fine_fence *fine)
>> +{
>> +       char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
>> +                                                         PIPE_TRANSFER_UNSYNCHRONIZED);
>> +       if (!map)
>> +               return false;
>> +
>> +       uint32_t *fence = (uint32_t*)(map + fine->offset);
>> +       return *fence != 0;
> 
> I think this is not coherent on GFX9 if MTYPE != UC. MTYPE is set by the
> GEM_CREATE ioctl.

Hmm, this stuff tends to give me headaches.

For both top-of-pipe and bottom-of-pipe, a direct write to memory is 
used (WRITE_DATA with dst_sel=5 (memory); RELEASE_MEM with dst_sel=0 
(memory_controller)), bypassing the GPU L2.

So the only issue could be with CPU caches, and with HDP if the buffer 
lands in VRAM for some reason. The buffer we're using here is the same 
as for queries, and AFAIK we don't do any special flushing for those 
either, so I don't think this is really a problem?


>> +}
>> +
>> +static void si_fine_fence_set(struct si_context *ctx,
>> +                             struct si_fine_fence *fine,
>> +                             unsigned flags)
>> +{
>> +       assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
>> +
>> +       u_suballocator_alloc(ctx->b.allocator_zeroed_memory, 4, 4,
>> +                            &fine->offset, (struct pipe_resource **)&fine->buf);
>> +       if (!fine->buf)
>> +               return;
>> +
>> +       uint64_t fence_va = fine->buf->gpu_address + fine->offset;
>> +
>> +       radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, fine->buf,
>> +                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
>> +       if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
>> +               struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
>> +               radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
>> +               radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
>> +                       S_370_WR_CONFIRM(1) |
>> +                       S_370_ENGINE_SEL(V_370_PFP));
>> +               radeon_emit(cs, fence_va);
>> +               radeon_emit(cs, fence_va >> 32);
>> +               radeon_emit(cs, 0x80000000);
>> +       } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
>> +               si_gfx_write_event_eop(&ctx->b, V_028A90_BOTTOM_OF_PIPE_TS, 0,
>> +                                      EOP_DATA_SEL_VALUE_32BIT,
>> +                                      NULL, fence_va, 0x80000000,
>> +                                      PIPE_QUERY_GPU_FINISHED);
> 
> What's the difference between this and the winsys fence? The winsys
> fence is also a bottom-of-pipe fence. I don't see much difference
> between them because PIPE_FLUSH_BOTTOM_OF_PIPE doesn't prevent IB
> submission, so you'll always get a bottom-of-pipe fence in the winsys.

The intention is that you use PIPE_FLUSH_BOTTOM_OF_PIPE together with 
PIPE_FLUSH_DEFERRED. Perhaps I should add an assert to enforce this.

Cheers,
Nicolai

> 
> Marek
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list