[Mesa-dev] [PATCH 14/25] radeonsi: implement PIPE_FLUSH_{TOP, BOTTOM}_OF_PIPE

Fri Nov 3 18:46:38 UTC 2017

On Fri, Nov 3, 2017 at 3:48 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 31.10.2017 17:21, Marek Olšák wrote:
>>
>> On Sun, Oct 22, 2017 at 9:07 PM, Nicolai Hähnle <nhaehnle at gmail.com>
>> wrote:
>>>
>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>
>>> ---
>>>   src/gallium/drivers/radeonsi/si_fence.c | 83
>>> ++++++++++++++++++++++++++++++++-
>>>   1 file changed, 82 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_fence.c
>>> b/src/gallium/drivers/radeonsi/si_fence.c
>>> index b7b02b55831..bc0ae302945 100644
>>> --- a/src/gallium/drivers/radeonsi/si_fence.c
>>> +++ b/src/gallium/drivers/radeonsi/si_fence.c
>>> @@ -22,33 +22,41 @@
>>>    *
>>>    */
>>>
>>>   #include <libsync.h>
>>>
>>>   #include "util/os_time.h"
>>>   #include "util/u_memory.h"
>>>   #include "util/u_queue.h"
>>>
>>>   #include "si_pipe.h"
>>> +#include "radeon/r600_cs.h"
>>> +
>>> +struct si_fine_fence {
>>> +       struct r600_resource *buf;
>>> +       unsigned offset;
>>> +};
>>>
>>>   struct si_multi_fence {
>>>          struct pipe_reference reference;
>>>          struct pipe_fence_handle *gfx;
>>>          struct pipe_fence_handle *sdma;
>>>          struct tc_unflushed_batch_token *tc_token;
>>>          struct util_queue_fence ready;
>>>
>>>          /* If the context wasn't flushed at fence creation, this is
>>> non-NULL. */
>>>          struct {
>>>                  struct r600_common_context *ctx;
>>>                  unsigned ib_index;
>>>          } gfx_unflushed;
>>> +
>>> +       struct si_fine_fence fine;
>>>   };
>>>
>>>   static void si_add_fence_dependency(struct r600_common_context *rctx,
>>>                                      struct pipe_fence_handle *fence)
>>>   {
>>>          struct radeon_winsys *ws = rctx->ws;
>>>
>>>          if (rctx->dma.cs)
>>>                  ws->cs_add_fence_dependency(rctx->dma.cs, fence);
>>>          ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
>>> @@ -59,20 +67,21 @@ static void si_fence_reference(struct pipe_screen
>>> *screen,
>>>                                 struct pipe_fence_handle *src)
>>>   {
>>>          struct radeon_winsys *ws = ((struct
>>> r600_common_screen*)screen)->ws;
>>>          struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
>>>          struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
>>>
>>>          if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
>>>                  ws->fence_reference(&(*rdst)->gfx, NULL);
>>>                  ws->fence_reference(&(*rdst)->sdma, NULL);
>>>                  tc_unflushed_batch_token_reference(&(*rdst)->tc_token,
>>> NULL);
>>> +               r600_resource_reference(&(*rdst)->fine.buf, NULL);
>>>                  FREE(*rdst);
>>>          }
>>>           *rdst = rsrc;
>>>   }
>>>
>>>   static struct si_multi_fence *si_create_multi_fence()
>>>   {
>>>          struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
>>>          if (!fence)
>>>                  return NULL;
>>> @@ -132,20 +141,66 @@ static void si_fence_server_sync(struct
>>> pipe_context *ctx,
>>>           * this fence dependency is signalled.
>>>           *
>>>           * Should we flush the context to allow more GPU parallelism?
>>>           */
>>>          if (rfence->sdma)
>>>                  si_add_fence_dependency(rctx, rfence->sdma);
>>>          if (rfence->gfx)
>>>                  si_add_fence_dependency(rctx, rfence->gfx);
>>>   }
>>>
>>> +static bool si_fine_fence_signaled(struct radeon_winsys *rws,
>>> +                                  const struct si_fine_fence *fine)
>>> +{
>>> +       char *map = rws->buffer_map(fine->buf->buf, NULL,
>>> PIPE_TRANSFER_READ |
>>> +
>>> PIPE_TRANSFER_UNSYNCHRONIZED);
>>> +       if (!map)
>>> +               return false;
>>> +
>>> +       uint32_t *fence = (uint32_t*)(map + fine->offset);
>>> +       return *fence != 0;
>>
>>
>> I think this is not coherent on GFX9 if MTYPE != UC. MTYPE is set by the
>> GEM_CREATE ioctl.
>
>
> Hmm, this stuff tends to give me headaches.
>
> For both top-of-pipe and bottom-of-pipe, a direct write to memory is used
> (WRITE_DATA with dst_sel=5 (memory); RELEASE_MEM with dst_sel=0
> (memory_controller)), bypassing the GPU L2.
>
> So the only issue could be with CPU caches, and with HDP if the buffer lands
> in VRAM for some reason. The buffer we're using here is the same as for
> queries, and AFAIK we don't do any special flushing for those either, so I
> don't think this is really a problem?

Right. It's not a problem. In the worst case, the fence write will be
visible after the next L2 flush on GFX9. I don't know the L2 cache
policy for CP packets on GFX9, but I guess it's "stream" or something
stronger even.

Note that allocator_zeroed_memory allocates from VRAM and the buffer
is actually never mapped (i.e. we could make it unmappable). We don't
have any suballocator instance for cached GTT memory at the moment.

Also, u_suballocator clears memory using the GPU. If the clear hasn't
been started by the GPU, the memory contains garbage, which is not
ideal for CPU fences.

Marek