[Mesa-dev] [PATCH 14/25] radeonsi: implement PIPE_FLUSH_{TOP,BOTTOM}_OF_PIPE

Nicolai Hähnle nhaehnle at gmail.com
Fri Nov 3 19:42:21 UTC 2017


On 03.11.2017 19:46, Marek Olšák wrote:
> On Fri, Nov 3, 2017 at 3:48 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 31.10.2017 17:21, Marek Olšák wrote:
>>>
>>> On Sun, Oct 22, 2017 at 9:07 PM, Nicolai Hähnle <nhaehnle at gmail.com>
>>> wrote:
>>>>
>>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>>
>>>> ---
>>>>    src/gallium/drivers/radeonsi/si_fence.c | 83
>>>> ++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 82 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_fence.c
>>>> b/src/gallium/drivers/radeonsi/si_fence.c
>>>> index b7b02b55831..bc0ae302945 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_fence.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_fence.c
>>>> @@ -22,33 +22,41 @@
>>>>     *
>>>>     */
>>>>
>>>>    #include <libsync.h>
>>>>
>>>>    #include "util/os_time.h"
>>>>    #include "util/u_memory.h"
>>>>    #include "util/u_queue.h"
>>>>
>>>>    #include "si_pipe.h"
>>>> +#include "radeon/r600_cs.h"
>>>> +
>>>> +struct si_fine_fence {
>>>> +       struct r600_resource *buf;
>>>> +       unsigned offset;
>>>> +};
>>>>
>>>>    struct si_multi_fence {
>>>>           struct pipe_reference reference;
>>>>           struct pipe_fence_handle *gfx;
>>>>           struct pipe_fence_handle *sdma;
>>>>           struct tc_unflushed_batch_token *tc_token;
>>>>           struct util_queue_fence ready;
>>>>
>>>>           /* If the context wasn't flushed at fence creation, this is
>>>> non-NULL. */
>>>>           struct {
>>>>                   struct r600_common_context *ctx;
>>>>                   unsigned ib_index;
>>>>           } gfx_unflushed;
>>>> +
>>>> +       struct si_fine_fence fine;
>>>>    };
>>>>
>>>>    static void si_add_fence_dependency(struct r600_common_context *rctx,
>>>>                                       struct pipe_fence_handle *fence)
>>>>    {
>>>>           struct radeon_winsys *ws = rctx->ws;
>>>>
>>>>           if (rctx->dma.cs)
>>>>                   ws->cs_add_fence_dependency(rctx->dma.cs, fence);
>>>>           ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
>>>> @@ -59,20 +67,21 @@ static void si_fence_reference(struct pipe_screen
>>>> *screen,
>>>>                                  struct pipe_fence_handle *src)
>>>>    {
>>>>           struct radeon_winsys *ws = ((struct
>>>> r600_common_screen*)screen)->ws;
>>>>           struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
>>>>           struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
>>>>
>>>>           if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
>>>>                   ws->fence_reference(&(*rdst)->gfx, NULL);
>>>>                   ws->fence_reference(&(*rdst)->sdma, NULL);
>>>>                   tc_unflushed_batch_token_reference(&(*rdst)->tc_token,
>>>> NULL);
>>>> +               r600_resource_reference(&(*rdst)->fine.buf, NULL);
>>>>                   FREE(*rdst);
>>>>           }
>>>>            *rdst = rsrc;
>>>>    }
>>>>
>>>>    static struct si_multi_fence *si_create_multi_fence()
>>>>    {
>>>>           struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
>>>>           if (!fence)
>>>>                   return NULL;
>>>> @@ -132,20 +141,66 @@ static void si_fence_server_sync(struct
>>>> pipe_context *ctx,
>>>>            * this fence dependency is signalled.
>>>>            *
>>>>            * Should we flush the context to allow more GPU parallelism?
>>>>            */
>>>>           if (rfence->sdma)
>>>>                   si_add_fence_dependency(rctx, rfence->sdma);
>>>>           if (rfence->gfx)
>>>>                   si_add_fence_dependency(rctx, rfence->gfx);
>>>>    }
>>>>
>>>> +static bool si_fine_fence_signaled(struct radeon_winsys *rws,
>>>> +                                  const struct si_fine_fence *fine)
>>>> +{
>>>> +       char *map = rws->buffer_map(fine->buf->buf, NULL,
>>>> PIPE_TRANSFER_READ |
>>>> +
>>>> PIPE_TRANSFER_UNSYNCHRONIZED);
>>>> +       if (!map)
>>>> +               return false;
>>>> +
>>>> +       uint32_t *fence = (uint32_t*)(map + fine->offset);
>>>> +       return *fence != 0;
>>>
>>>
>>> I think this is not coherent on GFX9 if MTYPE != UC. MTYPE is set by the
>>> GEM_CREATE ioctl.
>>
>>
>> Hmm, this stuff tends to give me headaches.
>>
>> For both top-of-pipe and bottom-of-pipe, a direct write to memory is used
>> (WRITE_DATA with dst_sel=5 (memory); RELEASE_MEM with dst_sel=0
>> (memory_controller)), bypassing the GPU L2.
>>
>> So the only issue could be with CPU caches, and with HDP if the buffer lands
>> in VRAM for some reason. The buffer we're using here is the same as for
>> queries, and AFAIK we don't do any special flushing for those either, so I
>> don't think this is really a problem?
> 
> Right. It's not a problem. In the worst case, the fence write will be
> visible after the next L2 flush on GFX9. I don't know the L2 cache
> policy for CP packets on GFX9, but I guess it's "stream" or something
> stronger even.
> 
> Note that allocator_zeroed_memory allocates from VRAM and the buffer
> is actually never mapped (i.e. we could make it unmappable). We don't
> have any suballocator instance for cached GTT memory at the moment.
> 
> Also, u_suballocator clears memory using the GPU. If the clear hasn't
> been started by the GPU, the memory contains garbage, which is not
> ideal for CPU fences.

Good points, I'll rework the allocation.

Cheers,
Nicolai

> 
> Marek
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list