[Mesa-dev] [RFC 3/3] anv: Do relocations in userspace before execbuf ioctl

Jason Ekstrand jason at jlekstrand.net
Tue Nov 1 05:06:02 UTC 2016


There are a couple of other options than doing relocations in userspace.
Unfortunately, all of the options are gross for one reason or another:

 1) Do relocations in userspace.  This depends on certain behavior of the
kernel which has never really changed but could, in theory.

 2) Have a separate surface state buffer for each batch buffer.  This would
work for now since we can duplicate all of the surface states when we go to
emit the binding table.  However, it won't work once bindless becomes a
thing and I really like the way we allocate surface states today.

 3) Add a flag to execbuf2 that tells the kernel to do relocations without
stalling the GPU.  While the proof of correctness below seems to hold, I
would be more comfortable if the kernel were the one doing the stall-free
relocations.

 4) Start using softpin and/or SVM.  The ultimate solution here is to just
stop doing relocations.  However, due to the 2GB limit on Haswell and the
lack of PPGTT on IVB (I think?), that's not a solution we can use
everywhere.

If kernel types think that this is entirely bogus, I'll implement 2.  It
shouldn't be too bad.

--Jason

On Mon, Oct 31, 2016 at 9:48 PM, Jason Ekstrand <jason at jlekstrand.net>
wrote:

> From: Kristian Høgsberg Kristensen <kristian.h.kristensen at intel.com>
>
> This reduces the amount of stalling that the kernel does between batches
> and improves the performance of Dota 2 on a Sky Lake GT2 desktop by around
> 30%.  Ideally, we would have a simple execbuf2 flag that would let us tell
> the kernel "Trust me, you don't need to stall to relocate" but the kernel
> has no such flag at the moment.
>
> Signed-off-by: Jason Ekstrand <jason at jlekstrand.net>
> Cc: Kristian Høgsberg <krh at bitplanet.net>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>
> ---
>  src/intel/vulkan/anv_device.c | 96 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 93 insertions(+), 3 deletions(-)
>
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index baa767e..ef371a7 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -1068,6 +1068,89 @@ void anv_GetDeviceQueue(
>     *pQueue = anv_queue_to_handle(&device->queue);
>  }
>
> +static void
> +write_reloc(const struct anv_device *device, void *p, uint64_t v)
> +{
> +   if (device->info.gen >= 8)
> +      *(uint64_t *)p = v;
> +   else
> +      *(uint32_t *)p = v;
> +}
> +
> +static void
> +anv_reloc_list_apply(struct anv_reloc_list *list,
> +                     struct anv_device *device, struct anv_bo *bo)
> +{
> +   for (size_t i = 0; i < list->num_relocs; i++) {
> +      void *p = bo->map + list->relocs[i].offset;
> +
> +      struct anv_bo *target_bo = list->reloc_bos[i];
> +      write_reloc(device, p, target_bo->offset + list->relocs[i].delta);
> +      list->relocs[i].presumed_offset = target_bo->offset;
> +   }
> +}
> +
> +/**
> + * This function applies the relocations for a command buffer and writes the
> + * actual addresses into the buffers as per what we were told by the kernel
> + * on the previous execbuf2 call.  This should be safe to do because, for
> + * each relocated address, we have two cases:
> + *
> + *  1) The buffer to which the address points is not in use by the GPU.  In
> + *     this case, neither is the 32 or 64-bit chunk of GPU memory storing
> + *     the address so it's completely safe to update the address.
> + *
> + *  2) The buffer to which the address points is currently in use by the
> + *     GPU.  If it's still in use, then some execbuf2 call before this one
> + *     used it and it has been in use ever since.  In this case, it can't
> + *     have moved since the last execbuf2 call.  We then have two subcases:
> + *
> + *      a) The 32 or 64-bit chunk of GPU memory storing the address is not
> + *         in use by the GPU.  In this case, it is safe to write the
> + *         relocation.
> + *
> + *      b) The 32 or 64-bit chunk of GPU memory storing the address is
> + *         currently in use by the GPU.  In this case the relocated address
> + *         we will write is exactly the same as the last address we wrote
> + *         there.
> + *
> + * By doing relocations on the CPU, we can tell the kernel that it doesn't
> + * need to bother.  We want to do this because the surface state buffer is
> + * used by every command buffer so, if the kernel does the relocations, it
> + * will always be busy and the kernel will always stall.
> + */
> +static void
> +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer)
> +{
> +   if (!cmd_buffer->execbuf2.need_reloc) {
> +      assert(cmd_buffer->execbuf2.execbuf.flags & I915_EXEC_NO_RELOC);
> +      return;
> +   }
> +
> +   for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) {
> +      if (cmd_buffer->execbuf2.bos[i]->offset == (uint64_t)-1)
> +         return;
> +   }
> +
> +   anv_reloc_list_apply(&cmd_buffer->surface_relocs,
> +                        cmd_buffer->device,
> +                        &cmd_buffer->device->surface_state_block_pool.bo);
> +
> +   struct anv_batch_bo **bbo;
> +   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
> +      anv_reloc_list_apply(&(*bbo)->relocs,
> +                           cmd_buffer->device, &(*bbo)->bo);
> +   }
> +
> +   for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) {
> +      struct anv_bo *bo = cmd_buffer->execbuf2.bos[i];
> +
> +      cmd_buffer->execbuf2.objects[i].offset = bo->offset;
> +   }
> +
> +   cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC;
> +   cmd_buffer->execbuf2.need_reloc = false;
> +}
> +
>  VkResult
>  anv_device_execbuf(struct anv_device *device,
>                     struct drm_i915_gem_execbuffer2 *execbuf,
> @@ -1097,16 +1180,20 @@ VkResult anv_QueueSubmit(
>     struct anv_device *device = queue->device;
>     VkResult result = VK_SUCCESS;
>
> +   pthread_mutex_lock(&device->mutex);
> +
>     for (uint32_t i = 0; i < submitCount; i++) {
>        for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
>           ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
>                           pSubmits[i].pCommandBuffers[j]);
>           assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
>
> +         relocate_cmd_buffer(cmd_buffer);
> +
>           result = anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
>                                       cmd_buffer->execbuf2.bos);
>           if (result != VK_SUCCESS)
> -            return result;
> +            goto out;
>        }
>     }
>
> @@ -1114,10 +1201,13 @@ VkResult anv_QueueSubmit(
>        struct anv_bo *fence_bo = &fence->bo;
>        result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
>        if (result != VK_SUCCESS)
> -         return result;
> +         goto out;
>     }
>
> -   return VK_SUCCESS;
> +out:
> +   pthread_mutex_unlock(&device->mutex);
> +
> +   return result;
>  }
>
>  VkResult anv_QueueWaitIdle(
> --
> 2.5.0.400.gff86faf
>
>

