[Mesa-dev] [PATCH v4 8/8] anv: Do relocations in userspace before execbuf ioctl

Jason Ekstrand jason at jlekstrand.net
Thu Nov 3 21:57:15 UTC 2016


From: Kristian Høgsberg Kristensen <kristian.h.kristensen at intel.com>

Since our surface state buffer is shared by all batches, the kernel does a
full stall and sync with the CPU between batches every time we call
execbuf2 because it refuses to do relocations on an active buffer.  Doing
them in userspace and passing the NO_RELOC flag to the kernel allows us to
perform the relocations without stalling.

This improves the performance of Dota 2 by around 30% on a Sky Lake GT2.

v2 (Jason Ekstrand):
 - Better comments (Chris Wilson)
 - Fixed write_reloc for correct canonical form (Chris Wilson)

v3 (Jason Ekstrand):
 - Skip relocations which aren't needed
 - Provide an environment variable to always use the kernel
 - More comments about correctness (Chris Wilson)

v4 (Jason Ekstrand):
 - More comments (Chris Wilson)

Signed-off-by: Jason Ekstrand <jason at jlekstrand.net>
---
 src/intel/vulkan/anv_batch_chain.c | 166 ++++++++++++++++++++++++++++++++++---
 src/intel/vulkan/anv_device.c      |  20 ++++-
 2 files changed, 173 insertions(+), 13 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 681bbb6..806f9c6 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -32,6 +32,8 @@
 #include "genxml/gen7_pack.h"
 #include "genxml/gen8_pack.h"
 
+#include "util/debug.h"
+
 /** \file anv_batch_chain.c
  *
  * This file contains functions related to anv_cmd_buffer as a data
@@ -1104,19 +1106,163 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
    };
 }
 
+static void
+anv_reloc_list_apply(struct anv_reloc_list *list,
+                     struct anv_device *device, struct anv_bo *bo,
+                     bool always_relocate)
+{
+   for (size_t i = 0; i < list->num_relocs; i++) {
+      struct anv_bo *target_bo = list->reloc_bos[i];
+      if (list->relocs[i].presumed_offset == target_bo->offset &&
+          !always_relocate)
+         continue;
+
+      void *p = bo->map + list->relocs[i].offset;
+      write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
+      list->relocs[i].presumed_offset = target_bo->offset;
+   }
+}
+
+/**
+ * This function applies the relocation for a command buffer and writes the
+ * actual addresses into the buffers as per what we were told by the kernel on
+ * the previous execbuf2 call.  This should be safe to do because, for each
+ * relocated address, we have two cases:
+ *
+ *  1) The target BO is inactive (as seen by the kernel).  In this case, it is
+ *     not in use by the GPU so updating the address is 100% ok.  It won't be
+ *     in-use by the GPU (from our context) again until the next execbuf2
+ *     happens.  If the kernel decides to move it in the next execbuf2, it
+ *     will have to do the relocations itself, but that's ok because it should
+ *     have all of the information needed to do so.
+ *
+ *  2) The target BO is active (as seen by the kernel).  In this case, it
+ *     hasn't moved since the last execbuffer2 call because GTT shuffling
+ *     *only* happens when the BO is idle. (From our perspective, it only
+ *     happens inside the execbuffer2 ioctl, but the shuffling may be
+ *     triggered by another ioctl, with full-ppgtt this is limited to only
+ *     execbuffer2 ioctls on the same context, or memory pressure.)  Since the
+ *     target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
+ *     address and the relocated value we are writing into the BO will be the
+ *     same as the value that is already there.
+ *
+ *     There is also a possibility that the target BO is active but the exact
+ *     RENDER_SURFACE_STATE object we are writing the relocation into isn't in
+ *     use.  In this case, the address currently in the RENDER_SURFACE_STATE
+ *     may be stale but it's still safe to write the relocation because that
+ *     particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
+ *     won't be until the next execbuf2 call.
+ *
+ * By doing relocations on the CPU, we can tell the kernel that it doesn't
+ * need to bother.  We want to do this because the surface state buffer is
+ * used by every command buffer so, if the kernel does the relocations, it
+ * will always be busy and the kernel will always stall.  This is also
+ * probably the fastest mechanism for doing relocations since the kernel would
+ * have to make a full copy of all the relocations lists.
+ */
+static bool
+relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer)
+{
+   static int userspace_relocs = -1;
+   if (userspace_relocs < 0)
+      userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
+   if (!userspace_relocs)
+      return false;
+
+   /* First, we have to check to see whether or not we can even do the
+    * relocation.  New buffers which have never been submitted to the kernel
+    * don't have a valid offset so we need to let the kernel do relocations so
+    * that we can get offsets for them.  On future execbuf2 calls, those
+    * buffers will have offsets and we will be able to skip relocating.
+    * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
+    */
+   bool offsets_changed = false;
+   for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) {
+      struct anv_bo *bo = cmd_buffer->execbuf2.bos[i];
+
+      if (bo->offset == (uint64_t)-1)
+         return false;
+
+      if (cmd_buffer->execbuf2.objects[i].offset != bo->offset)
+         offsets_changed = true;
+   }
+
+   /* Since surface states are shared between command buffers and we don't
+    * know what order they will be submitted to the kernel, we don't know
+    * what address is actually written in the surface state object at any
+    * given time.  The only option is to always relocate them.
+    */
+   anv_reloc_list_apply(&cmd_buffer->surface_relocs,
+                        cmd_buffer->device,
+                        &cmd_buffer->device->surface_state_block_pool.bo,
+                        true /* always relocate surface states */);
+
+   if (offsets_changed) {
+      /* Since we own all of the batch buffers, we know what values are stored
+       * in the relocated addresses and only have to update them if the
+       * offsets have changed.
+       */
+      struct anv_batch_bo **bbo;
+      u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+         anv_reloc_list_apply(&(*bbo)->relocs,
+                              cmd_buffer->device, &(*bbo)->bo, false);
+      }
+   }
+
+   for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) {
+      struct anv_bo *bo = cmd_buffer->execbuf2.bos[i];
+
+      cmd_buffer->execbuf2.objects[i].offset = bo->offset;
+   }
+
+   return true;
+}
+
 VkResult
 anv_cmd_buffer_execbuf(struct anv_device *device,
                        struct anv_cmd_buffer *cmd_buffer)
 {
-   /* Since surface states are shared between command buffers and we don't
-    * know what order they will be submitted to the kernel, we don't know what
-    * address is actually written in the surface state object at any given
-    * time.  The only option is to set a bogus presumed offset and let
-    * relocations do their job.
-    */
-   for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
-      cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
+   /* Make a copy of the execbuffer2 so we can alter the flags field */
+   struct drm_i915_gem_execbuffer2 execbuf = cmd_buffer->execbuf2.execbuf;
+
+   if (relocate_cmd_buffer(cmd_buffer)) {
+      /* If we were able to successfully relocate everything, tell the kernel
+       * that it can skip doing relocations. The requirement for using
+       * NO_RELOC is:
+       *
+       *  1) The addresses written in the objects must match the corresponding
+       *     reloc.presumed_offset which in turn must match the corresponding
+       *     execobject.offset.
+       *
+       *  2) The address in execobject.offset should match the current address
+       *     of that object within the active context.  (This address was
+       *     returned by the last execbuf2 ioctl.)
+       *
+       * If we were able to successfully relocate everything, tell the kernel
+       * that it can skip doing relocations.
+       *
+       * The kernel may still choose to do relocations anyway if something has
+       * moved in the GTT. In this case, the relocation list still needs to be
+       * valid.  All relocations on the batch buffers are already valid and
+       * kept up-to-date.  For surface state relocations, by applying the
+       * relocations in relocate_cmd_buffer, we ensured that the address in
+       * the RENDER_SURFACE_STATE matches presumed_offset, so it should be
+       * safe for the kernel to relocate them as needed.
+       */
+      execbuf.flags |= I915_EXEC_NO_RELOC;
+   } else {
+      /* In the case where we fall back to doing kernel relocations, we need
+       * to ensure that the relocation list is valid.  All relocations on the
+       * batch buffers are already valid and kept up-to-date.  Since surface
+       * states are shared between command buffers and we don't know what
+       * order they will be submitted to the kernel, we don't know what
+       * address is actually written in the surface state object at any given
+       * time.  The only option is to set a bogus presumed offset and let the
+       * kernel relocate them.
+       */
+      for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
+         cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
+   }
 
-   return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
-                             cmd_buffer->execbuf2.bos);
+   return anv_device_execbuf(device, &execbuf, cmd_buffer->execbuf2.bos);
 }
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index c40598c..a884b03 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1097,6 +1097,17 @@ VkResult anv_QueueSubmit(
    struct anv_device *device = queue->device;
    VkResult result = VK_SUCCESS;
 
+   /* The anv_cmd_buffer_execbuf function does relocations in userspace.  Due
+    * to the fact that the surface state buffer is shared between batches, we
+    * can't afford to have that happen from multiple threads at the same time.
+    * Fortunately, this should almost never be contended because this lock is
+    * taken very few places and the client isn't allowed to call vkQueueSubmit
+    * from multiple threads on the same queue.  However, for safety against
+    * stupid apps (that bug would be a real pain to track down) we should lock
+    * around command buffer submission.
+    */
+   pthread_mutex_lock(&device->mutex);
+
    for (uint32_t i = 0; i < submitCount; i++) {
       for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
          ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
@@ -1105,7 +1116,7 @@ VkResult anv_QueueSubmit(
 
          result = anv_cmd_buffer_execbuf(device, cmd_buffer);
          if (result != VK_SUCCESS)
-            return result;
+            goto out;
       }
    }
 
@@ -1113,10 +1124,13 @@ VkResult anv_QueueSubmit(
       struct anv_bo *fence_bo = &fence->bo;
       result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
       if (result != VK_SUCCESS)
-         return result;
+         goto out;
    }
 
-   return VK_SUCCESS;
+out:
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
 }
 
 VkResult anv_QueueWaitIdle(
-- 
2.5.0.400.gff86faf



More information about the mesa-dev mailing list